在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
php 多线程控制(说明:本文代码在查询百度排名时并不好使,仅供批评)
<html>
<head>
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
<title>百度关键词排名批量查询</title>
</head>
<body>
<h3>百度关键词排名批量查询</h3>
<form action="/baidu/index.php" method="post">
输入关键词(每行一个)<br />
<textarea name="keyword" style="resize:none;width:243px;height:70px;"></textarea>
<br />输入网址:
<input type="text" name="url" size="20" value="39.net" />(如:39.net 勿加http://)<br />
<input type="submit" name="sub" value="查询" />
</form>
<hr />
</body>
</html>
<?php
// Form handler: look up the rank of the submitted site for every submitted
// keyword, then print the result table and the elapsed time.
if (isset($_POST['sub'])) {
    $start_time = microtime_float();

    // Both fields come straight from the client, so they may be missing or
    // may not be strings (e.g. "keyword[]=x"); fall back to an empty string.
    $kw = (isset($_POST['keyword']) && is_string($_POST['keyword'])) ? $_POST['keyword'] : '';
    $findurl = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';

    $httpcurl = new CoreHttpCurl();
    $keywords = $httpcurl->get_keywords($kw);      // list of keywords to query
    $urls = $httpcurl->get_urls($keywords);        // array("keyword" => "search url")
    $ranks = $httpcurl->get($urls, 10, $findurl);  // array("keyword" => rank)

    $output = "<table border='1' bordercolor='green' cellspacing='0'><tr><th>关键词</th><th>排名</th></tr>";
    foreach ($ranks as $keyword => $rank) {
        // The keyword is user input echoed back into HTML: escape it.
        $safe_keyword = htmlspecialchars((string)$keyword, ENT_QUOTES, 'UTF-8');
        $safe_rank = (int)$rank;
        $output .= "<tr><td>{$safe_keyword}</td><td>{$safe_rank}</td></tr>";
    }
    $output .= "</table>";
    echo $output;

    $end_time = microtime_float();
    $con_time = $end_time - $start_time;
    echo "查询耗时:".$con_time;
}

/**
 * Current Unix timestamp with microsecond precision, used to time the query.
 *
 * @return float seconds since the Unix epoch
 */
function microtime_float(){
    return microtime(true);
}

/**
 * Fetches Baidu result pages for a list of keywords with curl_multi and
 * reports where a given site appears in each result page.
 */
class CoreHttpCurl{
    protected $keywords = array();   // keywords being queried
    protected $findurl = null;       // site whose rank is looked up
    protected $urls = array();       // keyword => search url, accumulated by get_urls()

    protected $http_data = array();  // keyword => parsed response (see get_data()), null until fetched
    protected $multi_exec_num = 10;  // max handles attached at the same time, 0 = unlimited
    protected static $connecttimeout_ms = 3000; // connect timeout in milliseconds

    function __construct(){
    }

    /**
     * Split the submitted textarea content into a list of keywords.
     *
     * Accepts \r\n, \r and \n line endings, trims every line and drops
     * empty lines and duplicates so no pointless request is issued.
     *
     * @param string $keyword raw textarea content, one keyword per line
     * @return array list of keywords
     */
    public function get_keywords($keyword){
        $keywords = array();
        foreach (preg_split('/\r\n|\r|\n/', (string)$keyword) as $line) {
            $line = trim($line);
            if ($line !== '') {
                $keywords[] = $line;
            }
        }
        $this->keywords = array_values(array_unique($keywords));
        return $this->keywords;
    }

    /**
     * Build the search URL for every keyword.
     *
     * @param array $keywords
     * @return array key is the keyword, value is the matching search URL
     */
    public function get_urls($keywords){
        foreach ($keywords as $word) {
            // urlencode() keeps spaces, '&', '#' and non-ASCII keywords from
            // corrupting the query string.
            $this->urls[$word] = "http://www.baidu.com/s?wd=" . urlencode($word) . "&cl=3&pn=0&rn=50";
        }
        return $this->urls;
    }

    /**
     * Create a configured cURL handle for one URL.
     *
     * The response headers are included in the returned body (CURLOPT_HEADER)
     * because get_data() splits them off again.
     *
     * @param string $url     request URL
     * @param int    $timeout total transfer timeout passed to CURLOPT_TIMEOUT
     * @return resource|CurlHandle
     */
    protected function create_curl($url, $timeout){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT_MS, self::$connecttimeout_ms);

        return $ch;
    }

    /**
     * Fetch several URLs concurrently.
     *
     * At most $multi_exec_num handles are attached at once; the remaining
     * URLs wait in a queue and are attached as earlier transfers finish.
     *
     * @param array $urls    keyword => url
     * @param int   $timeout per-request timeout, see create_curl()
     * @return array keyword => page body (string), false when the HTTP status
     *               is not 200, null when the transfer never completed
     */
    function request_urls($urls, $timeout){
        $urls = array_unique($urls);          // drop duplicate URLs, keys are preserved
        if (!$urls) return array();

        $mh = curl_multi_init();
        $listener_list = array();             // keyword => handle currently attached to $mh
        $pending = array();                   // keyword => url still waiting for a free slot
        $result = array();

        foreach ($urls as $kw => $url) {
            $result[$kw] = null;
            $this->http_data[$kw] = null;

            if ($this->multi_exec_num > 0 && count($listener_list) >= $this->multi_exec_num) {
                // No handle is created yet: a queued URL only gets one when
                // it is actually attached, so nothing is leaked.
                $pending[$kw] = $url;
            } else {
                $ch = $this->create_curl($url, $timeout);
                curl_multi_add_handle($mh, $ch);
                $listener_list[$kw] = $ch;
            }
        }

        $running = null;
        do {
            do {
                $status = curl_multi_exec($mh, $running);
            } while ($status == CURLM_CALL_MULTI_PERFORM);
            if ($status != CURLM_OK) break;

            $progressed = false;
            while (($done = curl_multi_info_read($mh)) !== false) {
                $progressed = true;
                $ch = $done['handle'];
                $done_kw = array_search($ch, $listener_list, true);

                if ($done_kw !== false) {
                    $this->http_data[$done_kw] = $this->get_data(curl_multi_getcontent($ch), $ch);
                    if ($this->http_data[$done_kw]['code'] != 200) {
                        $result[$done_kw] = false;
                    } else {
                        $result[$done_kw] = $this->http_data[$done_kw]['data'];
                    }
                    unset($listener_list[$done_kw]);
                }

                // Detach from the multi handle first, then close.
                curl_multi_remove_handle($mh, $ch);
                curl_close($ch);

                // A slot is free: attach the next queued URL under its own
                // keyword. key()/unset() is used instead of array_shift()
                // because array_shift() renumbers numeric keys.
                if ($pending) {
                    reset($pending);
                    $next_kw = key($pending);
                    $next_url = $pending[$next_kw];
                    unset($pending[$next_kw]);

                    $next = $this->create_curl($next_url, $timeout);
                    curl_multi_add_handle($mh, $next);
                    $listener_list[$next_kw] = $next;
                }
            }

            if (!$listener_list) break;                 // everything finished
            if ($running == 0 && !$progressed) break;   // nothing left to wait for

            // Sleep until there is socket activity instead of spinning.
            if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
                usleep(1000); // select failed: back off briefly
            }
        } while (true);

        // Only non-empty when the loop was left early; release what is still attached.
        foreach ($listener_list as $ch) {
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }
        curl_multi_close($mh);
        return $result;
    }

    /**
     * Fetch all URLs with GET and compute the rank of $findurl for each keyword.
     *
     * @param array  $urls    keyword => url
     * @param int    $timeout per-request timeout, see create_curl()
     * @param string $findurl site to look for, e.g. 39.net (no http://)
     * @return array keyword => rank (0 when not found)
     */
    public function get($urls, $timeout = 10, $findurl = ''){
        $data = $this->request_urls($urls, $timeout);
        return $this->baid_rank($data, $findurl);
    }

    /**
     * Split a raw response (headers + body) into its parts.
     *
     * @param string|null $data raw content returned by curl_multi_getcontent()
     * @param resource|CurlHandle $ch handle the content belongs to
     * @return array with keys code, data, header (list of header lines) and time
     */
    protected function get_data($data, $ch){
        $data = (string)$data; // may be null when nothing was received
        $header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);

        $result = array();
        $result['code'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $result['data'] = (string)substr($data, $header_size);
        $result['header'] = explode("\r\n", (string)substr($data, 0, $header_size));
        $result['time'] = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
        return $result;
    }

    /**
     * Compute the rank of a site inside each result page.
     *
     * The rank is the 1-based index of the first <span class="g"> element
     * whose text contains $findurl; 0 means the site was not found or the
     * page could not be fetched.
     *
     * @param array  $serp    keyword => page source (false/null for failed requests)
     * @param string $findurl site to look for, e.g. 39.net (no http://)
     * @return array keyword => rank
     */
    protected function baid_rank(array $serp, $findurl){
        $pattern = "#<span class=\"g\">.*<\/span>#U";
        $findurl = (string)$findurl;
        $ranks = array();

        foreach ($serp as $keyword => $source) {
            $ranks[$keyword] = 0;
            if ($findurl === '' || !is_string($source) || $source === '') {
                continue; // nothing to search for, or the request failed
            }
            if (!preg_match_all($pattern, $source, $m)) {
                continue; // no result entries on this page
            }
            foreach ($m[0] as $k => $v) {
                if (strpos($v, $findurl) !== false) {
                    $ranks[$keyword] = $k + 1;
                    break;
                }
            }
        }
        return $ranks;
    }
}

?>
上面似乎不好用,下面稍微好一点:
<html> <head> <meta http-equiv="content-type" content="text/html;charset=utf-8" /> <title>百度关键词排名批量查询</title> </head> <body> <h3>百度关键词排名批量查询</h3> <form action="test5.php" method="post"> 输入关键词(每行一个)<br /> <textarea name="keyword" style="resize:none;width:243px;height:70px;"></textarea> <br />输入网址: <input type="text" name="url" size="20" value="39.net" />(如:39.net 勿加http://)<br /> <input type="submit" name="sub" value="查询" /> </form> <hr /> </body> </html> <?php /** * Wget Curl驱动核心 * * @author jonwang([email protected]) * @category MyQEE * @package System * @subpackage Core * @copyright Copyright (c) 2008-2012 myqee.com * @license http://www.myqee.com/license.html */ set_time_limit(0); if(isset($_POST['sub'])){ $start_time = microtime_float(); $kw = $_POST['keyword']; $findurl = $_POST['url']; $httpcurl = new Core_HttpClient_Driver_Curl(); $keywords = $httpcurl->get_keywords($kw);//获取关键词数组 $urls = $httpcurl->get_urls($keywords);//获取请求的url array("关键词"=>"搜索url") $data = $httpcurl->get($urls);//获取搜索结果页面的源代码 array("关键词"=>"网页内容") $ranks = $httpcurl->get_rank($data, $findurl); //获取排名 $end_time = microtime_float(); $con_time = $end_time - $start_time; echo "查询耗时:".$con_time; $output = "<table border='1' bordercolor='green' cellspacing='0'><tr><th>序号</th><th>关键词</th><th>排名</th></tr>"; $i=1; 
foreach($ranks as $keyword=>$rank){ $output .= "<tr><td>{$i}</td><td>{$keyword}</td><td>{$rank}</td></tr>"; $i++; } $output .= "</table>"; echo $output; } /** * 计算耗时 **/ function microtime_float(){ list($usec,$sec) = explode(" ",microtime()); return ((float)$usec+(float)$sec); } class Core_HttpClient_Driver_Curl{ protected $http_data = array(); protected $agent; protected $cookies; protected $referer; protected $ip; protected $header = array(); protected $_option = array(); protected $_post_data = array(); protected $keywords = array(); //提交的关键词数组 protected $urls = array(); //百度查询页面URL /** * 多列队任务进程数,0表示不限制 * 采集百度,太大会被封,伪装来路和ip似乎也没有用;太小耗时间 * @var int */ protected $multi_exec_num = 3; /** * 默认连接超时时间,毫秒 * * @var int */ protected static $connecttimeout_ms = 3000; const ERROR_HOST = '请求的URL错误'; const ERROR_GET = 'GET请求错误'; const ERROR_POST = 'POST请求错误'; function __construct(){ } /** * 设置$cookie * * @param $agent * @return HttpClient_Driver_Curl */ public function set_agent($agent) { $this->agent = $agent; return $this; } /** * 设置$cookie * * @param string $cookie * @return HttpClient_Driver_Curl */ public function set_cookies($cookies) { $this->cookies = $cookies; return $this; } /** * 设置$referer * * @param string $referer * @return HttpClient_Driver_Curl */ public function set_referer($referer) { $this->referer = $referer; return $this; } /** * 设置IP * * @param string $ip * @return HttpClient_Driver_Curl */ public function set_ip($ip) { $this->ip = $ip; return $this; } /** * 设置curl参数 * * @param string $key * @param value $value * @return HttpClient_Driver_Curl */ public function set_option($key, $value) { if ( $key===CURLOPT_HTTPHEADER ) { $this->header = array_merge($this->header,$value); } else { $this->_option[$key] = $value; } return $this; } /** * 设置多个列队默认排队数上限 * * @param int $num * @return HttpClient_Driver_Curl */ public function set_multi_max_num($num=0) { $this->multi_exec_num = (int)$num; return $this; } /** * 用POST方式提交,支持多个URL * * $urls = array * ( * 
'http://www.baidu.com/', * 'http://mytest.com/url', * 'http://www.abc.com/post', * ); * $data = array * ( * array('k1'=>'v1','k2'=>'v2'), * array('a'=>1,'b'=>2), * 'aa=1&bb=3&cc=3', * ); * HttpClient::factory()->post($url,$data); * * @param $url * @param string/array $vars * @param $timeout 超时时间,默认120秒 * @return string, false on failure */ public function post($url, $vars, $timeout = 60) { # POST模式 $this->set_option( CURLOPT_HTTPHEADER, array('Expect:') ); $this->set_option( CURLOPT_POST, true ); if (is_array |
2022-08-30
2022-08-17
2022-11-06
2022-08-18
2022-07-18
请发表评论