使用 PHP 命令行脚本批量抓取百度搜索结果的 URL。
用法:php.exe 1.php "关键词" "抓取页数"
结果将保存在同目录下的 baidu.txt 中;如果没有这个文件,请手动创建。
PHP
<?php
/**
 * Batch-collect Baidu search result URLs from the command line.
 *
 * Usage: php 1.php "keyword" "pages"
 *
 * For every search result found on each fetched results page, the result title
 * and its resolved target URL are appended to baidu.txt (relative to the current
 * working directory), one per line, separated by CRLF.
 */

/**
 * Return the value of the first Location header in a raw HTTP header block.
 *
 * @param string $rawHeaders Header text as returned by cURL with CURLOPT_HEADER enabled.
 * @return string|null The trimmed header value, or null when no usable Location header exists.
 */
function find_redirect_target($rawHeaders)
{
    foreach (explode("\n", $rawHeaders) as $line) {
        // Header names are case-insensitive, and the match is anchored at the start
        // of the line so that "Content-Location:" is not mistaken for "Location:".
        if (strncasecmp($line, 'Location:', 9) === 0) {
            // Take everything after the header name; splitting on ": " would
            // truncate a URL that itself contains that sequence.
            $value = trim((string) substr($line, 9));
            return $value !== '' ? $value : null;
        }
    }

    return null;
}

/**
 * Issue a body-less request to $url and report where it redirects to.
 *
 * @param string $url URL to probe.
 * @return string|null The Location header value, or null if the request failed
 *                     or the response carried no Location header.
 */
function resolve_redirect($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, true);          // include response headers in the returned string
    curl_setopt($ch, CURLOPT_NOBODY, true);          // headers only, no response body
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the response instead of printing it
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, 5);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Accept: */*',
        'User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Connection: Keep-Alive',
    ));
    $response = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure (timeout, DNS error, ...).
    if (!is_string($response)) {
        return null;
    }

    return find_redirect_target($response);
}

$keyword = isset($argv[1]) ? (string) $argv[1] : '';
$zpage = isset($argv[2]) ? (int) $argv[2] : 0;

// Compare against '' explicitly: a plain truthiness test would reject the keyword "0".
if ($keyword === '' || $zpage < 1) {
    fwrite(STDERR, 'Require keyword and page' . "\n");
    exit(1);
}

if (!function_exists('curl_init')) {
    fwrite(STDERR, 'The cURL extension is required' . "\n");
    exit(1);
}

$keyword = urlencode($keyword);

// Each result carries a data-tools='{"title":"...","url":"..."}' attribute. The JSON
// object is captured as group 1; the lazy quantifiers stop at the first closing
// sequence so that several results on one line are matched separately.
$pattern = '/data-tools=\'(\{"title":".+?","url":".+?"\})\'/i';

for ($p = 0; $p < $zpage; $p++) {
    // Appending '0' to the page index yields pn = page * 10
    // (presumably the result offset at ten results per page -- confirm against Baidu).
    $url = 'http://www.baidu.com/s?wd=' . $keyword . '&pn=' . $p . '0&oq=1&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=8292c42600001067&rsv_t=5e14MUzgVAGXxjHEqvWPfyBfPeJioaXg83h6Bm5Nlfi4ScTL4Qg1IKNLNtIEbbmFKHyl&f=8&rsv_bp=1&rsv_spt=1';
    $b = $p + 1;

    $txt = file_get_contents($url);
    if ($txt === false) {
        fwrite(STDERR, 'Page' . $b . ' could not be fetched' . "\n");
        continue;
    }

    $found = preg_match_all($pattern, $txt, $matches);
    if ($found === false) {
        fwrite(STDERR, 'Page' . $b . ' could not be parsed' . "\n");
        continue;
    }

    for ($i = 0; $i < $found; $i++) {
        $data = json_decode($matches[1][$i], true);
        if (!is_array($data) || !isset($data['title'], $data['url'])) {
            // Not the JSON shape we expect; skip rather than write a broken record.
            continue;
        }

        // When no redirect is reported, keep the URL taken from the results page
        // instead of recording an empty link.
        $link = resolve_redirect($data['url']);
        if ($link === null) {
            $link = $data['url'];
        }

        $written = file_put_contents('baidu.txt', $data['title'] . "\r\n" . $link . "\r\n", FILE_APPEND);
        if ($written === false) {
            fwrite(STDERR, 'Could not write to baidu.txt' . "\n");
            exit(1);
        }

        $a = $i + 1;
        echo $a . ' Complete!' . "\n";
    }

    echo 'Page' . $b . 'Complete!' . "\n";
}

echo 'Complete!' . "\n";