下面是使用 PHP + Redis 编写的爬虫代码，主要用于抓取公司信息以及公司联系人的相关信息，
并以天眼查的搜索页 https://www.tianyancha.com/search 作为源地址。
function pachong($url,Redis $redis){
$contents=[];
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.113 Safari/537.36");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_URL, $url);
$html = curl_exec($ch);
curl_close($ch);
var_dump($url);
//xPATH解析html
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xPath = new DOMXPath($dom);
//公司信息
$companys = $xPath->query('//span[3][@class=\'tt hidden\']');
//跳转地址nextPage
$url = $xPath->query('//ul[@class=\'pagination\']/li/a[@class