淘宝店铺信息的爬取
直接上代码：
// Scrape the shop name and the shop owner ("掌柜") account from a Taobao shop page.
// This is a fragment of a controller action: it relies on $this->getRquest() and
// on the phpQuery library imported from EXTEND_PATH.
$link = input('link');

// Raw page source, used only by the regex-based owner lookup below. It is
// converted from GB2312 to UTF-8 so the UTF-8 literals in the patterns can match.
$content = $this->getRquest($link);
$content = mb_convert_encoding($content, 'utf-8','GB2312');

\think\Loader::import('phpQuery.phpQuery', EXTEND_PATH);
// NOTE(review): upstream phpQuery documents newDocumentFile(); confirm that
// newDocumentFiles() exists in the bundled copy of the library.
$html = \phpQuery::newDocumentFiles($link);

// Shop name: text of the shop link, re-encoded and stripped of the "进入店铺" label.
$shop_name= pq(".shop-name>a")->text();
// Encoding round trip (utf-8 -> ISO-8859-1, then GBK -> utf-8); presumably this
// repairs GBK bytes that phpQuery decoded as Latin-1 — verify against a live page.
$shop_name = mb_convert_encoding($shop_name,'ISO-8859-1','utf-8');
$shop_name = mb_convert_encoding($shop_name,'utf-8','GBK');
$shop_name = trim(str_replace("进入店铺","",$shop_name));

// Owner, first page layout. The capture group is lazy ([\s\S]*?) so it stops at
// the first closing </a><br>; the previous "[\s\S]* ?" was greedy (followed by an
// optional space) and could swallow everything up to the last </a><br> on the page.
preg_match_all('/<a class=\"seller-name J_TGoldlog\"[\s\S]*?target=\"_blank\">掌柜:([\s\S]*?)<\/a><br>/',$content,$store_accounts);
if (!$store_accounts[0]) {
    // Owner, alternative page layout: the value sits in an "info-item" paragraph
    // that precedes the customer-service ("客服") title.
    preg_match_all('/<p class=\"info-item\"[\s\S]*?"><span class="title">掌[\s\S]*?柜:<\/span>([\s\S]*?)<\/p>[\s\S]*?<span class="title">客[\s\S]*?服:/',$content,$store_accounts);
}

// Neither pattern may match (layout change, failed download); fall back to an
// empty string instead of reading an undefined offset.
$owner = isset($store_accounts[1][0]) ? trim($store_accounts[1][0]) : '';
/**
 * Fetch a URL with cURL and return the response body.
 *
 * The request headers are whatever $this->randIp() returns. Response headers
 * are excluded from the result, and an empty CURLOPT_ENCODING lets cURL
 * advertise every content encoding it supports.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
private function getRquest($url)
{
    $requestHeaders = $this->randIp();

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER         => 0,
        CURLOPT_ENCODING       => "",
        // Set last so a problem with the header list cannot prevent the
        // options above from being applied.
        CURLOPT_HTTPHEADER     => $requestHeaders,
    ]);

    $response = curl_exec($curl);
    curl_close($curl);

    return $response;
}
/**
 * Fetch a URL with cURL while sending browser-like headers and a random
 * client IP.
 *
 * One of ten hard-coded integer ranges is picked at random, a random value
 * inside it is turned into a dotted IPv4 string with long2ip(), and that
 * address is sent in the CLIENT-IP and X-FORWARDED-FOR headers. Redirects
 * are followed.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
private function getHeaderRequest($url){
    // [lower bound, upper bound] pairs in integer form; the negative entries
    // are presumably signed 32-bit representations of high addresses.
    $ipRanges = [
        ['607649792', '608174079'],
        ['1038614528', '1039007743'],
        ['1783627776', '1784676351'],
        ['2035023872', '2035154943'],
        ['2078801920', '2079064063'],
        ['-1950089216', '-1948778497'],
        ['-1425539072', '-1425014785'],
        ['-1236271104', '-1235419137'],
        ['-770113536', '-768606209'],
        ['-569376768', '-564133889'],
    ];

    // Pick the range first, then a value inside it.
    list($rangeStart, $rangeEnd) = $ipRanges[mt_rand(0, 9)];
    $fakeIp = long2ip(mt_rand($rangeStart, $rangeEnd));

    $requestHeaders = [
        "Content-type: text/xml;charset=\"utf-8\"",
        "Accept: text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/webp, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1",
        "Cache-Control: no-cache",
        "Pragma: no-cache",
        "cookie:lid=%E9%87%8E%E7%8B%BChy; cna=B129FQUBcC0CAXoEOzNJdBZo; ali_apache_track=c_mid=b2b-1791803016|c_lid=%E9%87%8E%E7%8B%BChy|c_ms=1; UM_distinctid=16cb31d9d5fbdc-0f653114eac331-SD; _is_show_loginId_chang-gsd6_false; __rn_alert__=false; isg=BD4-dSADSADs; l=dSD-sdSD-VC..",
        "user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
        'CLIENT-IP:'.$fakeIp,
        'X-FORWARDED-FOR:'.$fakeIp,
    ];

    $curl = curl_init();
    curl_setopt_array($curl, [
        CURLOPT_HTTPHEADER     => $requestHeaders,
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER         => 0,
        CURLOPT_ENCODING       => "",
        CURLOPT_FOLLOWLOCATION => 1,
    ]);

    $response = curl_exec($curl);
    curl_close($curl);

    return $response;
}
天猫店铺（店铺名和掌柜通过以下选择器获取）：
// Tmall shop page: the shop name and the owner are read directly from the
// loaded phpQuery document through CSS selectors.
$shopNameNode = pq("#shopExtra>.slogo>.slogo-shopname>strong");
$shop_name = $shopNameNode->text();

$ownerNode = pq(".extend>ul>.shopkeeper>.right>a");
$owner = $ownerNode->text();