php采集器分析功能实现

最新推荐文章于 2022-07-30 22:21:56 发布

musttieying

最新推荐文章于 2022-07-30 22:21:56 发布

阅读量2.4k

点赞数

分类专栏： HTML+CSS+JS php编程 Ajax 文章标签： php url buffer 正则表达式 function fp

本文链接：https://blog.csdn.net/musttieying/article/details/1719091

版权

HTML+CSS+JS 同时被 3 个专栏收录

67 篇文章 1 订阅

订阅专栏

php编程

44 篇文章 0 订阅

订阅专栏

Ajax

4 篇文章 0 订阅

订阅专栏

php采集器分析功能实现

a)URL地址分析
根据数据库中采集规则定义的字段，URL地址分为单页地址、多页地址、XX页—XX页地址三种形式。
// Flow control: build $url, the list of list-page addresses, from the collection rule.
// `URL地址形式` and `URL地址` are the article's placeholders for the rule's
// "URL form" and "URL address" values; substitute the real fields when using this.
switch (URL地址形式) {
    case 1:
        // Single page: the rule holds exactly one address.
        $url[0] = URL地址;
        break;

    case 2:
        // Multiple pages: one address per line.
        // The separator must be the newline escape "\n"; the two-character
        // string "/n" never matches a line break, so nothing would be split.
        $manypage = explode("\n", $row->URL地址);
        $url = $manypage;
        break;

    case 3:
        // Page range: the address is split around the '[分页]' marker and the
        // page number is inserted between the two halves.
        $rowurl = explode('[分页]', $row->URL地址);
        $a_num = 0;
        for ($a = $a_num; $a <= $row->结束页; $a++) {
            $url[$a] = $rowurl[0] . $a . $rowurl[1];
            if ($a == 0) {
                // Slot 0 uses the configured start page instead of the number 0.
                $url[$a] = $rowurl[0] . $row->url_start . $rowurl[1];
            }
        }
        break;
}

b)内容页面链接分析
// Load the list page at $url; "@" suppresses the warning raised when the fetch fails.
// NOTE(review): section (a) builds $url as an array, so this presumably runs once per element — confirm.
$handles = @file_get_contents($url);
// Match every link on the page; `链接规则` is the article's placeholder for the link-rule regex body.
preg_match_all ("/".链接规则."/is",$handles,$matches);
// Write the content-page link addresses that were found into the `links` table.
// NOTE(review): the statement as published stops after the column list — no VALUES clause is shown.
INSERT INTO `links` ( `title` , `url` , `rules` , `date` )
c)延时函数
// Reference: http://cn.php.net/manual/zh/ref.curl.php
// Fetch $url through cURL and keep the response body in $handles.
$timeout = 10; // set to zero for no timeout
$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => 1,        // return the body instead of printing it
    CURLOPT_CONNECTTIMEOUT => $timeout, // limits the connection phase only
));
$handles = curl_exec($ch);
curl_close($ch);
d)分页分析：全部列出形式
// Pagination, "all pages listed" form: find the pagination area of $buffer,
// fetch every page it links to and append each page's matched content to $cont.
// `分页规则` / `内容规则` are the article's placeholders for the pagination-rule
// and content-rule regex bodies; capture group 1 of each is what gets used.
if (preg_match("/" . 分页规则 . "/is", $buffer, $regs2) && isset($regs2[1])) {
    // Collect every href/value attribute inside the pagination area.
    // Group 3 is the address, with or without surrounding quotes.
    preg_match_all('/<[^<>]*(href|value)=("|\')?([^\'"<>]*)("|\')?[^<>]*>/is', $regs2[1], $regs3);

    // The same page may be linked more than once; fetch each address only once.
    $pageLinks = array_unique($regs3[3]);

    // foreach avoids the out-of-range read that "$i <= count(...)" caused.
    foreach ($pageLinks as $pageLink) {
        // Turn the extracted link into a fetchable address.
        $gethttp = $string->gethttp($pageLink);
        $buffer2 = @file_get_contents($gethttp);

        // Fall back to cURL, with a connect timeout, when the plain fetch returned nothing.
        if (empty($buffer2)) {
            if ($phpcurl_init == "yes") {
                $ch = curl_init();
                $timeout = 10; // set to zero for no timeout
                // Fetch the same resolved address that file_get_contents() was given.
                curl_setopt($ch, CURLOPT_URL, $gethttp);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
                $buffer2 = curl_exec($ch);
                curl_close($ch);
            }
        }

        // Append the content only when a page was fetched and the rule matched.
        if (is_string($buffer2) && preg_match("/" . 内容规则 . "/is", $buffer2, $regss) && isset($regss[1])) {
            $cont .= $regss[1];
        }
    }
}
e)分页分析：上下页形式
// Pagination, "previous / next page" form: follow the next-page link found by
// the pagination rule, appending each page's matched content to $cont, until
// no further link is found.
// `分页规则` is the article's placeholder for the pagination-rule regex body;
// capture group 1 must be the next-page address.
if (preg_match("/" . 分页规则 . "/is", $buffer, $ljregs)) {
    // Addresses already fetched, so that pages linking to each other cannot loop forever.
    $ljvisited = array();

    while (isset($ljregs[1]) && $ljregs[1] != "") {
        // Turn the extracted link into a fetchable address.
        $ljregs[1] = $string->gethttp($ljregs[1]);

        if (isset($ljvisited[$ljregs[1]])) {
            break;
        }
        $ljvisited[$ljregs[1]] = true;

        $buffer = @file_get_contents($ljregs[1]);

        // Fall back to cURL, with a connect timeout, when the plain fetch returned nothing.
        if (empty($buffer)) {
            if ($phpcurl_init == "yes") {
                $ch = curl_init();
                $timeout = 10; // set to zero for no timeout
                curl_setopt($ch, CURLOPT_URL, $ljregs[1]);
                curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
                curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
                $buffer = curl_exec($ch);
                curl_close($ch);
            }
        }

        // Nothing could be fetched: there is no content and no next link to follow.
        if (!is_string($buffer) || $buffer === '') {
            break;
        }

        if (preg_match("/" . $body_rule . "/is", $buffer, $regs) && isset($regs[1])) {
            $cont .= $regs[1];
        }

        // Look for the next-page link in the page just fetched. Without this
        // the loop condition never changes and the same page is fetched forever.
        if (!preg_match("/" . 分页规则 . "/is", $buffer, $ljregs)) {
            break;
        }
    }
}
f)Cookie部分
// Usage: `url` and `cookie内容` are the article's placeholders for the target
// address and the cookie content.
$string->fileFsock(url, cookie内容);

/**
 * Send a request carrying a Cookie header to an http:// address over a raw socket.
 *
 * The request is written and the socket is closed straight away; the response
 * is not read. Connection errors are echoed rather than thrown.
 *
 * @param string $url    Absolute http:// address; host and path are taken from it.
 * @param mixed  $cookie Passed through $this->getCookie() before being sent
 *                       (that helper is not shown here — presumably it returns
 *                       the Cookie header value; confirm).
 */
function fileFsock($url, $cookie)
{
    set_time_limit(0);
    $cookie = $this->getCookie($cookie);

    // Group 2 is the host, group 3 is everything after it (path and query).
    // "#" is used as the delimiter so the slashes need no escaping.
    if (!preg_match('#^(http://)([^/]+)(.*)#i', $url, $matches)) {
        echo "Unsupported URL (expected http://host/path) \n";
        return;
    }

    $host = $matches[2];
    // A bare "http://host" has no path; the request line still needs one.
    $path = $matches[3] !== '' ? $matches[3] : '/';

    $fp = fsockopen($host, 80, $errno, $errstr, 30);
    if (!$fp) {
        echo "$errstr ($errno) \n";
    } else {
        // Header lines must end with CRLF, and a blank line ends the header block.
        $out  = "POST $path HTTP/1.1\r\n";
        $out .= "Host:$host\r\n";
        $out .= "Cookie: " . $cookie . "\r\n";
        $out .= "Connection: Close\r\n\r\n";
        fputs($fp, $out);
        fclose($fp);
    }
}
g)其他相关函数：给无HOST头的网址加入HOST
function gethttp($url) { $url = trim($url); if(!preg_match ('/http:/i',$url,$out)) { if(preg_match ('/^(/.{2})//(.+)?/i',$url,$out)) //带有目录的网址加+网址 { return $this->outurl[3].$out[2];//有目录的或有/的 }