<?php
/**
 * Multi-process spider for the natural (organic) results of Baidu result
 * pages served from m.baidu.com: title, abstract, image, link and source.
 *
 * The master process forks `worker_process` children. Each child fetches a
 * contiguous slice of result pages, extracts the result blocks with regular
 * expressions and passes what it found to the optional `dataHandler` callback.
 *
 * @since 2016-04-15
 */
class NaturalResultSpider {
    /** Exit status a worker reports after it has processed its slice. */
    const WORKER_EXIT_OK = 100;

    /** @var string|null Search query to crawl. */
    private $_strQuery = null;

    /** @var int Number of worker processes to fork. */
    public $worker_process = 4;

    /** @var array Pids of forked workers that have not been confirmed finished (pid => pid). */
    private $_arrPids = array();

    /** @var int Number of result pages to crawl. */
    private $_intPageNum;

    /** @var array Not read or written by this class. */
    public $arrAllResult = array();

    /** @var callable|null Hook invoked in every worker with that worker's results. */
    public $dataHandler = null;

    /** @var int|null Pid of the process that called execute(). */
    private $masterPid = null;

    /** @var int Maximum number of fetch attempts per page. */
    private $retry_times = 1;

    /** @var string Regex that cuts a result page into individual result blocks. */
    private $strReg = '/<div\sclass="result\sc-result\sc-clk-recommend"(.*)?>(.*)?(<img\ssrc="(.*)?">)?(.*)?(<p\sclass="c-line-clamp3\sc-color">(.*)?)+<\/div>/Uis';

    /**
     * Field extractors applied to every result block. Each entry holds the
     * output key ('name'), the regex ('reg') and the capture group to keep
     * ('location'). The list is static, so it is shared by all instances.
     *
     * @var array
     */
    private static $_arrPattern = array(
        array('name'=>'nature_result', 'reg'=>'/data-log=\"(.*?)\"/', 'location'=>1),
        array('name'=>'title', 'reg'=>'/<h3(.*?)>(.*?)<\/h3>/', 'location'=>2),
        array('name'=>'abstract', 'reg'=>'/<p class=\"c-line-clamp3 c-color\">(.*?)<\/p>/', 'location'=>1),
        array('name'=>'source_url', 'reg'=>'/<div class=\"c-showurl c-line-clamp1\"><span>(.*?)<\/span>/', 'location'=>1),
        array('name'=>'url', 'reg'=>'/<div class=\"c-container\"><a(.*?)class=\"c-blocka\" href=\"(.*?)\">/', 'location'=>2),
        array('name'=>'img', 'reg'=>'/<div class=\"c-img c-img-s\"><img data-imagedelaysrc=\"(.*?)\"/', 'location'=>1),
    );

    /**
     * @param string $strQuery   Search query.
     * @param int    $intPageNum Number of result pages to crawl.
     */
    public function __construct($strQuery, $intPageNum=76) {
        $this->_strQuery = $strQuery;
        $this->_intPageNum = $intPageNum;
    }

    /**
     * Runs the crawl: records the master pid, forks the workers and waits
     * for them to finish.
     */
    public function execute() {
        $this->setMasterPid();
        $this->forkWorker();
        $this->monitorWorker();
    }

    /** Remembers the pid of the calling process as the master pid. */
    private function setMasterPid() {
        $this->masterPid = posix_getpid();
    }

    /**
     * Sets the number of worker processes.
     *
     * @param int $intWorkerProcess Positive number of workers.
     * @return bool False (and no change) when the value is not a positive number.
     */
    public function setWorkerProcess($intWorkerProcess) {
        if (!is_numeric($intWorkerProcess) || (int)$intWorkerProcess <= 0) {
            return false;
        }
        $this->worker_process = (int)$intWorkerProcess;
        return true;
    }

    /**
     * Sets the maximum number of fetch attempts per page.
     *
     * @param int $intTimes Positive number of attempts.
     * @return bool False (and no change) when the value is not a positive number.
     */
    public function setRetryTimes($intTimes) {
        if (!is_numeric($intTimes) || (int)$intTimes <= 0) {
            return false;
        }
        $this->retry_times = (int)$intTimes;
        return true;
    }

    /**
     * Replaces the regex that cuts a page into result blocks.
     *
     * @param string $strReg Complete PCRE pattern including delimiters.
     * @return bool False (and no change) when the pattern is empty or not a string.
     */
    public function setRegPattern($strReg) {
        if (empty($strReg) || !is_string($strReg)) {
            return false;
        }
        $this->strReg = $strReg;
        return true;
    }

    /**
     * Appends one field extractor to the shared pattern list.
     *
     * @param array $arrPattern Array with the keys 'name', 'reg' and 'location'.
     * @return bool False (and no change) when the argument is not such an array.
     */
    public function setPattern($arrPattern) {
        if (!is_array($arrPattern) || empty($arrPattern)) {
            return false;
        }
        // An incomplete entry would only fail later, inside getNaturalResult().
        if (!isset($arrPattern['name'], $arrPattern['reg'], $arrPattern['location'])) {
            return false;
        }
        self::$_arrPattern[] = $arrPattern;
        return true;
    }

    /**
     * Waits in the master process for every forked worker and forgets the
     * pids of those that exited with WORKER_EXIT_OK.
     */
    private function monitorWorker() {
        if ($this->masterPid !== posix_getpid()) {
            return;
        }
        foreach ($this->_arrPids as $intPid) {
            $status = 0;
            // Block until the child has terminated; a merely stopped child
            // carries no exit status worth inspecting.
            if (pcntl_waitpid($intPid, $status) <= 0) {
                continue;
            }
            if (pcntl_wifexited($status) && pcntl_wexitstatus($status) === self::WORKER_EXIT_OK) {
                unset($this->_arrPids[$intPid]);
            }
        }
    }

    /**
     * Main entry of the crawl: forks `worker_process` children. Each child
     * crawls its slice, feeds the result to `dataHandler` (when set) and
     * exits with WORKER_EXIT_OK; the parent records the child pids.
     */
    public function forkWorker() {
        for ($i = 0; $i < $this->worker_process; ++$i) {
            $pid = pcntl_fork();
            if ($pid === -1) {
                error_log('NaturalResultSpider: pcntl_fork() failed for worker ' . $i);
                exit(1);
            }
            if ($pid > 0) {
                // Parent: remember the child and fork the next one.
                $this->_arrPids[$pid] = $pid;
                continue;
            }
            // Child: do the work, report it, and never return to the fork loop.
            $arrResult = $this->run($i);
            if ($this->dataHandler) {
                if (is_callable($this->dataHandler)) {
                    call_user_func($this->dataHandler, $arrResult);
                } else {
                    error_log('NaturalResultSpider: dataHandler is not callable');
                }
            }
            exit(self::WORKER_EXIT_OK);
        }
    }

    /**
     * Crawls the slice of pages assigned to one worker.
     *
     * Pages are split into `worker_process` contiguous slices of
     * ceil(pages / workers) pages; the last slices may be shorter or empty.
     *
     * @param int $intWorkerId Zero-based worker index.
     * @return array Extracted results keyed by page index; pages that produced
     *               nothing after all attempts are absent.
     */
    private function run($intWorkerId) {
        $arrResult = array();
        $intTotal = (int)$this->_intPageNum;
        $intPage = (int)ceil($intTotal / $this->worker_process);
        $intBegin = $intWorkerId * $intPage;
        $intEnd = min(($intWorkerId + 1) * $intPage, $intTotal);
        $strBaseUrl = 'm.baidu.com/s?word=' . urlencode($this->_strQuery);

        for ($i = $intBegin; $i < $intEnd; ++$i) {
            // The first page carries no offset; later pages advance by 10.
            $strUrl = $strBaseUrl . ($i == 0 ? '' : '&pn=' . $i * 10);
            // Try again when the fetch fails or nothing could be extracted.
            for ($intAttempt = 0; $intAttempt < $this->retry_times; ++$intAttempt) {
                $strHtml = $this->curl($strUrl);
                $arrMatches = $this->getHtmlContent($strHtml);
                $arrNaturalResult = $this->getNaturalResult($arrMatches);
                if (!empty($arrNaturalResult)) {
                    $arrResult[$i] = $arrNaturalResult;
                    break;
                }
            }
        }
        return $arrResult;
    }

    /**
     * Fetches a URL with cURL.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false on a transport error so
     *                      that the caller's retry loop can try again.
     */
    private function curl($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // The whole transfer is capped at 10 seconds, so a longer connect
        // timeout could never take effect.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        $result = curl_exec($ch);
        if ($result === false || curl_errno($ch)) {
            return false;
        }
        return $result;
    }

    /**
     * Cuts a result page into result blocks using the block regex.
     *
     * @param string $strHtml Page HTML.
     * @return array|false Full-match strings of all blocks, or false when the
     *                     input is empty or the regex fails.
     */
    public function getHtmlContent($strHtml) {
        if (empty($strHtml)) {
            return false;
        }
        $arrMatches = array();
        if (preg_match_all($this->strReg, $strHtml, $arrMatches) === false || !isset($arrMatches[0])) {
            return false;
        }
        return $arrMatches[0];
    }

    /**
     * Extracts the configured fields from every result block.
     *
     * For each pattern the first match's configured capture group is kept.
     * The 'nature_result' field has its single quotes turned into double
     * quotes and is JSON-decoded to an array; all other fields have their
     * tags stripped. Fields whose pattern does not match are left out.
     *
     * @param array $arrMatches Result blocks as returned by getHtmlContent().
     * @return array|false Fields per block, keyed like the input, or false
     *                     when the input is empty or not an array.
     */
    public function getNaturalResult($arrMatches) {
        if (empty($arrMatches) || !is_array($arrMatches)) {
            return false;
        }
        $arrNaturalResult = array();
        foreach ($arrMatches as $key => $div) {
            if (!is_string($div)) {
                continue;
            }
            foreach (self::$_arrPattern as $val) {
                $matches = array();
                preg_match_all($val['reg'], $div, $matches);
                if (!isset($matches[$val['location']][0])) {
                    continue;
                }
                // A plain local keeps pattern names such as 'div' or 'key'
                // from overwriting this loop's own variables.
                $mixValue = $matches[$val['location']][0];
                if ($val['name'] === 'nature_result') {
                    $mixValue = json_decode(str_replace('\'', '"', $mixValue), true);
                } else {
                    $mixValue = strip_tags($mixValue);
                }
                $arrNaturalResult[$key][$val['name']] = $mixValue;
            }
        }
        return $arrNaturalResult;
    }
}
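/*
 * Usage:
 *
 * $obj = new NaturalResultSpider($strQuery, $pageNo);
 *     Specifies the query whose search results are crawled and the number of
 *     pages to crawl (at most 76 pages).
 * $obj->setWorkerProcess(4);
 *     Uses 4 processes for crawling.
 * $obj->setRetryTimes(3);
 *     Sets how many times a failing page fetch is attempted.
 * $obj->dataHandler = 'printRes';
 *     Specifies the callback that processes the data.
 * $obj->execute();
 *     Starts the crawl once everything above has been configured.
 */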