PHPquery php爬虫

最新推荐文章于 2024-01-05 15:25:56 发布

River_Crab

最新推荐文章于 2024-01-05 15:25:56 发布

阅读量1.3k

点赞数

文章标签： php爬虫

本文链接：https://blog.csdn.net/River_Crab/article/details/78253065

版权

PHPQuery

// Pull in the phpQuery library so phpQuery:: and pq() are available.
include 'phpQuery.php';

// Load a local XHTML file and select its <p> elements; a bare pq() call
// afterwards queries the most recently selected document.
phpQuery::newDocumentFileXHTML('my-xhtml.html')->find('p');
$ul = pq('ul');

$url = 'http://www.baidu.com';

// Fetch the raw page source as a string.
$data = file_get_contents($url);

// Load the same URL as a phpQuery document; this replaces the raw string
// held in $data and makes the page the document pq() works on below.
$data = phpQuery::newDocumentFile($url);

echo pq("title")->text();   // Print the page title.
echo pq("div#header")->html();  // Print the inner HTML of <div id="header">.

// Build one record per <li> under ".articleList2 ul": href, content, title and time.
$proxyArray = [];
foreach (pq('.articleList2 ul li', $doc) as $liOne) {
    $proxyOne = [];

    // Every <a> inside the item overwrites the same keys, so the last anchor wins.
    foreach (pq('a', $liOne) as $aOne) {
        $linkText = pq($aOne)->text();
        $proxyOne['href'] = $base . $aOne->getAttribute('href');
        $proxyOne['content'] = getContent($proxyOne['href']);
        $proxyOne['title'] = trim($linkText);
    }

    // The <span> text is stripped of surrounding '[' / ']' characters
    // before being converted to a Unix timestamp.
    foreach (pq('span', $liOne) as $spanOne) {
        $stamp = trim(pq($spanOne)->text(), '[]');
        $proxyOne['time'] = strtotime($stamp);
    }

    $proxyArray[] = $proxyOne;
}

phpQuery::newDocument($html, $contentType = null) 根据标记（markup）字符串新建一个文档。如果 $contentType 为空，则根据标记内容自动检测编码；检测失败时，按 text/html 类型处理并使用 utf-8 编码。
phpQuery::newDocumentFile($file, $contentType = null) 根据文件新建一个文档。类似于newDocument()
phpQuery::newDocumentHTML($html, $charset = 'utf-8')
phpQuery::newDocumentXHTML($html, $charset = 'utf-8')
phpQuery::newDocumentXML($html, $charset = 'utf-8')
phpQuery::newDocumentPHP($html, $contentType = null) 
phpQuery::newDocumentFileHTML($file, $charset = 'utf-8')
phpQuery::newDocumentFileXHTML($file, $charset = 'utf-8')
phpQuery::newDocumentFileXML($file, $charset = 'utf-8')
phpQuery::newDocumentFilePHP($file, $contentType)

pq($param, $context = null);
pq() 相当于 jQuery 的 $()。它主要完成三件事情：
1. 载入标记资源：
输入到载入的文档: 
输入字符串的开头不接受文本类型的节点：pq('<div/>')
根据 `$pq->getDocumentID()` 返回的 ID 载入到对应文档：`pq('<div/>', $pq->getDocumentID())`
// 根据DOM节点的归属将同样的文档载入：pq('<div/>', DOMNode)
// 从phpQuery 对象载入文档: pq('<div/>', $pq)
2. 运行查询
// 根据最后一个选择的文档执行查询：pq('div.myClass')
// 根据$pq->getDocumentID()的ID从文档中进行查询：pq('div.myClass', $pq->getDocumentID())
// 在同样的文档上根据DOM节点的归属进行查询并且使用节点作为查询的根节点：pq('div.myClass', DOMNode)
// 在文档上使用phpQuery对象进行查询
// 同时使用对象的栈作为根节点进行查询: pq('div.myClass', $pq) 
3. 使用phpQuery对象对DOM节点进行原型化操作
foreach(pq('li') as $li) // $li 是纯 DOM 节点，将它变为 phpQuery 对象：pq($li);

/**
 * Fetch a page with cURL and return the response body.
 *
 * The request advertises gzip/deflate support, sends a fixed desktop Chrome
 * User-Agent, and uses "http://<host of $url>" as the Referer when the URL
 * has a host component. No timeout is configured.
 *
 * @param string      $url    Address to request.
 * @param bool        $https  When exactly true, TLS peer and host verification
 *                            are turned off.
 * @param bool        $proxy  When exactly true, the request goes through the
 *                            hard-coded proxy 61.191.41.130:80.
 * @param string      $method 'post' (any letter case) sends a POST with $data;
 *                            anything else leaves cURL's default method.
 * @param mixed       $data   POST payload handed to CURLOPT_POSTFIELDS.
 *
 * @return string|false Response body, or false when the handle cannot be
 *                      created or the transfer fails.
 */
function request($url, $https = true, $proxy = false, $method = 'get', $data = null)
{
    // 1. Initialise the handle; bail out instead of calling curl_* on false.
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }

    // 2. Configure the transfer.
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Accept compressed responses.
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');

    // Derive the Referer from the URL's host. parse_url() yields null/false
    // when there is no usable host, in which case no Referer is sent rather
    // than a meaningless "http://".
    $host = parse_url($url, PHP_URL_HOST);
    if (is_string($host) && $host !== '') {
        curl_setopt($ch, CURLOPT_REFERER, 'http://' . $host);
    }
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36');

    // Optional fixed proxy.
    if ($proxy === true) {
        curl_setopt($ch, CURLOPT_PROXY, '61.191.41.130');
        curl_setopt($ch, CURLOPT_PROXYPORT, 80);
    }

    // NOTE(review): this disables certificate and host-name verification,
    // which exposes the request to man-in-the-middle attacks. Kept because
    // callers rely on the default; pass $https = false to keep verification.
    if ($https === true) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    // POST support; the method name is compared case-insensitively so that
    // 'POST' is not silently sent as a GET.
    if (is_string($method) && strtolower($method) === 'post') {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    }

    // 3. Send the request.
    $content = curl_exec($ch);

    // 4. Release the handle.
    curl_close($ch);

    return $content;
}