PHP 爬虫：踩过的几个坑（未完）

最新推荐文章于 2024-06-25 14:23:25 发布

luochengquan

最新推荐文章于 2024-06-25 14:23:25 发布

阅读量1.4k

点赞数

分类专栏： php 文章标签：爬虫 php xml dom

本文链接：https://blog.csdn.net/luochengquan/article/details/66971586

版权

php 专栏收录该内容

14 篇文章 0 订阅

订阅专栏

零、用 PHP 做爬虫的优势：可以直接集成到已有的 PHP 网站中。

一、屏蔽错误

// Have libxml collect parse errors internally instead of emitting them as
// PHP warnings.
libxml_use_internal_errors(true);

二、创建URL请求

/**
 * Fetch a URL with cURL and return the response body.
 *
 * Sends a GET request by default. When $post_data is not null the request is
 * sent as a POST with $post_data as its fields.
 *
 * NOTE(review): TLS peer and host verification are switched off below, so
 * HTTPS responses are not authenticated. This is kept unchanged from the
 * original behaviour; re-enable verification if the targets allow it.
 *
 * @param string            $url       Address to request.
 * @param array|string|null $post_data POST fields, or null for a plain GET.
 *
 * @return string|bool Response body, or false when curl_exec() fails.
 */
public function curl($url,$post_data=null)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        // Do not include response headers in the returned string.
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

        if ( !is_null($post_data) ) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
        }

        $html = curl_exec($ch);
        curl_close($ch);

        // The original returned an undefined $data variable, so callers
        // always received null; return the fetched body instead.
        return $html;
    }/*curl*/

三、解析键名缺少引号的json数据

/**
 * Decode JSON-like text whose object keys may be missing their quotes.
 *
 * Unquoted keys (for example {a:1,b:"x"}) are wrapped in double quotes before
 * the text is handed to json_decode(). Double-quoted string literals are
 * skipped untouched, so colons inside values such as "http://..." or "12:30"
 * are never mistaken for key separators.
 *
 * @param string $str  JSON or JSON-like text.
 * @param bool   $mode Passed to json_decode() as its second argument: true
 *                     returns associative arrays, false returns objects.
 *
 * @return mixed Whatever json_decode() returns; null when the text still
 *               cannot be decoded.
 */
public function ext_json_decode($str, $mode=true)
    {
        // First alternative: a complete double-quoted string, consumed so that
        // nothing inside it can match. Second alternative: a bare key that
        // directly follows "{" or ",".
        $pattern = '/"(?:[^"\\\\]|\\\\.)*"|([{,]\s*)([\w$]+)(\s*):/s';

        $quoted = preg_replace_callback($pattern, function ($m) {
            if (!isset($m[2])) {
                // Matched a string literal: leave it exactly as it was.
                return $m[0];
            }
            return $m[1] . '"' . $m[2] . '"' . $m[3] . ':';
        }, $str);

        // preg_replace_callback() yields null on a PCRE error; fall back to
        // the untouched input in that case.
        if ($quoted !== null) {
            $str = $quoted;
        }

        return json_decode($str, $mode);
    }

四、用 SimpleXML 直接解析 HTML 文档时，碰到特殊字符经常解析失败；DOMDocument 的容错能力更强。

/* Convert an HTML string into a SimpleXML element. */
    public function html_to_xml($html)
    {
        // A UTF-8 charset declaration is placed in front of the markup before
        // parsing (presumably so the parser treats the input as UTF-8).
        $charset_tag = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>';

        $document = new DOMDocument();
        $document->loadHTML($charset_tag . $html);

        // Hand the parsed DOM tree over to SimpleXML.
        return simplexml_import_dom($document);
    }

五、用 DOMDocument 操作数据，用 SimpleXML 查找节点，两者交替使用更方便，用过才会明白。

/**
 * Collect link titles from a parsed page.
 *
 * Selects every <div> that is a direct child of a <div class="p">, converts
 * each match to a DOM element, and for each <div> nested inside it reads the
 * "title" attribute of the first <a> it contains.
 *
 * @param SimpleXMLElement $xml Parsed document to search.
 *
 * @return array List of array('title' => string) entries, in document order;
 *               empty when nothing matches.
 */
function parse_xml($xml) {

	$videos = array();

	$xml_items = $xml->xpath('//div[@class="p"]/div');

	// xpath() gives an empty or non-array result when nothing matches or the
	// query fails; there is nothing to walk in that case.
	if (empty($xml_items)) {
		return $videos;
	}

	// The original iterated an undefined $xml_list here, so no node was ever
	// visited.
	foreach ($xml_items as $xml_item) {

		$node = dom_import_simplexml($xml_item);

		foreach ($node->getElementsByTagName('div') as $item) {

			$anchor = $item->getElementsByTagName('a')->item(0);

			// A <div> without any <a> would otherwise trigger a call on null.
			if ($anchor === null) {
				continue;
			}

			$videos[] = array('title' => $anchor->getAttribute('title'));

		}

	}

	return $videos;

} /*parse_xml*/