queryList爬虫获取内容的几种方法总结 queryList给抓取的内容增加html追加元素html 代码实例...

最新推荐文章于 2023-06-30 15:10:43 发布

weixin_30954265

最新推荐文章于 2023-06-30 15:10:43 发布

阅读量3.5k

点赞数 1

文章标签：爬虫 javascript java ViewUI

原文链接：http://www.cnblogs.com/stillstep/p/10953512.html

版权

//简略内容:
1.
$data1 = $ql->find('.two img')->map(function($item){
    return $item->alt;
});
// 等价下面这句话
$data2 = $ql->find('.two img')->attrs('alt');

2.
$texts = $ql->find('.two>a')->texts();
$htmls = $ql->find('#one span')->htmls();

3.
$ql = QueryList::get('https://www.ithome.com/html/discovery/358585.htm');
$rt = [];
// 采集文章标题
$rt['title'] = $ql->find('h1')->text();

4.采集列表所有  用all
$rt = QueryList::get($url)->rules($rules)->query()->getData();
print_r($rt->all());
//QueryList V4.0.4版本新增了一个queryData()语法糖来简化这种操作:
$rt = QueryList::get($url)->rules($rules)->queryData();
// queryData()方法等同于query()->getData()->all()

// Appending HTML to scraped content: adding an element to a matched node.
// Build a QueryList document from the raw HTML string.
$ql = QueryList::html($html);
// Get the first div element object.
$div = $ql->find('div:eq(0)');
// Append an img element to that div.
// (This is covered on the "element manipulation" docs page, not the
// "result processing" page: http://www.querylist.cc/docs/guide/v4/modify-dom)
$div->append('<img src="1.jpg" />');

//获取HTTP响应头等信息
use GuzzleHttp\Client;

$client = new Client();
$response = $client->get('http://httpbin.org/get');
// 获取响应头部信息
$headers = $response->getHeaders();

//内容过滤
// 采集正文内容
$eles = QueryList::html($html)->find('#content');
// 选择正文内容中要移除的元素，并移除
$eles->find('.tt,span:last,p:last')->remove();
//获取纯净的正文内容
$content = $eles->html();

$rt = QueryList::rules($rules)->html($html)->query()->getData();

$rt = QueryList::rules($rules)
    ->html($html)
    ->query()
    ->getData(function($item){
        $ql = QueryList::html($item['content']);
        $ql->find('.tt,span:last,p:last')->remove();
        $item['content'] = $ql->find('')->html();
        return $item;
    });

//QueryList内置的HTTP客户端
//更多高级参数
//还可以携带更多高级参数，如：设置超时时间、设置代理等。
$ql = QueryList::get('http://httpbin.org/get',[
    'param1' => 'testvalue',
    'params2' => 'somevalue'
],[
                         // 设置代理
                         'proxy' => 'http://222.141.11.17:8118',
                         //设置超时时间，单位：秒
                         'timeout' => 30,
                         'headers' => [
                             'Referer' => 'https://querylist.cc/',
                             'User-Agent' => 'testing/1.0',
                             'Accept'     => 'application/json',
                             'X-Foo'      => ['Bar', 'Baz'],
                             'Cookie'    => 'abc=111;xxx=222'
                         ]
                     ]);

//使用文件缓存驱动
//// 缓存文件夹路径
//$cache_path = __DIR__.'/temp/';
$ql = QueryList::get($url,null,[
    'cache' => $cache_path,
    'cache_ttl' => 600 // 缓存有效时间，单位：秒，可以不设置缓存有效时间
]);

//并发请求（multiGet）：简单用法
//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];

QueryList::multiGet($urls)
    ->success(function(QueryList $ql,Response $response, $index) use($urls){
        echo 'Current url: '.$urls[$index]."\r\n";
        $data = $ql->find('h3>a')->texts();
        print_r($data->all());
    })->send();

//更高级的用法

//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];
//
//$rules = [
//    'name' => ['h3>a','text'],
//    'desc' => ['.py-1','text']
//];
//$range = '.repo-list>li';
QueryList::rules($rules)
    ->range($range)
    ->multiGet($urls)
    // 设置并发数为2
    ->concurrency(2)
    // 设置GuzzleHttp的一些其他选项
    ->withOptions([
                      'timeout' => 60
                  ])
    // 设置HTTP Header
    ->withHeaders([
                      'User-Agent' => 'QueryList'
                  ])
    // HTTP success回调函数
    ->success(function (QueryList $ql, Response $response, $index){
        $data = $ql->queryData();
        print_r($data);
    })
    // HTTP error回调函数
    ->error(function (QueryList $ql, $reason, $index){
        // ...
    })
    ->send();

//详细版
1.
$data1 = $ql->find('.two img')->map(function($item){
    return $item->alt;
});
// 等价下面这句话
$data2 = $ql->find('.two img')->attrs('alt');

print_r($data1->all());

2.
$texts = $ql->find('.two>a')->texts();
$htmls = $ql->find('#one span')->htmls();

print_r($texts->all());

3.
use QL\QueryList;

$ql = QueryList::get('https://www.ithome.com/html/discovery/358585.htm');

$rt = [];
// 采集文章标题
$rt['title'] = $ql->find('h1')->text();

4.采集列表所有  用all
//use QL\QueryList;

//$url = 'https://www.ithome.com/html/discovery/358585.htm';
//// 定义采集规则
//$rules = [
//    // 采集文章标题
//    'title' => ['h1','text'],
//    // 采集文章作者
//    'author' => ['#author_baidu>strong','text'],
//    // 采集文章内容
//    'content' => ['.post_content','html']
//];
$rt = QueryList::get($url)->rules($rules)->query()->getData();
print_r($rt->all());

$rt = QueryList::get($url)->rules($rules)->query()->getData();
print_r($rt->all());
//QueryList V4.0.4版本新增了一个queryData()语法糖来简化这种操作:

$rt = QueryList::get($url)->rules($rules)->queryData();
// queryData()方法等同于query()->getData()->all()


//QueryList内置的HTTP客户端
//更多高级参数
//还可以携带更多高级参数，如：设置超时时间、设置代理等。

$ql = QueryList::get('http://httpbin.org/get',[
    'param1' => 'testvalue',
    'params2' => 'somevalue'
],[
                         // 设置代理
                         'proxy' => 'http://222.141.11.17:8118',
                         //设置超时时间，单位：秒
                         'timeout' => 30,
                         'headers' => [
                             'Referer' => 'https://querylist.cc/',
                             'User-Agent' => 'testing/1.0',
                             'Accept'     => 'application/json',
                             'X-Foo'      => ['Bar', 'Baz'],
                             'Cookie'    => 'abc=111;xxx=222'
                         ]
                     ]);

//使用文件缓存驱动
//// 缓存文件夹路径
//$cache_path = __DIR__.'/temp/';
// Fetch $url with the file cache enabled; $cache_path is the cache folder path.
// Fixed: the original read `$ql =  = QueryList::get(...)`, which is a parse error.
$ql = QueryList::get($url,null,[
    'cache' => $cache_path,
    'cache_ttl' => 600 // cache lifetime in seconds; this option may be omitted
]);

//并发请求（multiGet）：简单用法
//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];

QueryList::multiGet($urls)
    ->success(function(QueryList $ql,Response $response, $index) use($urls){
        echo 'Current url: '.$urls[$index]."\r\n";
        $data = $ql->find('h3>a')->texts();
        print_r($data->all());
    })->send();

//更高级的用法

//use GuzzleHttp\Psr7\Response;
//use QL\QueryList;
//
//$urls = [
//    'https://github.com/trending/go?since=daily',
//    'https://github.com/trending/html?since=daily',
//    'https://github.com/trending/java?since=daily'
//];
//
//$rules = [
//    'name' => ['h3>a','text'],
//    'desc' => ['.py-1','text']
//];
//$range = '.repo-list>li';
QueryList::rules($rules)
    ->range($range)
    ->multiGet($urls)
    // 设置并发数为2
    ->concurrency(2)
    // 设置GuzzleHttp的一些其他选项
    ->withOptions([
                      'timeout' => 60
                  ])
    // 设置HTTP Header
    ->withHeaders([
                      'User-Agent' => 'QueryList'
                  ])
    // HTTP success回调函数
    ->success(function (QueryList $ql, Response $response, $index){
        $data = $ql->queryData();
        print_r($data);
    })
    // HTTP error回调函数
    ->error(function (QueryList $ql, $reason, $index){
        // ...
    })
    ->send();

//获取HTTP响应头等信息
use GuzzleHttp\Client;

$client = new Client();
$response = $client->get('http://httpbin.org/get');
// 获取响应头部信息
$headers = $response->getHeaders();

//内容过滤
// 采集正文内容
$eles = QueryList::html($html)->find('#content');
// 选择正文内容中要移除的元素，并移除
$eles->find('.tt,span:last,p:last')->remove();
//获取纯净的正文内容
$content = $eles->html();

//$rules = [
//    // 移除内容中所有的超链接，但保留超链接的内容，并移除内容中所有p标签，但保留p标签的内容
//    'content_html' => ['#content','html','a p'],
//    // 保留内容中的超链接，以及保留p标签及内容
//    'content_text' => ['#content','text','a p'],
//];

$rt = QueryList::rules($rules)->html($html)->query()->getData();

//
//$rules = [
//    'content' => ['#content','html']
//];

$rt = QueryList::rules($rules)
    ->html($html)
    ->query()
    ->getData(function($item){
        $ql = QueryList::html($item['content']);
        $ql->find('.tt,span:last,p:last')->remove();
        $item['content'] = $ql->find('')->html();
        return $item;
    });

// Collect the alt attribute of every image under .two using map().
$data1 = $ql->find('.two img')->map(function($item){
    return $item->alt;
});
// The map() call above is equivalent to the following line.
$data2 = $ql->find('.two img')->attrs('alt');
print_r($data1->all());

// texts() / htmls() gather the text / HTML of every matched element.
$texts = $ql->find('.two>a')->texts();
$htmls = $ql->find('#one span')->htmls();
print_r($texts->all());

转载于:https://www.cnblogs.com/stillstep/p/10953512.html

weixin_30954265

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
queryList爬虫获取内容的几种方法总结 queryList给抓取的内容增加html追加元素html 代码实例...

//简略内容:1.$data1 = $ql->find('.two img')->map(function($item){ return $item->alt;});// 等价下面这句话$data2 = $ql->find('.two img')->attrs('alt');2.$texts = $ql->find(...
复制链接

扫一扫