#!/usr/bin/python3
# -*- coding:utf-8 -*-
# 中国日报网--搜索结果数据采集 (China Daily search-result scraper)
# 支持所有搜索结果内容提取 (extracts the content of every search result)
import requests
import re
import json
from lxml import etree
from sdk.utils.util_decorate import retry

@retry(retry=3, sleep=5)
def get_html(url, timeout=15):
    """Fetch *url* and return the decoded page text.

    The project-level ``@retry`` decorator re-invokes this function up to
    3 times (5 s apart) on failure and — judging by the callers, which
    index the result with ``["msg"]`` — appears to wrap the return value
    in a dict (NOTE(review): confirm against ``sdk.utils.util_decorate``).

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float, optional
        Socket/connect timeout in seconds. New keyword (default 15) so a
        hung connection fails fast and the retry decorator can actually
        retry instead of blocking forever.

    Returns
    -------
    str
        The page body on HTTP 200, or the sentinel string ``"ERROR"`` on
        any other status code (callers compare against it).
    """
    # Without an explicit timeout, requests.get can hang indefinitely.
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 so response.text decodes the Chinese content correctly.
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    print(response.status_code)
    return "ERROR"

def get_text(text):
    """Strip line breaks, tabs and space characters out of scraped text.

    Accepts either one string or a list of strings; returns the cleaned,
    concatenated string. Any other input type yields ``None`` (the
    original implicit contract, preserved here).
    """
    if isinstance(text, list):
        # Clean each fragment individually, then glue them together.
        return "".join(
            re.sub("\\r|\\n|\\t| | ", "", part).strip(" ") for part in text
        )
    if isinstance(text, str):
        return re.sub("\\r|\\n|\\t| | ", "", text).strip(" ")


def anlise_detail(detail_html, url_head):
    """Parse an article detail page and print its fields, following pagination.

    Parameters
    ----------
    detail_html : str
        HTML of the first page of the article.
    url_head : str
        URL prefix the relative "next page" hrefs are joined onto
        (everything in the article URL before "content").
    """
    # BUG FIX: pagination is now followed iteratively instead of via
    # recursion, so a many-page article cannot hit the recursion limit.
    while True:
        tree = etree.HTML(detail_html)
        for li in tree.xpath('//div[@class="container-left2"]'):
            title = get_text(li.xpath('.//h1[@class="dabiaoti"]/text()'))
            print("标题", title)
            publish_info = get_text(li.xpath('.//div[@class="fenx"]//text()'))
            print("文章信息", publish_info)
            content = get_text(li.xpath('.//div[@id="Content"]//p/text()'))
            print("content", content)
            img_name_list = li.xpath('.//font[@color="blue"]/text()')
            # Relative "../../" image paths are rooted at the
            # world.chinadaily section; absolute https URLs pass through.
            img_list = [
                src if src.startswith("https://")
                else src.replace("../../", "https://world.chinadaily.com.cn/")
                for src in li.xpath('.//div[@id="Content"]//img/@src')
            ]
            # zip() truncates to the shorter list if captions and images
            # do not line up one-to-one (same behavior as before); it is
            # also a no-op on empty lists, so no guard is needed.
            for img_name, img_url in zip(img_name_list, img_list):
                print(img_name, img_url)

        # BUG FIX: the original evaluated this pagination block inside the
        # per-container loop above; it belongs to the page as a whole.
        next_page = tree.xpath('//div[@id="div_currpage"]/a[last()]/text()')
        if not next_page or next_page[0] != "下一页":
            break
        next_page_url = url_head + tree.xpath(
            '//div[@id="div_currpage"]/a[last()]/@href'
        )[0]
        print("next_page_url", next_page_url)
        detail_html = get_html(next_page_url)["msg"]
        if detail_html == "ERROR":
            break



def _run(keywords="女权", page=1):
    """Query the China Daily search API and scrape every result's detail page.

    Parameters
    ----------
    keywords : str, optional
        Search term (the original hard-coded value is the default).
    page : int, optional
        1-based result page of the search API to fetch.
    """
    url = "https://newssearch.chinadaily.com.cn/rest/cn/search?keywords={}&page={}".format(keywords, page)
    response = get_html(url)
    # get_html's wrapped result carries the JSON body under "msg".
    data = json.loads(response["msg"])
    for args in data["content"]:
        title = args["title"]
        detail_url = args["url"]
        print(title, detail_url)
        detail_html = get_html(detail_url)["msg"]
        if detail_html == "ERROR":
            # BUG FIX: the original fed the "ERROR" sentinel straight into
            # the parser; skip articles that could not be fetched.
            continue
        # Relative next-page hrefs are resolved against everything before
        # "content" in the article URL.
        url_head = detail_url.split("content")[0]
        anlise_detail(detail_html, url_head)


if __name__ == '__main__':
    _run()

# (CSDN page boilerplate — vote/favorite buttons and payment-widget text —
#  removed: it was blog-platform extraction residue, not part of the script.)