#!/usr/bin/python3
# -*- coding:utf-8 -*-
# 支持所有搜索结果内容提取 (supports content extraction for all search results)
import requests
import re
import json
from lxml import etree
from sdk.utils.util_decorate import retry
@retry(retry=3, sleep=5)
def get_html(url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Returns the literal string "ERROR" on a non-200 status code.

    NOTE(review): every caller indexes the result with ["msg"], so the
    @retry decorator presumably wraps the return value in a dict such as
    {"msg": ...} — confirm against sdk.utils.util_decorate.

    Args:
        url: absolute URL to request.
        timeout: seconds before the request is aborted. New, defaulted
            parameter — previously the request had NO timeout and could
            block forever on a stalled connection, which would also keep
            @retry from ever re-attempting.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding; the site may not declare a charset header.
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    print(response.status_code)
    return "ERROR"
# Whitespace removed from extracted text: CR, LF, TAB, plus an ASCII
# space and a non-ASCII space variant (the last two alternatives).
# Compiled once instead of re-parsing the pattern on every call.
_WS_RE = re.compile("\\r|\\n|\\t| | ")


def get_text(text):
    """Collapse whitespace out of an lxml ``xpath()`` extraction result.

    Args:
        text: a single string, or a list of strings as returned by
            ``xpath('...//text()')``.

    Returns:
        The cleaned string (list items are cleaned then concatenated).
        Any other input type returns "" — previously the function fell
        through and returned None implicitly, which callers would print
        or concatenate as the text "None".
    """
    if isinstance(text, str):
        return _WS_RE.sub("", text).strip(" ")
    if isinstance(text, list):
        return "".join(_WS_RE.sub("", i).strip(" ") for i in text)
    return ""
def anlise_detail(detail_html, url_head):
    """Print title, metadata, body text and images of one article page.

    Follows the pager recursively while its last anchor reads "下一页",
    resolving relative links against *url_head*.

    Args:
        detail_html: raw HTML of an article detail page.
        url_head: URL prefix used to absolutize the next-page href.
    """
    doc = etree.HTML(detail_html)
    for article in doc.xpath('//div[@class="container-left2"]'):
        print("标题", get_text(article.xpath('.//h1[@class="dabiaoti"]/text()')))
        print("文章信息", get_text(article.xpath('.//div[@class="fenx"]//text()')))
        print("content", get_text(article.xpath('.//div[@id="Content"]//p/text()')))
        # Image captions sit in blue <font> tags; pair them positionally
        # with the <img> sources found in the article body.
        captions = article.xpath('.//font[@color="blue"]/text()')
        sources = []
        for src in article.xpath('.//div[@id="Content"]//img/@src'):
            if src.startswith("https://"):
                sources.append(src)
            else:
                # Relative paths climb two levels; rebase onto the site root.
                sources.append(src.replace("../../", "https://world.chinadaily.com.cn/"))
        if sources:
            for caption, src in zip(captions, sources):
                print(caption, src)
    # Pagination: the pager's last anchor reads "下一页" when more pages exist.
    pager_text = doc.xpath('//div[@id="div_currpage"]/a[last()]/text()')
    if pager_text and pager_text[0] == "下一页":
        next_page_url = url_head + doc.xpath('//div[@id="div_currpage"]/a[last()]/@href')[0]
        print("next_page_url", next_page_url)
        next_detail_page = get_html(next_page_url)["msg"]
        if next_detail_page != "ERROR":
            anlise_detail(next_detail_page, url_head)
if __name__ == '__main__':
    # Search the China Daily news API for one keyword / one result page,
    # then walk every hit's detail page.
    keywords, page = "女权", 1
    search_url = "https://newssearch.chinadaily.com.cn/rest/cn/search?keywords={}&page={}".format(keywords, page)
    search_response = get_html(search_url)
    results = json.loads(search_response["msg"])
    for item in results["content"]:
        title = item["title"]
        keywords = item["keywords"]
        source = item["source"]
        detail_url = item["url"]
        print(title, detail_url)
        detail_response = get_html(detail_url)
        # Everything up to "content" in the detail URL is the base used
        # to absolutize relative next-page links.
        anlise_detail(detail_response["msg"], detail_url.split("content")[0])