#!/usr/bin/python3
# -*- coding:utf-8 -*-
# 支持所有搜索结果内容提取 (supports content extraction for all search results)
import requests
import re
import json
from lxml import etree
from sdk.utils.util_decorate import retry
@retry(retry=3, sleep=5)
def get_html(url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Returns the literal string "ERROR" on a non-200 status code.

    NOTE(review): every caller indexes the result with ["msg"], so the
    @retry decorator presumably wraps the return value in a dict such as
    {"msg": ...} — confirm against sdk.utils.util_decorate.

    Args:
        url: absolute URL to request.
        timeout: seconds before the request is aborted. New, defaulted
            parameter — previously the request had NO timeout and could
            block forever on a stalled connection, which would also keep
            @retry from ever re-attempting.
    """
    response = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding; the site may not declare a charset header.
    response.encoding = "utf-8"
    if response.status_code == 200:
        return response.text
    print(response.status_code)
    return "ERROR"
# Whitespace removed from extracted text: CR, LF, TAB, plus an ASCII
# space and a non-ASCII space variant (the last two alternatives).
# Compiled once instead of re-parsing the pattern on every call.
_WS_RE = re.compile("\\r|\\n|\\t| | ")


def get_text(text):
    """Collapse whitespace out of an lxml ``xpath()`` extraction result.

    Args:
        text: a single string, or a list of strings as returned by
            ``xpath('...//text()')``.

    Returns:
        The cleaned string (list items are cleaned then concatenated).
        Any other input type returns "" — previously the function fell
        through and returned None implicitly, which callers would print
        or concatenate as the text "None".
    """
    if isinstance(text, str):
        return _WS_RE.sub("", text).strip(" ")
    if isinstance(text, list):
        return "".join(_WS_RE.sub("", i).strip(" ") for i in text)
    return ""
def anlise_detail(detail_html, url_head):
    """Print title, metadata, body text and images of one article page.

    Follows the pager recursively while its last anchor reads "下一页",
    resolving relative links against *url_head*.

    Args:
        detail_html: raw HTML of an article detail page.
        url_head: URL prefix used to absolutize the next-page href.
    """
    doc = etree.HTML(detail_html)
    for article in doc.xpath('//div[@class="container-left2"]'):
        print("标题", get_text(article.xpath('.//h1[@class="dabiaoti"]/text()')))
        print("文章信息", get_text(article.xpath('.//div[@class="fenx"]//text()')))
        print("content", get_text(article.xpath('.//div[@id="Content"]//p/text()')))
        # Image captions sit in blue <font> tags; pair them positionally
        # with the <img> sources found in the article body.
        captions = article.xpath('.//font[@color="blue"]/text()')
        sources = []
        for src in article.xpath('.//div[@id="Content"]//img/@src'):
            if src.startswith("https://"):
                sources.append(src)
            else:
                # Relative paths climb two levels; rebase onto the site root.
                sources.append(src.replace("../../", "https://world.chinadaily.com.cn/"))
        if sources:
            for caption, src in zip(captions, sources):
                print(caption, src)
    # Pagination: the pager's last anchor reads "下一页" when more pages exist.
    pager_text = doc.xpath('//div[@id="div_currpage"]/a[last()]/text()')
    if pager_text and pager_text[0] == "下一页":
        next_page_url = url_head + doc.xpath('//div[@id="div_currpage"]/a[last()]/@href')[0]
        print("next_page_url", next_page_url)
        next_detail_page = get_html(next_page_url)["msg"]
        if next_detail_page != "ERROR":
            anlise_detail(next_detail_page, url_head)
if __name__ == '__main__':
    # Search the China Daily news API for one keyword / one result page,
    # then walk every hit's detail page.
    keywords, page = "女权", 1
    search_url = "https://newssearch.chinadaily.com.cn/rest/cn/search?keywords={}&page={}".format(keywords, page)
    search_response = get_html(search_url)
    results = json.loads(search_response["msg"])
    for item in results["content"]:
        title = item["title"]
        keywords = item["keywords"]
        source = item["source"]
        detail_url = item["url"]
        print(title, detail_url)
        detail_response = get_html(detail_url)
        # Everything up to "content" in the detail URL is the base used
        # to absolutize relative next-page links.
        anlise_detail(detail_response["msg"], detail_url.split("content")[0])