# 崔大大的 gerapy_auto_extractor库练习
# 利用gerapy_auto_extractor库 高效通杀大部分新闻页面
# gerapy_auto_extractor 可以提取大部分新闻列表页中的详情页url,并且 可以提取详情页中的 标题,时间,文章内容等数据。 不过无法提取文章中的图片url。如果需要提取文章中图片的url, 可以将html源码中所有图片的url提取出来, 并将原来的img标签及信息替换掉
# Install: pip install gerapy-auto-extractor
from gerapy_auto_extractor import extract_list, extract_detail
from six.moves.urllib.parse import urljoin
import re
import requests
def handling_garbled(response):
    """Fix a mis-detected response encoding in place and return the response.

    requests falls back to ISO-8859-1 when the server declares no charset.
    In that case, prefer a charset declared inside the page content; if the
    page declares none either, use the encoding sniffed from the body bytes.
    Any other reported encoding is trusted as-is.
    """
    if response.encoding == 'ISO-8859-1':
        declared = requests.utils.get_encodings_from_content(response.text)
        response.encoding = declared[0] if declared else response.apparent_encoding
    return response
class Spider:
    """Crawl a news list page and extract detail-page data with gerapy_auto_extractor."""

    def __init__(self, list_url):
        # Entry list page whose detail links will be crawled.
        self.list_url = list_url

    def get_html(self, url, headers=None) -> str:
        """Fetch *url* and return its HTML text with the encoding corrected.

        ``headers`` is an optional dict of extra request headers. Fixes two
        defects: the original used a mutable ``{}`` default, and accepted
        ``headers`` without ever forwarding it to ``requests.get``.
        """
        response = handling_garbled(requests.get(url, headers=headers))
        return response.text

    def clean_html(self, html, url):
        """Replace every ``<img ...>`` tag in *html* with 【IMG】absolute-src【/IMG】.

        *url* is the page URL used to resolve relative src values.

        Fixes over the original:
        - matches plain ``<img ...>`` tags as well as self-closing ``<img .../>``;
        - resolves both single- and double-quoted src values against *url*
          (the original's fallback made single-quoted srcs collapse to the
          page URL, because ``urljoin(url, '')`` is truthy);
        - tags without a src attribute are left untouched instead of raising
          IndexError.
        """
        for tag in re.findall(r'<img[^>]*?/?>', html):
            # Backreference \1 matches the same quote style that opened the value.
            m = re.search(r'src=(["\'])(.*?)\1', tag)
            if not m:
                continue  # no src attribute: leave the tag as-is
            img_url = urljoin(url, m.group(2))
            html = html.replace(tag, "【IMG】" + img_url + "【/IMG】", 1)
        return html

    def get_detail_urls(self, list_url) -> list:
        """Extract absolute detail-page URLs from the list page at *list_url*."""
        list_html = self.get_html(list_url)
        return [urljoin(list_url, item["url"]) for item in extract_list(list_html)]

    def extract_detail(self, detail_url):
        """Download one detail page and return its extracted fields as a dict.

        NOTE: this method shadows the imported module-level ``extract_detail``;
        the bare call below still resolves to the imported function, not to
        this method, because the method name is only a class attribute.
        """
        detail_html = self.get_html(detail_url)
        detail_html = self.clean_html(detail_html, detail_url)
        detail_data = extract_detail(detail_html)
        detail_data["url"] = detail_url
        return detail_data

    def run(self):
        """Crawl every detail page linked from ``self.list_url``.

        Returns a list of detail dicts, skipping pages whose extracted
        ``content`` is empty.
        """
        data = []
        for url in self.get_detail_urls(self.list_url):
            detail_data = self.extract_detail(url)
            if detail_data["content"]:  # drop pages with no extracted body
                data.append(detail_data)
        return data
if __name__ == "__main__":
    # Demo crawl of the National Library of China announcement list.
    # The guard prevents a network crawl from firing on a mere import.
    spider = Spider("http://www.nlc.cn/dsb_zx/zxgg/")
    print(spider.run())