This post documents how to crawl the latest news releases from the National Bureau of Statistics (国家统计局) website with Scrapy.
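The code below assumes a standard Scrapy project named news; the imports (news.items, news.settings, news.captureImage) imply that layout, which `scrapy startproject news` would generate. Four files do the work: items.py defines the data model, statsNews.py is the spider for the "latest releases" list, captureImage.py screenshots articles hosted off-site (e.g. WeChat posts), and pipelines.py stores the results in MongoDB.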
items.py
import scrapy


class NewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()          # title
    summary = scrapy.Field()        # summary
    origin = scrapy.Field()         # URL of the original article
    cover = scrapy.Field()          # cover image of the article
    author = scrapy.Field()         # author
    publish_date = scrapy.Field()   # publication date
    publish_time = scrapy.Field()   # publication date including time
    publish_from = scrapy.Field()   # source of the article
    category = scrapy.Field()       # category the article belongs to, e.g. report
    type = scrapy.Field()           # content type: image (IMG) or text (TXT)
    html = scrapy.Field()           # HTML content
    text = scrapy.Field()           # plain-text content
    image_urls = scrapy.Field()     # URLs of images in the article body
    images = scrapy.Field()         # local paths of the downloaded body images
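For the pieces to fit together, settings.py needs a few entries: the screenshot folder constants that captureImage.py imports, and the pipeline registration. The sketch below uses only documented Scrapy options, but the folder paths are placeholders and the ImagesPipeline entry is optional; it works out of the box here only because image_urls/images are that pipeline's default field names.

# settings.py (sketch — paths are placeholders, adjust to your machine)
BOT_NAME = 'news'
SPIDER_MODULES = ['news.spiders']
NEWSPIDER_MODULE = 'news.spiders'

# Screenshot folders imported by captureImage.py
WINDOWS_IMG_FOLDER = 'C:\\news_img\\'
LINUX_IMG_FOLDER = '/data/news_img/'

ITEM_PIPELINES = {
    # optional: the built-in ImagesPipeline reads image_urls and fills images
    'scrapy.pipelines.images.ImagesPipeline': 100,
    'news.pipelines.NewsPipeline': 300,
}
IMAGES_STORE = LINUX_IMG_FOLDER  # where ImagesPipeline saves downloads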
statsNews.py
# -*- coding: utf-8 -*-
import re
import scrapy
import logging
import datetime
from news.items import NewsItem
from news.captureImage import webshot

# Configure the log output format
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %H:%M:%S'
                    )
logger = logging.getLogger(__name__)


class StatsNewsSpider(scrapy.Spider):
    name = 'statsNews'
    allowed_domains = ['www.stats.gov.cn']
    start_urls = ['http://www.stats.gov.cn/tjsj/zxfb/index.html']
    current_page = 1
    yesterday = str(datetime.date.today() + datetime.timedelta(-1))
    has_next = True

    def parse(self, response):  # extract data from the "latest releases" list page
        logger.info('Current page: ' + str(self.current_page))
        self.current_page = self.current_page + 1
        news_list = response.xpath("//ul[@class='center_list_contlist']/li/a")
        for news in news_list:
            ahref = news.xpath("./@href").extract_first()
            if ahref is not None:
                publish_date = news.xpath("./node()//font[@class='cont_tit02']/text()").extract_first()
                # Only crawl news published yesterday. To crawl a specific day,
                # compare against e.g. "2020-03-13" instead; for the very first
                # run, use `if True:` to crawl everything.
                if publish_date.strip() == self.yesterday:
                    item = NewsItem()
                    item['category'] = '报告'  # category label: "report"
                    item['title'] = news.xpath("./node()//font[@class='cont_tit03']/text()").extract_first()
                    item['publish_date'] = publish_date
                    if ahref.startswith("/"):
                        item['origin'] = 'http://www.stats.gov.cn' + ahref
                        item['type'] = 'TXT'
                        # Fetch the detail page
                        yield scrapy.Request(
                            item['origin'],
                            callback=self.parse_detail,
                            meta={'item': item})
                    elif ahref.startswith("./"):
                        item['origin'] = ahref.replace('./', 'http://www.stats.gov.cn/tjsj/zxfb/', 1)
                        item['type'] = 'TXT'
                        # Fetch the detail page
                        yield scrapy.Request(
                            item['origin'],
                            callback=self.parse_detail,
                            meta={'item': item})
                    else:
                        # Off-site link (WeChat article): save it as a screenshot
                        item['origin'] = ahref
                        item['type'] = 'IMG'
                        item['publish_from'] = '微信公众号'  # "WeChat official account"
                        picname = webshot(ahref)
                        item['html'] = picname
                        item['text'] = picname
                        item['publish_time'] = item['publish_date'] + ' 09:30'
                        yield item
                else:
                    # The list is ordered by date, so once we pass yesterday
                    # there is nothing newer on later pages
                    self.has_next = False
                    break
        if self.has_next:
            # Get the URL of the next page ('下一页' = "next page")
            next_url = response.xpath("//dl[@class='fenye']//a[text()='下一页']/@href").extract_first()
            if next_url is not None:
                next_url = 'http://www.stats.gov.cn/tjsj/zxfb/' + next_url
                yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):  # extract data from the detail page
        item = response.meta['item']
        item['publish_from'] = response.xpath("//font[@class='xilan_titf']/font/font/text()").extract_first()
        time_str = response.xpath("//font[@class='xilan_titf']/font/child::text()[last()]").extract_first()
        # Strip the '发布时间:' ("publish time") label and non-breaking spaces
        time_str = time_str.strip().replace('发布时间:', '').replace('\xa0', ' ')
        item['publish_time'] = time_str
        news_html = response.xpath("//div[@class='xilan_con']")[0]
        texts = news_html.xpath("normalize-space(string(.))").extract()
        for i in range(0, len(texts)):
            texts[i] = re.sub(r'\s', ' ', texts[i])
        item['text'] = ''.join(texts)
        logger.info(item['text'])
        # Rewrite relative image paths in the HTML to absolute URLs
        nPos = item['origin'].rindex("/")
        prefix = item['origin'][0:nPos + 1]
        html = news_html.extract().replace('src="./', 'src="' + prefix)
        item['html'] = html
        # with open('C:\\Users\\pjli\\Desktop\\pyworks\\{}.txt'.format(item['title']), 'w', encoding='utf-8') as f:
        #     f.write(html)
        yield item
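The spider is started from the project root with the standard `scrapy crawl statsNews`. Alternatively, a small entry script makes it runnable as a plain Python program; a minimal sketch, assuming the spider file lives at news/spiders/statsNews.py:

# run.py — optional launcher, so `python run.py` starts the crawl
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from news.spiders.statsNews import StatsNewsSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(StatsNewsSpider)
process.start()  # blocks until the crawl finishes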
captureImage.py
from selenium import webdriver
from news.settings import WINDOWS_IMG_FOLDER, LINUX_IMG_FOLDER
import time
import hashlib


def webshot(url):  # some special articles have to be saved as a screenshot
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)
    driver.maximize_window()
    # JS snippet that returns the height of the page body
    js_height = "return document.body.clientHeight"
    # Name the screenshot after the SHA-1 hash of the URL
    sha = hashlib.sha1(url.encode('utf-8'))
    picname = sha.hexdigest() + ".png"
    try:
        driver.get(url)
        k = 1
        height = driver.execute_script(js_height)
        # Scroll down in 500px steps so lazy-loaded content gets rendered
        while True:
            if k * 500 < height:
                js_move = "window.scrollTo(0,{})".format(k * 500)
                # print(js_move)
                driver.execute_script(js_move)
                time.sleep(0.2)
                height = driver.execute_script(js_height)
                k += 1
            else:
                break
        # Resize the window to the full page, then take a single screenshot
        scroll_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        driver.set_window_size(scroll_width, scroll_height)
        driver.get_screenshot_as_file(WINDOWS_IMG_FOLDER + picname)
        return picname
    except Exception as e:
        print(picname, e)
    finally:
        driver.quit()  # always release the browser, even on failure


if __name__ == '__main__':
    webshot("https://mp.weixin.qq.com/s/C9dIWk9jRGcmrq-aqzxusw")
pipelines.py
# -*- coding: utf-8 -*-
from pymongo import MongoClient
from news.items import NewsItem


class NewsPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def open_spider(self, spider):
        # Database.authenticate() was removed in PyMongo 4; pass the
        # credentials to MongoClient instead (authSource is the auth database)
        self.client = MongoClient(host='172.16.250.238', port=27017,
                                  username='bigdata', password='bigdata',
                                  authSource='test')
        self.db = self.client['test']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection = self.db['statsNews']
        if isinstance(item, NewsItem):
            collection.insert_one(dict(item))
        # with open('C:\\Users\\pjli\\Desktop\\pyworks\\news.txt', 'w', encoding='utf-8') as f:
        #     json.dump(dict(item), f, ensure_ascii=False, indent=2)
        return item  # pipelines must return the item for downstream stages
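After a crawl, the stored documents can be inspected from a Python shell. A minimal sketch reusing the pipeline's connection parameters:

from pymongo import MongoClient

client = MongoClient(host='172.16.250.238', port=27017,
                     username='bigdata', password='bigdata', authSource='test')
collection = client['test']['statsNews']

print(collection.count_documents({}))  # number of stored articles
for doc in collection.find().sort('publish_time', -1).limit(3):
    print(doc['publish_time'], doc['title'])  # three most recent headlines
client.close()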