This post documents how to crawl the latest news releases from the National Bureau of Statistics (国家统计局) website with Scrapy.
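The code below assumes a standard Scrapy project named news; the imports (news.items, news.settings, news.captureImage) imply that layout, which `scrapy startproject news` would generate. Four files do the work: items.py defines the data model, statsNews.py is the spider for the "latest releases" list, captureImage.py screenshots articles hosted off-site (e.g. WeChat posts), and pipelines.py stores the results in MongoDB.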
items.py
import scrapy


class NewsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()          # title
    summary = scrapy.Field()        # summary
    origin = scrapy.Field()         # URL of the original article
    cover = scrapy.Field()          # cover image of the article
    author = scrapy.Field()         # author
    publish_date = scrapy.Field()   # publication date
    publish_time = scrapy.Field()   # publication date including time
    publish_from = scrapy.Field()   # source of the article
    category = scrapy.Field()       # category the article belongs to, e.g. report
    type = scrapy.Field()           # content type: image (IMG) or text (TXT)
    html = scrapy.Field()           # HTML content
    text = scrapy.Field()           # plain-text content
    image_urls = scrapy.Field()     # URLs of images in the article body
    images = scrapy.Field()         # local paths of the downloaded body images
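For the pieces to fit together, settings.py needs a few entries: the screenshot folder constants that captureImage.py imports, and the pipeline registration. The sketch below uses only documented Scrapy options, but the folder paths are placeholders and the ImagesPipeline entry is optional; it works out of the box here only because image_urls/images are that pipeline's default field names.

# settings.py (sketch — paths are placeholders, adjust to your machine)
BOT_NAME = 'news'
SPIDER_MODULES = ['news.spiders']
NEWSPIDER_MODULE = 'news.spiders'

# Screenshot folders imported by captureImage.py
WINDOWS_IMG_FOLDER = 'C:\\news_img\\'
LINUX_IMG_FOLDER = '/data/news_img/'

ITEM_PIPELINES = {
    # optional: the built-in ImagesPipeline reads image_urls and fills images
    'scrapy.pipelines.images.ImagesPipeline': 100,
    'news.pipelines.NewsPipeline': 300,
}
IMAGES_STORE = LINUX_IMG_FOLDER  # where ImagesPipeline saves downloads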
statsNews.py
# -*- coding: utf-8 -*-
import re
import scrapy
import logging
import datetime
from news.items import NewsItem
from news.captureImage import webshot

# Configure the log output format
logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)-15s] [%(levelname)8s] [%(name)10s ] - %(message)s (%(filename)s:%(lineno)s)',
                    datefmt='%Y-%m-%d %H:%M:%S'
                    )
logger = logging.getLogger(__name__)


class StatsNewsSpider(scrapy.Spider):
    name = 'statsNews'
    allowed_domains = ['www.stats.gov.cn']
    start_urls = ['http://www.stats.gov.cn/tjsj/zxfb/index.html']
    current_page = 1
    yesterday = str(datetime.date.today() + datetime.timedelta(-1))
    has_next = True

    def parse(self, response):  # extract data from the "latest releases" list page
        logger.info('Current page: ' + str(self.current_page))
        self.current_page = self.current_page + 1
        news_list = response.xpath("//ul[@class='center_list_contlist']/li/a")
        for news in news_list:
            ahref = news.xpath("./@href").extract_first()
            if ahref is not None:
                publish_date = news.xpath("./node()//font[@class='cont_tit02']/text()").extract_first()
                # Only crawl news published yesterday. To crawl a specific day,
                # compare against e.g. "2020-03-13" instead; for the very first
                # run, use `if True:` to crawl everything.
                if publish_date.strip() == self.yesterday:
                    item = NewsItem()
                    item['category'] = '报告'  # category label: "report"
                    item['title'] = news.xpath("./node()//font[@class='cont_tit03']/text()").extract_first()
                    item['publish_date'] = publish_date
                    if ahref.startswith("/"):
                        item['origin'] = 'http://www.stats.gov.cn' + ahref
                        item['type'] = 'TXT'
                        # Fetch the detail page
                        yield scrapy.Request(
                            item['origin'],
                            callback=self.parse_detail,
                            meta={'item': item})
                    elif ahref.startswith("./"):
                        item['origin'] = ahref.replace('./', 'http://www.stats.gov.cn/tjsj/zxfb/', 1)
                        item['type'] = 'TXT'
                        # Fetch the detail page
                        yield scrapy.Request(
                            item['origin'],
                            callback=self.parse_detail,
                            meta={'item': item})
                    else:
                        # Off-site link (WeChat article): save it as a screenshot
                        item['origin'] = ahref
                        item['type'] = 'IMG'
                        item['publish_from'] = '微信公众号'  # "WeChat official account"
                        picname = webshot(ahref)
                        item['html'] = picname
                        item['text'] = picname
                        item['publish_time'] = item['publish_date'] + ' 09:30'
                        yield item
                else:
                    # The list is ordered by date, so once we pass yesterday
                    # there is nothing newer on later pages
                    self.has_next = False
                    break
        if self.has_next:
            # Get the URL of the next page ('下一页' = "next page")
            next_url = response.xpath("//dl[@class='fenye']//a[text()='下一页']/@href").extract_first()
            if next_url is not None:
                next_url = 'http://www.stats.gov.cn/tjsj/zxfb/' + next_url
                yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):  # extract data from the detail page
        item = response.meta['item']
        item['publish_from'] = response.xpath("//font[@class='xilan_titf']/font/font/text()").extract_first()
        time_str = response.xpath("//font[@class='xilan_titf']/font/child::text()[last()]").extract_first()
        # Strip the '发布时间:' ("publish time") label and non-breaking spaces
        time_str = time_str.strip().replace('发布时间:', '').replace('\xa0', ' ')
        item['publish_time'] = time_str
        news_html = response.xpath("//div[@class='xilan_con']")[0]
        texts = news_html.xpath("normalize-space(string(.))").extract()
        for i in range(0, len(texts)):
            texts[i] = re.sub(r'\s', ' ', texts[i])
        item['text'] = ''.join(texts)
        logger.info(item['text'])
        # Rewrite relative image paths in the HTML to absolute URLs
        nPos = item['origin'].rindex("/")
        prefix = item['origin'][0:nPos + 1]
        html = news_html.extract().replace('src="./', 'src="' + prefix)
        item['html'] = html
        # with open('C:\\Users\\pjli\\Desktop\\pyworks\\{}.txt'.format(item['title']), 'w', encoding='utf-8') as f:
        #     f.write(html)
        yield item
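The spider is started from the project root with the standard `scrapy crawl statsNews`. Alternatively, a small entry script makes it runnable as a plain Python program; a minimal sketch, assuming the spider file lives at news/spiders/statsNews.py:

# run.py — optional launcher, so `python run.py` starts the crawl
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from news.spiders.statsNews import StatsNewsSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(StatsNewsSpider)
process.start()  # blocks until the crawl finishes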
captureImage.py
from selenium import webdriver
from news.settings import WINDOWS_IMG_FOLDER, LINUX_IMG_FOLDER
import time
import hashlib


def webshot(url):  # some special articles have to be saved as a screenshot
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)
    driver.maximize_window()
    # JS snippet that returns the height of the page body
    js_height = "return document.body.clientHeight"
    # Name the screenshot after the SHA-1 hash of the URL
    sha = hashlib.sha1(url.encode('utf-8'))
    picname = sha.hexdigest() + ".png"
    try:
        driver.get(url)
        k = 1
        height = driver.execute_script(js_height)
        # Scroll down in 500px steps so lazy-loaded content gets rendered
        while True:
            if k * 500 < height:
                js_move = "window.scrollTo(0,{})".format(k * 500)
                # print(js_move)
                driver.execute_script(js_move)
                time.sleep(0.2)
                height = driver.execute_script(js_height)
                k += 1
            else:
                break
        # Resize the window to the full page, then take a single screenshot
        scroll_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        driver.set_window_size(scroll_width, scroll_height)
        driver.get_screenshot_as_file(WINDOWS_IMG_FOLDER + picname)
        return picname
    except Exception as e:
        print(picname, e)
    finally:
        driver.quit()  # always release the browser, even on failure


if __name__ == '__main__':
    webshot("https://mp.weixin.qq.com/s/C9dIWk9jRGcmrq-aqzxusw")
pipelines.py
# -*- coding: utf-8 -*-
from pymongo import MongoClient
from news.items import NewsItem


class NewsPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        return cls()

    def open_spider(self, spider):
        # Database.authenticate() was removed in PyMongo 4; pass the
        # credentials to MongoClient instead (authSource is the auth database)
        self.client = MongoClient(host='172.16.250.238', port=27017,
                                  username='bigdata', password='bigdata',
                                  authSource='test')
        self.db = self.client['test']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection = self.db['statsNews']
        if isinstance(item, NewsItem):
            collection.insert_one(dict(item))
        # with open('C:\\Users\\pjli\\Desktop\\pyworks\\news.txt', 'w', encoding='utf-8') as f:
        #     json.dump(dict(item), f, ensure_ascii=False, indent=2)
        return item  # pipelines must return the item for downstream stages
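After a crawl, the stored documents can be inspected from a Python shell. A minimal sketch reusing the pipeline's connection parameters:

from pymongo import MongoClient

client = MongoClient(host='172.16.250.238', port=27017,
                     username='bigdata', password='bigdata', authSource='test')
collection = client['test']['statsNews']

print(collection.count_documents({}))  # number of stored articles
for doc in collection.find().sort('publish_time', -1).limit(3):
    print(doc['publish_time'], doc['title'])  # three most recent headlines
client.close()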