虎嗅网数据爬取(Scrapy Splash)

最新推荐文章于 2021-12-10 14:41:59 发布

风的芸芸

最新推荐文章于 2021-12-10 14:41:59 发布

阅读量1.7k

点赞数 2

分类专栏：爬虫系统文章标签： scrapy-爬虫

本文链接：https://blog.csdn.net/CFK0104/article/details/77715936

版权

爬虫系统专栏收录该内容

2 篇文章 0 订阅

订阅专栏

huxiu_spider.py

提取当前网页的所有article链接，通过解析artile内部的推荐文章（Splash动态爬取）进行跟踪爬取。

import logging
import scrapy
from coolscrapy.items import HuxiuItem
from scrapy_splash import SplashRequest
from scrapy.spider import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
from random import randint
class HuxiuSpider(CrawlSpider):
    name = "huxiu"
    start_urls = [
        "https://www.huxiu.com/article/212265.html",
        "https://www.huxiu.com/",
        "https://www.huxiu.com/channel/103.html",
        "https://www.huxiu.com/channel/22.html",
        "https://www.huxiu.com/channel/106.html",
        "https://www.huxiu.com/channel/104.html",
        "https://www.huxiu.com/channel/21.html",
        "https://www.huxiu.com/channel/105.html",
        "https://www.huxiu.com/channel/111.html",
        "https://www.huxiu.com/channel/102.html",
        "https://www.huxiu.com/channel/110.html",
        "https://www.huxiu.com/channel/2.html",
        "https://www.huxiu.com/channel/112.html",
        "https://www.huxiu.com/channel/107.html",
        "https://www.huxiu.com/channel/4.html",
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse,
                                 meta={
                                     'splash': {
                                         'endpoint': "render.html"
                                     }
                                 }
                                 )
    def parse(self, response):
        print("选取 =》", response)
        # 获取当前网页内部所有网页链接
        article_url = response.xpath("//a[starts-with(@href, '/article')]/@href").extract()

        if article_url is not None:
            for item_url in article_url:
                url = response.urljoin(item_url)
                print("网页 =》", url)
                yield scrapy.Request(url, callback=self.parse_article, errback=self.errback_httpbin,
                                     meta = {
                                         'splash': {
                                             'endpoint': "render.html"
                                         }
                                     })

    def parse_article(self, response):
        # 解析网页内容，提取相关推荐文章
        print("解析 =》", response)
        try:
            detail = response.xpath('//div[@class="article-wrap"]')
            item = HuxiuItem()
            item['title'] = "".join([item.extract() for item in detail.xpath('h1/text()')])
            item['link'] = response.url
            author_detail = detail.xpath("//div[@class='article-author']")[0]
            item["author_name"] = author_detail.xpath("//span[@class='author-name']/a/text()")[0].extract()
            item["author_link"] = author_detail.xpath("//span[@class='author-name']/a/@href")[0].extract()
            try:
                item["article_class"] = "|".join(
                    [item.extract() for item in author_detail.xpath("//a[@class='column-link']/text()")])
                item["article_class_url"] = "|".join(
                    [item.extract() for item in author_detail.xpath("//a[@class='column-link']/@href")])
            except:
                item["article_class"] = ""
                item["article_class_url"] = ""
            if item["author_link"].find("member") != -1:
                item["author_link"] = "https://www.huxiu.com" + item["author_link"]
            item['article_time'] = detail.xpath("//span[contains(@class, 'article-time')]/text()")[0].extract()
            item["article_share"] = detail.xpath("//span[contains(@class,'article-share')]/text()")[0].re("\d+")[0]
            item['article_comment'] = detail.xpath("//span[contains(@class, 'article-pl')]/text()")[0].re("\d+")[0]
            item["article_html"] = detail.xpath("div[contains(@id, 'article_content')]")[0].extract()
            try:
                item["article_text"] = " ".join(detail.xpath("div[contains(@id, 'article_content')]/*/text()").extract())
            except:
                item["article_text"] = " ".join(detail.xpath("div[contains(@id, 'article_content')]/text()").extract())

            relate_url = ["/article/%s.html" % item for item in response.xpath("//a[@class='js-related-article']/@data-id").extract()]

            if relate_url:
                for item_url in relate_url:
                    url = response.urljoin(item_url)
                    print("relate_url =》", url)
                    yield scrapy.Request(url, callback=self.parse, errback=self.errback_httpbin,
                                   meta={
                                       'splash': {
                                           'endpoint': "render.html"
                                       }
                                   })
            yield item
        except:
            pass

    def errback_httpbin(self, failure):
        # log all errback failures,
        # in case you want to do something special for some errors,
        # you may need the failure's type
        self.logger.error(repr(failure))

        # if isinstance(failure.value, HttpError):
        if failure.check(HttpError):
            # you can get the response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

        # elif isinstance(failure.value, DNSLookupError):
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

        # elif isinstance(failure.value, TimeoutError):
        elif failure.check(TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)