Scraping Huxiu.com data (Scrapy Splash)

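This spider relies on scrapy-splash to render JavaScript, so the project settings need the usual scrapy-splash wiring first. The snippet below is a minimal sketch following the scrapy-splash README; SPLASH_URL is an assumption about where your Splash instance is listening.

settings.py

# Address of the Splash rendering service (assumption: a local Docker container)
SPLASH_URL = 'http://localhost:8050'

# Middlewares required by scrapy-splash, with the priorities its README recommends
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Make request deduplication and the HTTP cache aware of Splash arguments
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
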
huxiu_spider.py

Extract all article links from the current page, then continue the crawl by parsing the recommended articles inside each article page (fetched dynamically through Splash).

import scrapy
from coolscrapy.items import HuxiuItem
from scrapy_splash import SplashRequest
from scrapy.spiders import CrawlSpider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError


class HuxiuSpider(CrawlSpider):
    name = "huxiu"
    start_urls = [
        "https://www.huxiu.com/article/212265.html",
        "https://www.huxiu.com/",
        "https://www.huxiu.com/channel/103.html",
        "https://www.huxiu.com/channel/22.html",
        "https://www.huxiu.com/channel/106.html",
        "https://www.huxiu.com/channel/104.html",
        "https://www.huxiu.com/channel/21.html",
        "https://www.huxiu.com/channel/105.html",
        "https://www.huxiu.com/channel/111.html",
        "https://www.huxiu.com/channel/102.html",
        "https://www.huxiu.com/channel/110.html",
        "https://www.huxiu.com/channel/2.html",
        "https://www.huxiu.com/channel/112.html",
        "https://www.huxiu.com/channel/107.html",
        "https://www.huxiu.com/channel/4.html",
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse,
                                 meta={
                                     'splash': {
                                         'endpoint': "render.html"
                                     }
                                 }
                                 )
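
    # Note: scrapy_splash.SplashRequest (imported above) wraps this splash meta
    # plumbing; a roughly equivalent form of the request above would be:
    #     yield SplashRequest(url, self.parse, endpoint='render.html')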

    def parse(self, response):
        print("selected =>", response)
        # Collect every in-page link that points at an article page
        article_url = response.xpath("//a[starts-with(@href, '/article')]/@href").extract()

        if article_url:
            for item_url in article_url:
                url = response.urljoin(item_url)
                print("page =>", url)
                yield scrapy.Request(url, callback=self.parse_article, errback=self.errback_httpbin,
                                     meta={
                                         'splash': {
                                             'endpoint': "render.html"
                                         }
                                     })

    def parse_article(self, response):
        # Parse the article page and pull out the related-article recommendations
        print("parsing =>", response)
        try:
            detail = response.xpath('//div[@class="article-wrap"]')
            item = HuxiuItem()
            item['title'] = "".join(detail.xpath('h1/text()').extract())
            item['link'] = response.url
            author_detail = detail.xpath("//div[@class='article-author']")[0]
            item["author_name"] = author_detail.xpath("//span[@class='author-name']/a/text()")[0].extract()
            item["author_link"] = author_detail.xpath("//span[@class='author-name']/a/@href")[0].extract()
            try:
                item["article_class"] = "|".join(
                    author_detail.xpath("//a[@class='column-link']/text()").extract())
                item["article_class_url"] = "|".join(
                    author_detail.xpath("//a[@class='column-link']/@href").extract())
            except Exception:
                item["article_class"] = ""
                item["article_class_url"] = ""
            if item["author_link"].find("member") != -1:
                item["author_link"] = "https://www.huxiu.com" + item["author_link"]
            item['article_time'] = detail.xpath("//span[contains(@class, 'article-time')]/text()")[0].extract()
            item["article_share"] = detail.xpath("//span[contains(@class,'article-share')]/text()")[0].re("\d+")[0]
            item['article_comment'] = detail.xpath("//span[contains(@class, 'article-pl')]/text()")[0].re("\d+")[0]
            item["article_html"] = detail.xpath("div[contains(@id, 'article_content')]")[0].extract()
            try:
                item["article_text"] = " ".join(detail.xpath("div[contains(@id, 'article_content')]/*/text()").extract())
            except Exception:
                item["article_text"] = " ".join(detail.xpath("div[contains(@id, 'article_content')]/text()").extract())

            relate_url = ["/article/%s.html" % data_id
                          for data_id in response.xpath("//a[@class='js-related-article']/@data-id").extract()]

            if relate_url:
                for item_url in relate_url:
                    url = response.urljoin(item_url)
                    print("relate_url =>", url)
                    yield scrapy.Request(url, callback=self.parse, errback=self.errback_httpbin,
                                         meta={
                                             'splash': {
                                                 'endpoint': "render.html"
                                             }
                                         })
            yield item
        except Exception:
            # Log parse failures instead of silently dropping the page
            self.logger.exception("Failed to parse article %s", response.url)

    def errback_httpbin(self, failure):
        # log all errback failures,
        # in case you want to do something special for some errors,
        # you may need the failure's type
        self.logger.error(repr(failure))

        # if isinstance(failure.value, HttpError):
        if failure.check(HttpError):
            # you can get the response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)

        # elif isinstance(failure.value, DNSLookupError):
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)

        # elif isinstance(failure.value, TimeoutError):
        elif failure.check(TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
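
The HuxiuItem class is imported from coolscrapy.items, which is not listed in this post. Judging from the fields the spider fills in, a matching item definition would look roughly like the sketch below; the field names are inferred from the spider and the real module may define more.

items.py

import scrapy


class HuxiuItem(scrapy.Item):
    # Fields inferred from the assignments in huxiu_spider.py
    title = scrapy.Field()
    link = scrapy.Field()
    author_name = scrapy.Field()
    author_link = scrapy.Field()
    article_class = scrapy.Field()
    article_class_url = scrapy.Field()
    article_time = scrapy.Field()
    article_share = scrapy.Field()
    article_comment = scrapy.Field()
    article_html = scrapy.Field()
    article_text = scrapy.Field()

With a Splash instance running (for example: docker run -p 8050:8050 scrapinghub/splash), the crawl can be started with scrapy crawl huxiu.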