huxiu_spider.py
"""Extract every article link from the current page, then follow-crawl by
parsing each article's recommended articles (rendered dynamically via
Splash)."""
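
# This spider relies on scrapy-splash being wired into the project. A minimal
# settings.py sketch, per the scrapy-splash README (the SPLASH_URL value is an
# assumption -- point it at your own Splash instance):
#
#     SPLASH_URL = "http://localhost:8050"
#     DOWNLOADER_MIDDLEWARES = {
#         "scrapy_splash.SplashCookiesMiddleware": 723,
#         "scrapy_splash.SplashMiddleware": 725,
#         "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
#     }
#     SPIDER_MIDDLEWARES = {
#         "scrapy_splash.SplashDeduplicateArgsMiddleware": 100,
#     }
#     DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"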
import scrapy
from coolscrapy.items import HuxiuItem
from scrapy_splash import SplashRequest
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError
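
# HuxiuItem is defined in coolscrapy/items.py; its exact definition is an
# assumption here, but from the fields used below it must look roughly like:
#
#     class HuxiuItem(scrapy.Item):
#         title = scrapy.Field()
#         link = scrapy.Field()
#         author_name = scrapy.Field()
#         author_link = scrapy.Field()
#         article_class = scrapy.Field()
#         article_class_url = scrapy.Field()
#         article_time = scrapy.Field()
#         article_share = scrapy.Field()
#         article_comment = scrapy.Field()
#         article_html = scrapy.Field()
#         article_text = scrapy.Field()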
class HuxiuSpider(scrapy.Spider):
    # Plain scrapy.Spider rather than CrawlSpider: parse() is overridden and
    # no Rules are defined, so the CrawlSpider machinery would never be used.
name = "huxiu"
start_urls = [
"https://www.huxiu.com/article/212265.html",
"https://www.huxiu.com/",
"https://www.huxiu.com/channel/103.html",
"https://www.huxiu.com/channel/22.html",
"https://www.huxiu.com/channel/106.html",
"https://www.huxiu.com/channel/104.html",
"https://www.huxiu.com/channel/21.html",
"https://www.huxiu.com/channel/105.html",
"https://www.huxiu.com/channel/111.html",
"https://www.huxiu.com/channel/102.html",
"https://www.huxiu.com/channel/110.html",
"https://www.huxiu.com/channel/2.html",
"https://www.huxiu.com/channel/112.html",
"https://www.huxiu.com/channel/107.html",
"https://www.huxiu.com/channel/4.html",
]
    def start_requests(self):
        # Render every start URL through Splash so JS-injected links are
        # present in the response HTML; SplashRequest is the scrapy-splash
        # shortcut for setting meta['splash'] by hand.
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, endpoint="render.html")
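    # Slow pages can be given extra render time via Splash arguments, e.g.
    # SplashRequest(url, self.parse, endpoint="render.html",
    # args={"wait": 0.5}); "wait" is a standard render.html parameter.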
    def parse(self, response):
        # Collect every article link on the current listing page and hand
        # each one to parse_article.
        self.logger.info("Listing page: %s", response.url)
        article_urls = response.xpath("//a[starts-with(@href, '/article')]/@href").extract()
        # extract() returns a list (possibly empty), never None.
        for item_url in article_urls:
            url = response.urljoin(item_url)
            self.logger.info("Article link: %s", url)
            yield SplashRequest(url, callback=self.parse_article,
                                errback=self.errback_httpbin,
                                endpoint="render.html")
    def parse_article(self, response):
        # Parse one article page: extract its fields, then follow the
        # recommended articles it links to.
        self.logger.info("Parsing article: %s", response.url)
        try:
            detail = response.xpath('//div[@class="article-wrap"]')
            item = HuxiuItem()
            item['title'] = "".join(detail.xpath('h1/text()').extract())
            item['link'] = response.url
            # ".//" keeps these lookups scoped to the article block instead
            # of searching the whole document.
            author_detail = detail.xpath(".//div[@class='article-author']")[0]
            item["author_name"] = author_detail.xpath(".//span[@class='author-name']/a/text()")[0].extract()
            item["author_link"] = author_detail.xpath(".//span[@class='author-name']/a/@href")[0].extract()
            # Column (channel) tags are optional; joining an empty selector
            # list simply yields "", so no try/except is needed here.
            item["article_class"] = "|".join(author_detail.xpath(".//a[@class='column-link']/text()").extract())
            item["article_class_url"] = "|".join(author_detail.xpath(".//a[@class='column-link']/@href").extract())
            # Member profile links are site-relative and need the host added.
            if "member" in item["author_link"]:
                item["author_link"] = "https://www.huxiu.com" + item["author_link"]
            item['article_time'] = detail.xpath(".//span[contains(@class, 'article-time')]/text()")[0].extract()
            item["article_share"] = detail.xpath(".//span[contains(@class, 'article-share')]/text()")[0].re(r"\d+")[0]
            item['article_comment'] = detail.xpath(".//span[contains(@class, 'article-pl')]/text()")[0].re(r"\d+")[0]
            item["article_html"] = detail.xpath("div[contains(@id, 'article_content')]")[0].extract()
            # extract() never raises, so the original try/except fallback was
            # dead code; fall back explicitly from child-element text to
            # direct text nodes.
            text_parts = detail.xpath("div[contains(@id, 'article_content')]/*/text()").extract()
            if not text_parts:
                text_parts = detail.xpath("div[contains(@id, 'article_content')]/text()").extract()
            item["article_text"] = " ".join(text_parts)
            # Follow-crawl: queue every recommended article for parse_article
            # so the recommendations themselves get scraped and followed.
            relate_urls = ["/article/%s.html" % article_id
                           for article_id in response.xpath("//a[@class='js-related-article']/@data-id").extract()]
            for item_url in relate_urls:
                url = response.urljoin(item_url)
                self.logger.info("Related article: %s", url)
                yield SplashRequest(url, callback=self.parse_article,
                                    errback=self.errback_httpbin,
                                    endpoint="render.html")
yield item
        except Exception:
            # Log parse failures instead of silently swallowing them.
            self.logger.exception("Failed to parse %s", response.url)
def errback_httpbin(self, failure):
        # Log every errback failure; check the failure's type when an error
        # class needs special handling.
self.logger.error(repr(failure))
if failure.check(HttpError):
# you can get the response
response = failure.value.response
self.logger.error('HttpError on %s', response.url)
elif failure.check(DNSLookupError):
# this is the original request
request = failure.request
self.logger.error('DNSLookupError on %s', request.url)
elif failure.check(TimeoutError):
request = failure.request
self.logger.error('TimeoutError on %s', request.url)
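
# Usage sketch (assumes a Splash instance is reachable, e.g. started with
# "docker run -p 8050:8050 scrapinghub/splash"):
#
#     scrapy crawl huxiu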