First, create the project (named DushuPro here so it matches the import used later in the spider). Open cmd in the folder where the project should live and run:
scrapy startproject DushuPro
Then enter the project directory and generate the spider from the crawl template:
scrapy genspider -t crawl dushu dushu.com
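After these two commands the project layout should look roughly like this (assuming the DushuPro name used above):
DushuPro/
    scrapy.cfg
    DushuPro/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            dushu.py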
Edit settings.py:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    # scrapy_redis pipeline that stores items in Redis; you can also register
    # a pipeline of your own here (see the sketch after these settings)
    "scrapy_redis.pipelines.RedisPipeline": 400,
}
REDIS_HOST = "www.fanjianbo.com"
REDIS_PORT = 6379
# password (uncomment if the Redis instance requires one)
# REDIS_PARAMS = {"password": 'xxxxxx'}
# Scheduler: replace Scrapy's default scheduler with the scrapy_redis distributed scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Persist the request queue and dupefilter so the crawl can be paused and resumed
SCHEDULER_PERSIST = True
# Deduplication component
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
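If you also want a local copy of the data on the machine running the spider, you can add a pipeline of your own next to RedisPipeline. A minimal sketch, assuming a hypothetical JsonWriterPipeline placed in DushuPro/pipelines.py that appends one JSON line per item to books.jl:

import json

class JsonWriterPipeline:
    def open_spider(self, spider):
        # append mode, so repeated runs keep accumulating items
        self.file = open("books.jl", "a", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # write the item as one JSON object per line, then pass it on
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

Register it in ITEM_PIPELINES with a priority below 400, for example "DushuPro.pipelines.JsonWriterPipeline": 300, so it runs before the Redis pipeline.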
Write items.py:
import scrapy

class DushuproItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    author = scrapy.Field()
    cbs = scrapy.Field()
    content = scrapy.Field()
    author_info = scrapy.Field()
    mulu = scrapy.Field()
Write dushu.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from DushuPro.items import DushuproItem
# Import the distributed spider class
from scrapy_redis.spiders import RedisCrawlSpider


class BookSpider(RedisCrawlSpider):
    name = 'dushu'
    allowed_domains = ['dushu.com']
    # start_urls = ['http://www.dushu.com/book/1002_1.html']
    # In distributed mode all start URLs come from the Redis database:
    # once the engine starts, it fetches the start addresses from the
    # remote Redis instance under redis_key
    redis_key = 'book:start_urls'

    rules = (
        # Rule(LinkExtractor(allow=r'/book/1002_\d+\.html'), callback='parse_item', follow=True),
        # Rule(LinkExtractor(restrict_xpaths="//div[@class='pages']//a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_css=".pages a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Parse the list (first-level) page
        book_list = response.xpath("//div[@class='bookslist']//ul/li")
        for book in book_list:
            item = DushuproItem()
            item["name"] = book.xpath(".//h3//text()").extract_first()
            item["author"] = " ".join(book.xpath(".//p[1]//text()").extract())
            # The remaining fields live on the detail (second-level) page,
            # so extract that page's URL for the current book
            next_url = "https://www.dushu.com" + book.xpath(".//h3//a/@href").extract_first()
            # Request the detail page. Every downloaded Request is wrapped in a
            # response object whose meta attribute carries whatever is set here,
            # so putting item into meta hands it over to the detail-page callback.
            yield scrapy.Request(url=next_url, callback=self.parse_info, meta={"item": item})

    def parse_info(self, response):
        # print(response.meta)
        item = response.meta["item"]
        # Parse the detail page and fill in the remaining fields
        item["price"] = response.xpath("//span[@class='num']/text()").extract_first()
        item["cbs"] = response.xpath("//tr[2]//td[2]/a/text()").extract()[0] if response.xpath("//tr[2]//td[2]/a/text()").extract() else ""
        item["content"] = response.xpath("//div[@class='text txtsummary']//text()").extract()[0]
        item["author_info"] = response.xpath("//div[@class='text txtsummary']//text()").extract()[1]
        item["mulu"] = response.xpath("//div[contains(@class,'text txtsummary')]")[2].xpath(".//text()").extract()
        yield item
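By default RedisPipeline serializes every yielded item to JSON and pushes it onto a Redis list named after the spider, here dushu:items (unless REDIS_ITEMS_KEY is overridden). A small sketch for reading the crawled books back out with redis-py, reusing the host from settings.py:

import json
import redis

# connect to the same Redis instance the pipeline writes to
r = redis.Redis(host="www.fanjianbo.com", port=6379, decode_responses=True)

# RedisPipeline stores items as JSON strings in the "dushu:items" list
for raw in r.lrange("dushu:items", 0, -1):
    book = json.loads(raw)
    print(book.get("name"), book.get("price"))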
Run the spider:
scrapy crawl dushu
The spider will sit idle until a start URL appears under its redis_key, so connect to the Redis database and push one in:
redis-cli -h [ip] -p 6379
LPUSH book:start_urls http://www.dushu.com/book/1002_1.html
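Every machine taking part in the crawl runs the same scrapy crawl dushu command and pulls requests from the shared Redis queue, so the LPUSH only needs to be issued once. If you prefer seeding the queue from Python rather than redis-cli, a minimal sketch with redis-py:

import redis

# push the seed URL under the key the spider watches (redis_key = 'book:start_urls')
r = redis.Redis(host="www.fanjianbo.com", port=6379)
r.lpush("book:start_urls", "http://www.dushu.com/book/1002_1.html")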