Topic
How do you run multiple spiders within a single Scrapy project?
Overall implementation process
- First, create the Scrapy project: scrapy startproject ScrapyProjects
- Generate a spider for each site:
  scrapy genspider spider_lianjia lianjia.com  (site 1)
  scrapy genspider spider_book yousuu.com  (site 2)
- Basic project configuration:
settings.py:
BOT_NAME = 'ScrapyProjects'
SPIDER_MODULES = ['ScrapyProjects.spiders']
NEWSPIDER_MODULE = 'ScrapyProjects.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'ScrapyProjects.middlewares.ScrapyprojectsSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'ScrapyProjects.middlewares.ScrapyprojectsDownloaderMiddleware': 543,
}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# NOTE: enabling both pipelines globally is what causes the mixing problem
# described under "Issue log" below; each spider overrides this via custom_settings
ITEM_PIPELINES = {
    'ScrapyProjects.pipelines.LianJiaPipeline': 200,
    'ScrapyProjects.pipelines.BookPipeline': 300,
}
pipelines.py:
from itemadapter import ItemAdapter
import json


class LianJiaPipeline:
    # Class-level list that accumulates scraped items until the spider closes
    collected = []

    def process_item(self, item, spider):
        LianJiaPipeline.collected.append(dict(item))
        return item

    def close_spider(self, spider):
        # The ./data directory must already exist (it is created manually, see below)
        with open('./data/lianjia.json', 'w', encoding='utf-8') as fp:
            fp.write(json.dumps({'data': LianJiaPipeline.collected}, ensure_ascii=False))


class BookPipeline:
    collected = []

    def process_item(self, item, spider):
        BookPipeline.collected.append(dict(item))
        return item

    def close_spider(self, spider):
        with open('./data/book.json', 'w', encoding='utf-8') as fp:
            fp.write(json.dumps({'data': BookPipeline.collected}, ensure_ascii=False))
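As an aside, the same separation could be achieved with a single pipeline that routes on spider.name instead of two near-identical classes; a minimal sketch (RoutingPipeline is a hypothetical name, not part of this project):

import json

class RoutingPipeline:
    # Hypothetical alternative: Scrapy creates one pipeline instance per crawler,
    # so instance attributes keep each spider's items separate automatically
    def open_spider(self, spider):
        self.collected = []

    def process_item(self, item, spider):
        self.collected.append(dict(item))
        return item

    def close_spider(self, spider):
        # One output file per spider, e.g. ./data/spider_book.json
        with open('./data/{}.json'.format(spider.name), 'w', encoding='utf-8') as fp:
            fp.write(json.dumps({'data': self.collected}, ensure_ascii=False))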
middlewares.py: keep the code generated for your project as-is; no changes are needed.
items.py:
import scrapy


class LianJiaItem(scrapy.Item):
    title = scrapy.Field()


class BookItem(scrapy.Item):
    book_name = scrapy.Field()
The spiders themselves:
spider_book.py
import scrapy
from ..items import BookItem


class SpiderBookSpider(scrapy.Spider):
    name = 'spider_book'
    allowed_domains = ['yousuu.com']
    # Per-spider override: only BookPipeline processes this spider's items
    custom_settings = {
        'ITEM_PIPELINES': {'ScrapyProjects.pipelines.BookPipeline': 300},
    }

    def start_requests(self):
        for i in range(1, 11):
            url = 'https://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page={}'.format(i)
            yield scrapy.Request(url=url, callback=self.parse_data)

    def parse_data(self, response):
        ele_lists = response.xpath('//div[@class="common-card-layout StoreBooks"]/div/div')
        for ele in ele_lists:
            # Create a fresh item per book rather than reusing one instance
            item = BookItem()
            item['book_name'] = ele.xpath('.//a[@class="book-name"]/text()').extract_first()
            yield item
spider_lianjia.py
import scrapy
from ..items import LianJiaItem


class SpiderLianjiaSpider(scrapy.Spider):
    name = 'spider_lianjia'
    # Scrapy has no '*' wildcard; list the real domain (or omit the attribute)
    allowed_domains = ['lianjia.com']
    # Per-spider override: only LianJiaPipeline processes this spider's items
    custom_settings = {
        'ITEM_PIPELINES': {'ScrapyProjects.pipelines.LianJiaPipeline': 300},
    }

    def start_requests(self):
        for i in range(1, 10):
            url = 'https://hz.fang.lianjia.com/loupan/pg{}/'.format(i)
            # meta can carry arbitrary data into the callback
            data = 'hello world'
            yield scrapy.Request(url=url, callback=self.parse_lists, meta={'d': data})

    def parse_lists(self, response):
        """Each spider can also be given its own user agent via custom_settings."""
        ele_lists = response.xpath('//ul[@class="resblock-list-wrapper"]/li')
        for ele in ele_lists:
            item = LianJiaItem()
            item['title'] = ele.xpath('./a/@title').extract_first()
            yield item
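While developing, each spider can still be run on its own with Scrapy's stock crawl command, before the combined command below is wired up:

scrapy crawl spider_lianjia
scrapy crawl spider_book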
At the same level as settings.py, create two folders: data (where the scraped data is stored; just my own way of doing it) and mycmd (which holds the command that overrides the low-level run method). Note that the pipelines open './data/...' relative to the working directory, so launch the crawl from the directory that contains data/.
The mycmd folder contains an __init__.py (empty) and the mycrawl.py script:
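For orientation, the final layout looks roughly like this (a sketch following the steps above):

ScrapyProjects/
├── scrapy.cfg
└── ScrapyProjects/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    ├── data/
    ├── mycmd/
    │   ├── __init__.py
    │   └── mycrawl.py
    └── spiders/
        ├── __init__.py
        ├── spider_book.py
        └── spider_lianjia.py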
mycrawl.py:
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError


class Command(BaseRunSpiderCommand):
    requires_project = True

    def syntax(self):
        return "[options] <spider>"

    def short_desc(self):
        return "Run all spiders in the project"

    def run(self, args, opts):
        # Get the list of every spider registered in the project
        spd_loader_list = self.crawler_process.spider_loader.list()
        # Schedule each spider (falling back to names passed on the command
        # line if the loader finds none), then start them all together
        for spname in spd_loader_list or args:
            self.crawler_process.crawl(spname, **opts.spargs)
            print("Starting spider: " + spname)
        self.crawler_process.start()
    # For reference, the run() of Scrapy's built-in crawl command that
    # this class replaces:
    # def run(self, args, opts):
    #     if len(args) < 1:
    #         raise UsageError()
    #     elif len(args) > 1:
    #         raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
    #     spname = args[0]
    #
    #     crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
    #
    #     if getattr(crawl_defer, 'result', None) is not None and issubclass(crawl_defer.result.type, Exception):
    #         self.exitcode = 1
    #     else:
    #         self.crawler_process.start()
    #
    #     if (
    #         self.crawler_process.bootstrap_failed
    #         or hasattr(self.crawler_process, 'has_exception') and self.crawler_process.has_exception
    #     ):
    #         self.exitcode = 1
Add to settings.py: COMMANDS_MODULE = 'ScrapyProjects.mycmd'
With everything in place, run the new command:

scrapy mycrawl
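As an alternative to registering a custom command, the same all-spiders run can be driven from a plain script using Scrapy's public CrawlerProcess API; a minimal sketch (run_all.py is a hypothetical file placed next to scrapy.cfg):

# run_all.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Schedule every spider registered in the project, then start them together
for name in process.spider_loader.list():
    process.crawl(name)  # a string name is resolved via the spider loader
process.start()  # blocks until all scheduled crawls have finished

Run it from the project root with: python run_all.py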
Issue log
With both pipelines enabled globally in settings.py:

ITEM_PIPELINES = {
    'ScrapyProjects.pipelines.LianJiaPipeline': 200,
    'ScrapyProjects.pipelines.BookPipeline': 300,
}

every item from every spider passes through both pipelines, so the two sites' data gets mixed together in both output files.
Solution
Declare ITEM_PIPELINES again in each spider's custom_settings:
custom_settings = {
    'ITEM_PIPELINES': {'ScrapyProjects.pipelines.LianJiaPipeline': 300},
}
In closing: this method has been tested and genuinely works, so if it doesn't work for you, feel free to get in touch and compare notes.