Read good code the way you read the literature
tutorial
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl quotes".split())
tutorial
spiders
# -*- coding: utf-8 -*-
import scrapy

from tutorial.items import QuoteItem


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ['http://quotes.toscrape.com/page/1/']

    # CSS-selector version of parse(), kept for comparison:
    # def parse(self, response):
    #     quotes = response.css('.quote')
    #     for quote in quotes:
    #         item = QuoteItem()
    #         item['text'] = quote.css('.text::text').extract_first()
    #         item['author'] = quote.css('.author::text').extract_first()
    #         item['tags'] = quote.css('.tags .tag::text').extract()
    #         yield item
    #
    #     next = response.css('.pager .next a::attr("href")').extract_first()
    #     url = response.urljoin(next)
    #     print(url)
    #     yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        quotes = response.xpath('//div[@class="col-md-8"]/div')
        for quote in quotes:
            item = QuoteItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = ','.join(quote.xpath('.//div[@class="tags"]/a/text()').getall()).strip()
            yield item

        # Stop when there is no "next" link; otherwise follow it.
        next = response.xpath('//nav/ul/li[@class="next"]/a/@href').get()
        if not next:
            return
        url = response.urljoin(next)
        yield scrapy.Request(url=url, callback=self.parse)
items.py
import scrapy


class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # collection = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
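A Scrapy Item behaves much like a dict, which is how the pipelines below consume it (item['text'], dict(item)). A minimal sketch with made-up sample values:

# Sketch only: sample values are invented for illustration.
from tutorial.items import QuoteItem

item = QuoteItem()
item['text'] = 'The world as we have created it is a process of our thinking.'
item['author'] = 'Albert Einstein'
item['tags'] = 'change,deep-thoughts'
print(item['author'])   # 'Albert Einstein'
print(dict(item))       # {'text': ..., 'author': ..., 'tags': ...}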
pipelines.py
from scrapy.exceptions import DropItem
import pymongo


class TextPipeline(object):
    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
settings.py
ITEM_PIPELINES = {
    # 'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.TextPipeline': 300,
    'tutorial.pipelines.MongoPipeline': 400,
}
MONGO_URI = 'localhost'
MONGO_DB = 'tutorial'
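Because MongoPipeline stores each item in a collection named after the item class (item.__class__.__name__, i.e. QuoteItem), the stored data can be checked with a few lines of pymongo. A minimal sketch, assuming MongoDB runs on localhost as configured above:

import pymongo

client = pymongo.MongoClient('localhost')     # MONGO_URI from settings.py
db = client['tutorial']                       # MONGO_DB from settings.py
print(db['QuoteItem'].count_documents({}))    # number of stored quotes
print(db['QuoteItem'].find_one())             # one stored document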
images360
spiders
# -*- coding: utf-8 -*-
import json
from urllib.parse import urlencode

import scrapy
from scrapy import Request

from ..items import ImageItem


class ImagesSpider(scrapy.Spider):
    name = 'images'
    allowed_domains = ['images.so.com']
    start_urls = ['http://images.so.com/']

    def start_requests(self):
        data = {
            'q': '摄影',
            'src': 'srp',
            'correct': '摄影',
            'pn': '0',
            'ch': '',
            'ran': '0',
            'ras': '6',
            'cn': '0',
            'gn': '0',
            'kn': '38',
        }
        base_url = 'https://image.so.com/j?'
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            data['sn'] = page * 60 + 38
            # urlencode() turns the dict into URL-encoded GET parameters; append them
            # to the base URL to build the full URL and generate the Request.
            params = urlencode(data)
            url = base_url + params
            yield Request(url, self.parse)

    def parse(self, response):
        result = json.loads(response.text)
        for image in result.get('list'):
            item = ImageItem()
            item['id'] = image.get('id')
            item['url'] = image.get('img')
            item['title'] = image.get('title')
            item['thumb'] = image.get('_thumb')
            yield item
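The comment in start_requests mentions that urlencode() turns the parameter dict into a GET query string. A small sketch of what that produces (a trimmed parameter dict; sn=98 corresponds to page 1 under the page * 60 + 38 formula above):

from urllib.parse import urlencode

params = urlencode({'q': '摄影', 'src': 'srp', 'sn': 98})
print(params)
# q=%E6%91%84%E5%BD%B1&src=srp&sn=98
url = 'https://image.so.com/j?' + params
# https://image.so.com/j?q=%E6%91%84%E5%BD%B1&src=srp&sn=98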
items.py
from scrapy import Field, Item


class ImageItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Four fields: the image's ID, URL, title and thumbnail. The two class attributes
    # collection and table are both the string 'image', i.e. the MongoDB collection
    # name and the MySQL table name respectively.
    collection = table = 'image'
    id = Field()
    url = Field()
    title = Field()
    thumb = Field()
pipelines.py
import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class MysqlPipeline():
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        data = dict(item)
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


# A pipeline dedicated to downloading, covering both file and image downloads.
class ImagePipeline(ImagesPipeline):
    # The built-in ImagesPipeline needs a storage path in IMAGES_STORE (here './images')
    # and by default reads the item's image_urls field, which is a list. Our item
    # instead stores a single URL in the url field, so part of the logic is overridden
    # in this custom ImagePipeline.
    def file_path(self, request, response=None, info=None):
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # Called when a single item has finished downloading. results holds that item's
        # download outcomes as a list of (success, info) tuples; if nothing succeeded,
        # drop the item, otherwise return it.
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Image Download Failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])
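For reference, the results argument that item_completed() receives is a list of (ok, info) 2-tuples; when ok is True, info is a dict with the download details. Roughly along these lines (sample values are invented):

# Illustrative shape of `results` as passed to item_completed():
results = [
    (True, {
        'url': 'http://example.com/images/sample.jpg',         # requested image URL (sample)
        'path': 'sample.jpg',                                   # path returned by file_path()
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',         # MD5 of the file (sample)
    }),
    # Failed downloads appear as (False, Failure(...)) entries instead.
]
image_path = [x['path'] for ok, x in results if ok]             # same expression as in the pipeline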
settings.py
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    'images360.pipelines.MysqlPipeline': 302,
}
MAX_PAGE = 50
MONGO_URI = 'localhost'
MONGO_DB = 'images360'
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'Wangjinliang_45'
IMAGES_STORE = './images'
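MysqlPipeline assumes the target table already exists, with one column per Item field. A minimal setup sketch with pymysql, matching the settings above; the column types and sizes are assumptions for illustration, not part of the original project:

import pymysql

# Connection parameters taken from the settings above.
db = pymysql.connect(host='localhost', user='root', password='Wangjinliang_45',
                     port=3306, charset='utf8')
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 DEFAULT CHARACTER SET utf8')
cursor.execute('USE images360')
# One column per ImageItem field; VARCHAR sizes are assumed.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS image (
        id VARCHAR(64) NOT NULL PRIMARY KEY,
        url VARCHAR(512),
        title VARCHAR(512),
        thumb VARCHAR(512)
    )
''')
db.close()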
scrapyselenium (Taobao)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl taobao".split())
spiders
# -*- coding: utf-8 -*-
from urllib.parse import quote

import scrapy
from scrapy import Request

from ..items import ProductItem


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    base_url = 'http://www.taobao.com/search?q='

    def start_requests(self):
        # Iterate over the page numbers and build a Request for each page. The page
        # number is passed along via meta, and dont_filter disables deduplication.
        # quote() converts the keyword (Chinese characters) into URL-encoded form.
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield Request(url=url, callback=self.parse, meta={'page': page}, dont_filter=True)

    def parse(self, response):
        # XPath notes: // selects matching nodes anywhere in the document; a leading .
        # restricts the search to the current node; [1] is a predicate picking the first
        # match; /text() takes a node's text, extract() returns the values, strip()
        # removes surrounding whitespace and ''.join() concatenates the pieces.
        products = response.xpath(
            '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]'
        )
        for product in products:
            item = ProductItem()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
            item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
            item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
            yield item
pipelines.py
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


class ScrapyseleniumtestPipeline(object):
    def process_item(self, item, spider):
        return item
items.py
from scrapy import Field, Item


class ProductItem(Item):
    # Product image, title, price, number of deals, shop name and shop location.
    collection = 'products'
    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
middlewares.py
from logging import getLogger

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse


class SeleniumMiddleware():
    def __init__(self, timeout=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        # self.browser = webdriver.PhantomJS(service_args=service_args)
        self.browser = webdriver.Chrome(chrome_options=chrome_options)
        # self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Render the page with headless Chrome (originally PhantomJS).
        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse
        """
        # self.logger.debug('PhantomJS is starting')
        self.logger.debug('Chrome is starting')
        # The page number to crawl is carried in the request's meta attribute.
        page = request.meta.get('page', 1)
        try:
            # Open the Request's URL in the browser, then handle waiting and pagination.
            self.browser.get(request.url)
            if page > 1:
                # For pages beyond the first, jump to the page directly; otherwise just
                # wait for the page to load. To jump: locate the page-number input box
                # (input) and the submit button (submit), clear() the input, send_keys()
                # the page number, then click submit.
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
                )
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
                )
                input.clear()
                input.send_keys(page)
                submit.click()
            # Confirm the jump succeeded: the highlighted page number must equal the
            # requested page; the CSS selector and the page number are passed to
            # text_to_be_present_in_element.
            self.wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page)))
            # Finally wait until the product list has rendered; this selector matches
            # each product's information block.
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.m-itemlist .items .item')))
            # Use the browser's page_source to build and return an HtmlResponse,
            # passing its basic attributes such as url and body.
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        # The timeout presumably comes from the project settings; SELENIUM_TIMEOUT is an assumed name.
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
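The listing stops before this project's settings.py, but the code above implies what it must contain: KEYWORDS and MAX_PAGE (read in start_requests), MONGO_URI/MONGO_DB for the pipeline, a timeout for the middleware, and registration of SeleniumMiddleware and MongoPipeline. A hedged sketch, assuming the project package is named scrapyseleniumtest (as the ScrapyseleniumtestPipeline class suggests) and that the timeout setting is called SELENIUM_TIMEOUT; the keyword, page count and database name are sample values:

# settings.py (reconstructed sketch; names follow the settings.get() calls in the code above)
KEYWORDS = ['iPad']          # search keywords; sample value
MAX_PAGE = 100               # number of result pages to crawl; sample value
SELENIUM_TIMEOUT = 20        # assumed name of the timeout read in from_crawler()

MONGO_URI = 'localhost'
MONGO_DB = 'taobao'          # sample database name

DOWNLOADER_MIDDLEWARES = {
    'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
}
ITEM_PIPELINES = {
    'scrapyseleniumtest.pipelines.MongoPipeline': 300,
}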
scrapyuniversal (tech.china.com)
spiders
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..loader import ChinaLoader
from ..items import NewsItem


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles']

    rules = (
        Rule(LinkExtractor(allow=r'article\/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()')
        loader.add_value('website', '中华网')
        yield loader.load_item()

        # Plain-extraction version, before switching to Item Loader:
        # item = NewsItem()
        # item['title'] = response.xpath('//h1[@id="chan_newsTitle"]/text()').extract_first()
        # item['url'] = response.url
        # item['text'] = ''.join(response.xpath('//div[@id="chan_newsDetail"]//text()').extract()).strip()
        # item['datetime'] = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first('(\d+-\d+-\d+\s\d+:\d+:\d+)')
        # item['source'] = response.xpath('//div[@id="chan_newsInfo"]/text()').re_first('来源:(.*)').strip()
        # item['website'] = '中华网'
        # yield item
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Imported so the eval() calls below can resolve urls.<method>, the Item class
# and the Item Loader class named in the JSON config.
from .. import urls
from ..items import *
from ..loader import *
from ..rules import rules
from ..utils import get_config


class UniversalSpider(CrawlSpider):
    name = 'universal'

    def __init__(self, name, *args, **kwargs):
        config = get_config(name)
        self.config = config
        self.rules = rules.get(config.get('rules'))
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args', [])))
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = self.config.get('item')
        if item:
            cls = eval(item.get('class'))()
            loader = eval(item.get('loader'))(cls, response=response)
            # Fill the loader dynamically from the configuration.
            for key, value in item.get('attrs').items():
                for extractor in value:
                    if extractor.get('method') == 'xpath':
                        loader.add_xpath(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'css':
                        loader.add_css(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'value':
                        loader.add_value(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'attr':
                        # 'attr' reads an attribute of the response, e.g. response.url.
                        loader.add_value(key, getattr(response, *extractor.get('args')), **{'re': extractor.get('re')})
            yield loader.load_item()
run.py
import sys

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapyuniversal.utils import get_config


def run():
    name = sys.argv[1]
    custom_settings = get_config(name)
    # Name of the Spider to use for the crawl.
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # Merge the per-site settings into the project settings.
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # Start crawling.
    process.crawl(spider, **{'name': name})
    process.start()


if __name__ == '__main__':
    run()
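A brief usage sketch, assuming the configuration below is saved as configs/china.json and run.py sits in the project root:

# From the project root:
#   python run.py china
# which amounts to:
#   get_config('china')   -> dict with 'spider', 'settings', 'start_urls', 'rules', 'item', ...
#   CrawlerProcess(merged_settings).crawl('universal', name='china')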
configs
china.json
{
  "spider": "universal",
  "website": "中华网科技",
  "type": "新闻",
  "index": "http://tech.china.com/",
  "settings": {
    "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
  },
  "start_urls": {
    "type": "dynamic",
    "method": "china",
    "args": [5, 10]
  },
  "allowed_domains": [
    "tech.china.com"
  ],
  "rules": "china",
  "item": {
    "class": "NewsItem",
    "loader": "ChinaLoader",
    "attrs": {
      "title": [
        {
          "method": "xpath",
          "args": ["//h1[@id='chan_newsTitle']/text()"]
        }
      ],
      "url": [
        {
          "method": "attr",
          "args": ["url"]
        }
      ],
      "text": [
        {
          "method": "xpath",
          "args": ["//div[@id='chan_newsDetail']//text()"]
        }
      ],
      "datetime": [
        {
          "method": "xpath",
          "args": ["//div[@id='chan_newsInfo']/text()"],
          "re": "(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"
        }
      ],
      "source": [
        {
          "method": "xpath",
          "args": ["//div[@id='chan_newsInfo']/text()"]
        }
      ],
      "website": [
        {
          "method": "value",
          "args": ["中华网"]
        }
      ]
    }
  }
}
The Spider's attributes are pulled out into this JSON configuration: the first field, spider, is the Spider's name; the following fields describe the site (name, type, index page); settings holds the settings specific to this Spider; then come the Spider attributes such as start_urls, allowed_domains and rules.
items.py
from scrapy import Field, Item


class NewsItem(Item):
    # News title, URL, body text, publish time, source and site name.
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    url = Field()
    text = Field()
    datetime = Field()
    source = Field()
    website = Field()
loader.py
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Compose, Join


class NewsLoader(ItemLoader):
    # A generic Output Processor: TakeFirst returns the first non-empty value,
    # equivalent to the extract_first() calls used earlier.
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    # ChinaLoader inherits from NewsLoader. text_out and source_out use a Compose
    # processor built from two functions: Join(), which concatenates the list into one
    # string, and an anonymous function that strips leading and trailing whitespace,
    # turning the list-shaped extraction result into a single trimmed string.
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
rules.py
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

rules = {
    'china': (
        Rule(LinkExtractor(allow=r'article\/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )
}
urls.py
def china(start, end):
    for page in range(start, end + 1):
        yield 'http://tech.china.com/articles/index_' + str(page) + '.html'
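For the args [5, 10] configured in china.json, this generator produces the list-page URLs from index_5.html to index_10.html; a quick usage sketch:

print(list(china(5, 10)))
# ['http://tech.china.com/articles/index_5.html',
#  'http://tech.china.com/articles/index_6.html',
#  ...
#  'http://tech.china.com/articles/index_10.html']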
utils.py
from os.path import realpath, dirname
import json


def get_config(name):
    path = dirname(realpath(__file__)) + '/configs/' + name + '.json'
    with open(path, 'r', encoding='utf-8') as f:
        return json.loads(f.read())
README.md
A generic crawler
Using China.com (tech.china.com) as the example
Goal: crawl the news detail pages behind every page of the news list, including the title, body text, publish time, source and other information.
Project walkthrough
Create the project and the Spider
CrawlSpider is the generic Spider that Scrapy provides; it is generated with scrapy genspider -t crawl china tech.china.com. Compared with a plain Spider it adds a rules attribute. Each Rule manages a crawling rule, covering both link extraction and page following; its first argument is a LinkExtractor, and the default callback is parse_item rather than parse.
Define the Rules and implement the parse function
First change start_urls to the starting link. The Spider crawls every URL in start_urls, and once a Response comes back it extracts the hyperlinks in the page according to the Rules and generates further Requests.
The starting page is the news list, so the next step is to extract the link to each news item's detail page. Those links all sit inside div nodes whose class is con_item, specified through the LinkExtractor's restrict_xpaths argument, so the Spider extracts every hyperlink in that region and generates Requests. Each article's navigation also contains other link tags, but the real article paths start with article, so a regular expression is assigned to the allow argument. The matching pages are the detail pages that need parsing, with the callback set to parse_item.
The list page also needs pagination, so the next-page link is extracted as well. It lives inside the node whose ID is pageStyle, and the second Rule simply uses restrict_xpaths to pick it up. No detail information needs extracting there, so no callback is required. When the next page matches, it should be analysed just like the situation above, which is what follow=True expresses; follow can be omitted here because that is the default when no callback is set.
With that, both pagination and detail-page crawling are in place.
Parsing the pages
First define the Item, with fields for the news title, URL, body text, publish time, source and site name. The site name is set to 中华网; since this is meant to become a generic crawler, a field is needed to tell sites apart.
Extraction could be done as before, by calling xpath() and related methods on the response inside parse_item() and turning each news entry into a NewsItem object.
That style of extraction is not very tidy, so the Item Loader module is used instead: add_xpath(), add_css(), add_value() and so on make the extraction configurable. parse_item() is rewritten and an ItemLoader subclass named ChinaLoader is defined, inheriting from NewsLoader. NewsLoader sets the generic Output Processor to TakeFirst, the equivalent of the earlier extract_first() calls, returning the first non-empty value. ChinaLoader defines text_out and source_out using a Compose processor, which is built by composing several functions. It takes two arguments here: Join(), a Processor that joins a list into a single string, and an anonymous function that strips leading and trailing whitespace, so the list-shaped extraction result becomes a single trimmed string (a small sketch follows below).
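A minimal sketch of what that Compose processor does to a list-shaped extraction result (the input strings are made up for illustration):

from scrapy.loader.processors import Compose, Join

text_out = Compose(Join(), lambda s: s.strip())
print(text_out(['  first paragraph', 'second paragraph  ']))
# 'first paragraph second paragraph'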
Extracting a generic configuration
Extending to other sites would still mean creating a new CrawlSpider for each, defining its Rules and implementing parse_item() separately, and much of that code repeats: the CrawlSpider variables and method names are nearly identical. Instead, a generic Spider is created with scrapy genspider -t crawl universal universal, and attributes such as name and allowed_domains are extracted and assigned when the Spider is initialised. The attributes of the earlier Spider are pulled out into a JSON file named china.json, placed in a configs folder alongside the spiders folder.
About the JSON: the first field, spider, is the Spider's name (universal); next comes the site description (site name, type, index page); settings holds the Spider-specific settings; then come Spider attributes such as start_urls, allowed_domains and rules. The rules can also be defined separately in rules.py as their own configuration file, separating the Rules out.
To launch the crawler, the configuration has to be read and loaded into the Spider dynamically, so a get_config() helper that reads the JSON file is defined in utils.py.
The entry file run.py sits in the project root and launches the Spider. Its run() entry point first takes the command-line argument as name, which is the JSON file's name, i.e. the name of the site to crawl. It passes that name to get_config() to read the configuration file, obtains the Spider name to use and the settings from the config, merges those settings with the project-wide settings, creates a CrawlerProcess with the merged configuration, and calls crawl() and start() to begin crawling.
In the universal Spider, an __init__ method loads the configuration and assigns start_urls, allowed_domains, rules and so on, with the rules read from the rules.py configuration. That covers the basic configuration; the remaining parsing logic is then also extracted into configuration, covering the choice of Item Loader class, the choice of Item class, and the arguments passed to the Item Loader methods.
In the JSON configuration, the class and loader properties name the Item class and the Item Loader class, while attrs defines the extraction rules for each field. For example, every entry under title has a method property naming the extraction method: xpath means calling the Item Loader's add_xpath(), and args are its arguments, i.e. the second parameter of add_xpath(), the XPath expression. For datetime an additional regular expression is applied, passed through the re parameter.
Finally the configuration is loaded in parse_item(): it fetches the item configuration, reads the class setting, initialises the Item Loader, iterates over the Item's attributes and extracts them one by one, checking the method field and calling the corresponding handler.
In addition, start_urls sometimes also needs to be configured dynamically: it can either be a static list of URLs or be generated by calling a method; in the dynamic case the method is called with the configured arguments, as in the sketch below.
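Concretely, the dynamic start_urls entry in china.json maps onto a call into urls.py; a small sketch of the equivalence:

start_urls_cfg = {'type': 'dynamic', 'method': 'china', 'args': [5, 10]}   # from china.json
# What UniversalSpider.__init__ effectively does:
#   eval('urls.' + 'china')(*[5, 10])  ==  urls.china(5, 10)
# which yields the six list-page URLs index_5.html through index_10.html as start_urls.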