# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyuniversal.utils import get_config
from scrapyuniversal.rules import rules
from scrapyuniversal.items import NewsItem
import re
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst,Join,Compose
from scrapyuniversal import urls
class UniversalSpider(CrawlSpider):
    """Config-driven CrawlSpider.

    Reads a per-site JSON config (via ``get_config(name)``) that supplies the
    crawl rules, the start URLs (static list or a generator function in
    ``urls.py``) and the item-extraction spec, then builds the spider from it.
    """
    name = 'universal'

    def __init__(self, name, *args, **kwargs):
        # `name` selects the per-site config, e.g. `-a name=china` ->
        # configs/china.json (presumably — depends on get_config; verify).
        print('######%s' % name)
        config = get_config(name)
        self.config = config
        # Rules must be assigned before CrawlSpider.__init__ (called below)
        # compiles them.
        self.rules = rules.get(config.get('rules'))
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                # Static: the config lists the URLs verbatim.
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                # Dynamic: call a URL-generating function from urls.py,
                # e.g. china1(1, 10), with args taken from the config.
                # getattr() replaces the original eval('urls.' + method):
                # same attribute lookup, but no arbitrary-code execution
                # from a config-supplied string.
                url_func = getattr(urls, start_urls.get('method'))
                self.start_urls = list(url_func(*start_urls.get('args', [])))
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        """Build an Item through an ItemLoader, both chosen by the config's
        ``item`` section; each field is filled from its list of extractors.

        Yields the loaded item (generator — Scrapy callback convention).
        """
        item = self.config.get('item')
        if not item:
            return
        # Resolve the Item and Loader classes by name from this module's
        # globals (e.g. "NewsItem" -> NewsItem, "ChinaLoader" -> ChinaLoader)
        # instead of eval() on a config string.
        cls = globals()[item.get('class')]()
        loader = globals()[item.get('loader')](cls, response=response)
        # attrs maps each field name to a list of extractor specs.
        for key, value in item.get('attrs').items():
            for extractor in value:
                method = extractor.get('method')
                # Default to [] so a spec without "args" doesn't raise
                # TypeError on unpacking None.
                ext_args = extractor.get('args', [])
                # `re` is optional; the loader treats re=None as "no regex".
                regex = extractor.get('re')
                if method == 'xpath':
                    loader.add_xpath(key, *ext_args, re=regex)
                elif method == 'css':
                    loader.add_css(key, *ext_args, re=regex)
                elif method == 'value':
                    # Literal value(s) taken straight from the config,
                    # e.g. a fixed website name.
                    loader.add_value(key, *ext_args, re=regex)
                elif method == 'attr':
                    # Copy a response attribute, e.g. response.url.
                    loader.add_value(key, getattr(response, *ext_args))
        yield loader.load_item()
class NewsLoader(ItemLoader):
    # Shared base loader for news items: unless a field overrides its
    # output processor, the first non-null extracted value wins.
    default_output_processor = TakeFirst()
class ChinaLoader(NewsLoader):
    # Loader for the China site: merge all extracted fragments into a
    # single string, then trim surrounding whitespace. str.strip is the
    # unbound-method form of the original `lambda s: s.strip()`.
    text_out = Compose(Join(), str.strip)
    source_out = Compose(Join(), str.strip)
# spiders爬虫文件 (spiders crawler module)
# 最新推荐文章于 2023-11-10 23:25:20 发布 (article metadata from the source page, not code)