需求:抓取 http://quotes.toscrape.com/ 中每条名言的 quote、author、tags,并保存到 MongoDB 中
环境:pycharm
scrapy流程图:
1.建立工程:scrapy startproject toscrapy
2.创建spider
cd toscrapy
scrapy genspider quotes quotes.toscrape.com
3.创建item
items.py
import scrapy
class QuotesItem(scrapy.Item):
    """Container for one scraped quote."""

    # Text of the quote.
    quote = scrapy.Field()
    # Name of the quote's author.
    author = scrapy.Field()
    # List of tag strings attached to the quote.
    tags = scrapy.Field()
4.解析response
quotes.py
import scrapy
from ..items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Spider that extracts quote, author and tags from quotes.toscrape.com.

    Starts at the site's front page and keeps following the "next" link
    for as long as one is present in the response.
    """

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield one ``QuotesItem`` per quote, then a request for the next page.

        Args:
            response: The downloaded page to extract quotes from.

        Yields:
            ``QuotesItem`` objects for every ``.quote`` element, followed by
            a ``scrapy.Request`` for the next page when a next link exists.
        """
        for quote in response.css('.quote'):
            item = QuotesItem()
            item['quote'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        next_page = response.css('.next a::attr(href)').extract_first()
        # extract_first() returns None when nothing matches; joining None
        # would just re-request the current URL, so stop paginating instead.
        if next_page is not None:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
5.数据处理,pipeline
pipelines.py
import pymongo
class TextPipeline(object):
    """Item pipeline that shortens over-long quote text.

    A quote longer than the configured limit is cut to ``limit``
    characters, stripped of trailing whitespace and suffixed with
    ``'...'``. Shorter quotes are left untouched.
    """

    def __init__(self, quote_limit):
        """Store the maximum quote length.

        Args:
            quote_limit: Maximum number of characters to keep. Accepts an
                int, a numeric string, or ``None`` to disable truncation.
        """
        # The setting may be absent (None) or arrive as a string when it is
        # overridden outside settings.py; normalise once so process_item
        # never compares an int against None or str.
        self.limit = None if quote_limit is None else int(quote_limit)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``QUOTE_LIMIT`` setting."""
        return cls(
            quote_limit=crawler.settings.get('QUOTE_LIMIT'),
        )

    def process_item(self, item, spider):
        """Truncate ``item['quote']`` in place when it exceeds the limit.

        Args:
            item: Mapping-like item holding an optional ``'quote'`` value.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, possibly with a shortened ``'quote'``.
        """
        quote = item.get('quote')
        # A missing/None quote or a disabled limit means nothing to truncate.
        if quote is None or self.limit is None:
            return item
        if len(quote) > self.limit:
            item['quote'] = quote[:self.limit].rstrip() + '...'
        return item
class ToscrapyPipeline(object):
    """Item pipeline that stores every item in a MongoDB collection."""

    # Name of the MongoDB collection items are inserted into.
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection parameters; no connection is opened yet.

        Args:
            mongo_uri: Value passed to ``pymongo.MongoClient``.
            mongo_db: Name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from ``MONGO_URI`` and ``MONGO_DATABASE``.

        ``MONGO_DATABASE`` falls back to ``'items'`` when it is not set.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),
        )

    def open_spider(self, spider):
        """Create the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        """Close the MongoDB client created in ``open_spider``."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item as a plain dict and pass it on unchanged.

        Args:
            item: The scraped item to store.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, so later pipelines can keep processing it.
        """
        self.db[self.collection_name].insert_one(dict(item))
        return item
另外需要的配置:
settings.py
# Pipelines listed in ascending priority value: TextPipeline (300) shortens
# the quote text, ToscrapyPipeline (400) writes the item to MongoDB.
ITEM_PIPELINES = {
    'toscrapy.pipelines.TextPipeline': 300,
    'toscrapy.pipelines.ToscrapyPipeline': 400,
}
# ... (remaining project settings omitted)
# Connection parameters read by ToscrapyPipeline.from_crawler.
MONGO_URI = 'localhost'
MONGO_DATABASE = 'toscrape'
# Maximum quote length read by TextPipeline.from_crawler.
QUOTE_LIMIT = 50
scrapy.cfg文件目录下创建一个begin.py:
from scrapy import cmdline

# Launch the "quotes" spider exactly as `scrapy crawl quotes` would from a
# shell, so the crawl can be started (and debugged) from the IDE.
cmdline.execute(["scrapy", "crawl", "quotes"])
配置pycharm: