Asynchronous Persistence in Scrapy

Introduction: using ZOL (Zhongguancun Online) as an example, we crawl mobile phone listings and persist them asynchronously to both MySQL and MongoDB (for learning only; do not use this for commercial purposes).

1. Create the Scrapy project (Scrapy_test_spider)

1. scrapy startproject Scrapy_test_spider
2. cd Scrapy_test_spider
3. scrapy genspider zgc_spider xxx.com


2. Project directory structure

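The original post shows the directory tree as a screenshot; it is the standard layout that scrapy startproject generates, roughly:

Scrapy_test_spider/
├── scrapy.cfg
└── Scrapy_test_spider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── zgc_spider.py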

3. Initial settings.py configuration

# User-Agent pool
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
from random import choice
USER_AGENT = choice(USER_AGENT_LIST)  # note: picks ONE random UA when the settings load, not per request
ROBOTSTXT_OBEY = False  # do not obey robots.txt
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 5  # lower the concurrency from the default 16 to 5
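Note that choice(USER_AGENT_LIST) runs only once, when the settings module is loaded, so every request in a run shares the same User-Agent. For true per-request rotation you would need a small downloader middleware. A minimal sketch (the RandomUserAgentMiddleware class and its registration are my own additions, not part of the original project):

# middlewares.py (hypothetical addition)
import random

class RandomUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # pick a fresh UA for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'Scrapy_test_spider.middlewares.RandomUserAgentMiddleware': 400,
}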

4. Define the item fields

  • For each phone we collect the title, price, rating, and total number of comments.
import scrapy

class ScrapyTestSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()          # title
    price = scrapy.Field()          # price
    score = scrapy.Field()          # rating
    comment_total = scrapy.Field()  # total number of comments


5. Design the MySQL table

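The original post only shows the table as a screenshot. A schema along these lines matches the item fields (the column names come from the item; the types are my guess, not copied from the screenshot):

-- hypothetical reconstruction of the zgc_data table
CREATE TABLE zgc_data (
    id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255) NOT NULL,   -- phone title
    price FLOAT NOT NULL,          -- price
    score FLOAT NOT NULL,          -- rating
    comment_total INT NOT NULL     -- number of comments
) DEFAULT CHARSET = utf8;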

6. Write the spider

import scrapy
from Scrapy_test_spider.items import ScrapyTestSpiderItem


class ZgcSpiderSpider(scrapy.Spider):
    name = 'zgc_spider'
    allowed_domains = ['detail.zol.com.cn']
    start_urls = ['https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html']
    count = 5  # how many extra pages to follow

    def parse(self, response):
        li_list = response.css('#J_PicMode li')
        for li in li_list:
            price = li.css('div.price-row > span.price.price-normal > b.price-type::text').get()
            title = li.css('li > h3 > a::text').get()
            score = li.css('div.comment-row > span.score::text').get()
            comment_total = li.css('div.comment-row > a.comment-num::text').get()
            # skip entries missing any field (ads, placeholders, etc.)
            if not price or not title or not score or not comment_total:
                continue
            # create a fresh item per entry so the async pipelines never see
            # an instance that a later loop iteration has already mutated
            item = ScrapyTestSpiderItem()
            item['title'] = title
            item['price'] = float(price)
            item['score'] = float(score)
            item['comment_total'] = int(comment_total.replace('人点评', ''))  # strip the "people reviewed" suffix
            yield item

        # pagination
        next_page = response.css('a.next::attr(href)').get()
        if next_page and self.count:
            self.count -= 1
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)


if __name__ == '__main__':
    # convenience entry point so the crawl can be started from an IDE
    from scrapy.cmdline import execute
    execute('scrapy crawl zgc_spider'.split())
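Before wiring up the pipelines, it helps to sanity-check the CSS selectors against the live page with Scrapy's interactive shell:

scrapy shell https://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html
>>> response.css('#J_PicMode li')
>>> response.css('a.next::attr(href)').get()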

7. Pipeline: asynchronous persistence to MySQL

from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class ScrapyTestSpiderPipelineMySql(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.table = settings.get('MYSQL_TABLE')
        self._sql = None

    @classmethod
    def from_crawler(cls, crawler):
        params = {
            'host': crawler.settings.get('MYSQL_HOST'),
            'port': crawler.settings.get('MYSQL_PORT'),
            'user': crawler.settings.get('MYSQL_USER'),
            'password': crawler.settings.get('MYSQL_PWD'),
            'db': crawler.settings.get('MYSQL_DB'),
            'charset': 'utf8'
        }
        # adbapi wraps the blocking pymysql driver in a Twisted thread pool,
        # so inserts don't block the reactor
        dbpool = adbapi.ConnectionPool('pymysql', **params)
        return cls(dbpool)

    def process_item(self, item, spider):
        # runInteraction executes the insert on a worker thread and returns a Deferred
        query = self.dbpool.runInteraction(self.insert_data_to_mysql, item, spider)
        # errback fires if the insert fails
        query.addErrback(self.handle_error, item)
        return item

    @property
    def sql(self):
        # build the INSERT statement once and cache it
        if not self._sql:
            self._sql = f"""
                INSERT INTO {self.table} (title, price, score, comment_total)
                VALUES (%s, %s, %s, %s)
            """
        return self._sql

    def insert_data_to_mysql(self, cursor, item, spider):
        params = (item['title'], item['price'], item['score'], item['comment_total'])
        cursor.execute(self.sql, params)
        print('Row inserted!')

    def handle_error(self, failure, item):
        print(f'Insert failed! -------------------- {failure}')

    def close_spider(self, spider):
        self.dbpool.close()

8. Add the following to settings.py

MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWD = '123456'
MYSQL_DB = 'my_test'
MYSQL_TABLE = 'zgc_data'
ITEM_PIPELINES = {
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMySql': 300,
}

9. Run the spider

The __main__ block at the bottom of zgc_spider.py (or scrapy crawl zgc_spider from the project root) starts the crawl:

if __name__ == '__main__':
    from scrapy.cmdline import execute
    execute('scrapy crawl zgc_spider'.split())


  • Items are written to MySQL asynchronously, so the crawl finishes very quickly.

10. Asynchronous persistence to MongoDB

Pipeline code:

import pymongo
from copy import deepcopy
from twisted.internet import reactor, defer
from twisted.python.failure import Failure


class ScrapyTestSpiderPipelineMongo(object):
    def __init__(self, mongo_uri, mongo_db, mongo_col):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.mongo_col = mongo_col

    @classmethod
    def from_crawler(cls, crawler):
        mongo_uri = crawler.settings.get('MONGO_URI')
        mongo_db = crawler.settings.get('MONGO_DB')
        mongo_col = crawler.settings.get('MONGO_COL')
        return cls(mongo_uri, mongo_db, mongo_col)

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.mongodb = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        # deepcopy so mutations of the item elsewhere can't race with
        # the insert running in another thread
        item = deepcopy(item)
        defer_out = defer.Deferred()
        # run the blocking pymongo insert on the reactor's thread pool
        reactor.callInThread(self._insert, item, defer_out, spider)
        yield defer_out
        return item

    def _insert(self, item, defer_out, spider):
        try:
            # insert_one replaces the long-deprecated Collection.insert
            self.mongodb[self.mongo_col].insert_one(dict(item))
        except Exception:
            # Failure() with no arguments captures the current exception
            reactor.callFromThread(defer_out.errback, Failure())
        else:
            reactor.callFromThread(defer_out.callback, item)
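For comparison, Twisted's deferToThread wraps the same pattern (run a blocking call on the reactor's thread pool and get back a Deferred that fires with the result or the failure) in a single call, so process_item can shrink to a few lines. A minimal sketch of that variant, not taken from the original post:

from copy import deepcopy
from twisted.internet import threads

def process_item(self, item, spider):
    item = deepcopy(item)
    # deferToThread returns a Deferred; Scrapy waits on it before handing
    # the item to the next pipeline
    d = threads.deferToThread(self.mongodb[self.mongo_col].insert_one, dict(item))
    d.addCallback(lambda _: item)  # pass the item along once the insert finishes
    return d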

Add to settings.py:

MONGO_URI = '127.0.0.1:27017'  # the key name must match what from_crawler reads
MONGO_DB = 'my_test'
MONGO_COL = 'zgc_data'
ITEM_PIPELINES = {
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMongo': 300,
}
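The MySQL and MongoDB examples each define ITEM_PIPELINES on its own. To run both pipelines in the same crawl, list them together with distinct priorities (lower numbers run first):

ITEM_PIPELINES = {
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMySql': 300,
    'Scrapy_test_spider.pipelines.ScrapyTestSpiderPipelineMongo': 400,
}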

Run the spider again, then check MongoDB: the documents land in the zgc_data collection of the my_test database.

All of the above is shared for learning purposes; there may be shortcomings, or more elegant ways to write it. Feel free to leave a comment and discuss!
