Python: Building a Distributed Crawler with Scrapy and Redis

1、Create the Scrapy project
scrapy startproject youboy
2、Scrapy project layout
│  main.py    # crawler entry point: cmdline.execute("scrapy crawl youboySpider".split()) (see the minimal sketch after this listing)
│  scrapy.cfg
└─spider_youboy
    │  items.py # defines the fields to store; items are returned by the spider and receive the dict data coming from it
    │  middlewares.py
    │  pipelines.py # pipeline: takes the fields from items and writes them to storage, e.g. MySQL, MongoDB, JSON, CSV
    │  settings.py # configuration: database info, parameters, pipeline setup, and so on
    │  __init__.py
    │
    ├─spiders
    │  │  ddl.py
    │  │  mysqldb.py
    │  │  youboySpider.py # the core of the crawler: parses the pages and passes the data to items and the pipeline
    │  │  __init__.py
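A minimal main.py matching the entry point described above might look like the sketch below. It only assumes the file sits in the project root next to scrapy.cfg and that the spider is named youboySpider as in section 4-1.

# main.py - run the spider from an IDE instead of the command line
from scrapy import cmdline

cmdline.execute("scrapy crawl youboySpider".split())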
3、Install the MongoDB driver and scrapy-redis
pip install pymongo
pip install scrapy-redis
4、Crawler source code
MySQL table structure. If you store the data in MongoDB you do not need these tables; configuring items.py and settings.py is enough.
#coding=utf-8
#Version:python3.5.2
#Tools:Pycharm
#Date:
__author__ = "Colby"
'''
drop table youboy_diqu;
drop table youboy_enterprise;
CREATE TABLE
    youboy_diqu
    (
        provinceName VARCHAR(50) NOT NULL COMMENT 'province',
        cityName VARCHAR(50) NOT NULL COMMENT 'city',
        url VARCHAR(255) COMMENT 'url',
        flag VARCHAR(1),
        PRIMARY KEY (provinceName, cityName)
    )
    ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE TABLE youboy_enterprise
    (
        provinceName VARCHAR(50) comment 'province / municipality / autonomous region',
        cityName VARCHAR(50) comment 'city / autonomous prefecture',
        catagory_1_Name VARCHAR(50) comment 'first-level category name',
        catagory_1_Url VARCHAR(50) comment 'first-level category url',
        catagory_2_Name VARCHAR(50) comment 'second-level category name',
        catagory_2_Url VARCHAR(50) comment 'second-level category url',
        catagory_3_Name VARCHAR(50) comment 'third-level category name',
        catagory_3_Url VARCHAR(50) comment 'third-level category url',
        enterpriseName VARCHAR(125) comment 'enterprise name',
        contactPerson VARCHAR(50) comment 'contact person',
        enterpriseFax VARCHAR(50) comment 'fax',
        enterprisePhone VARCHAR(50) comment 'phone',
        enterpriseMobile VARCHAR(50) comment 'mobile',
        enterpriseAddr VARCHAR(255) comment 'address'
    )
    ENGINE=InnoDB DEFAULT CHARSET=utf8;
'''
4-1、spider.py
#coding=utf-8
'''
Tools:PyCharm 2017.1
Version:Python3.5
Author:colby_chen
Date:2017-09-26
'''
import copy
from scrapy import Request
from scrapy.selector import Selector
#from scrapy.spiders import CrawlSpider
from scrapy_redis.spiders import RedisSpider
from .mysqldb import connClose, connDB, exeBath, exeQuery, exeUpdate
import urllib.request
from lxml import etree
from ..items import SpiderYouboyItem

def gethtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')
    return html

def getPage(url):
    '''
    Given the URL of the first list page, collect the URLs of all its pages
    and return them as a list.
    :param url:
    :return:
    '''
    urlList = []
    startUrl = url
    html = gethtml(startUrl)
    selector = etree.HTML(html)
    nextPageFlag = selector.xpath('//dl[@class="sheng_weizhi_next01"]/a[last()]/text()')
    print('nextPageFlag', nextPageFlag)
    maxPage = None
    if nextPageFlag.__len__() > 0:
        # Ask for a page number far past the end so the pager shows the last page
        endurl = url + '10000'
        endhtml = gethtml(endurl)
        selector = etree.HTML(endhtml)
        maxPage = selector.xpath('//dl[@class="sheng_weizhi_next01"]/strong/text()')[0]
        print('maxPage', maxPage)
        for i in range(1, int(maxPage) + 1):
            currentUrl = url + str(i)
            print('currentUrl', currentUrl)
            urlList.append(currentUrl)
    else:
        urlList.append(startUrl)
    print('urlList...............................................', urlList)
    return urlList

def enterpriseContentDetail(enterpriseUrl, *args, **kwargs):
    page = urllib.request.urlopen(enterpriseUrl)
    html = page.read().decode('utf-8')
    selector = etree.HTML(html)
    # enterpriseContent = selector.xpath('//div[@class="txl_content_con"]/ul[1]/')
    # print('enterpriseContent', enterpriseContent)
    enterpriseDetail = []
    enterpriseName = selector.xpath('//div[@class="txl_content_con"]/ul[1]/h1/text()')[0].replace('\t','').replace('\r\n','')
    contactPerson = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[2]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseFax = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[3]/text()')[0].replace('\t','').replace('\r\n','')
    enterprisePhone = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[4]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseMobile = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[5]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseAddr = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[6]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseUrl = enterpriseUrl
    base = list(*args)
    enterpriseDetail = [enterpriseName, contactPerson, enterpriseFax, enterprisePhone, enterpriseMobile, enterpriseAddr, enterpriseUrl]
    if enterpriseDetail.__len__() == 0:
        enterpriseDetail = ['', '', '', '', '', '', enterpriseUrl]
    base.extend(enterpriseDetail)
    return base

class youboySpider(RedisSpider):
    name = "youboySpider"
    # With RedisSpider the start URL is read from this Redis list (see section 7)
    redis_key = "youboySpider:start_urls"
    start_urls = ['http://book.youboy.com/diqu.html']
    def enterpriseContent(self, response):
        '''Process an enterprise list page'''
        select_enterpriseList = Selector(response)
        items_enterpriseList = response.meta['baseInfo2']
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        enterpriseList = select_enterpriseList.xpath('//*[@id="content"]/ul/div/strong/a')
        provinceName = items_enterpriseList['provinceName']
        cityName = items_enterpriseList['cityName']
        catagory_1_Name = items_enterpriseList['catagory_1_Name']
        catagory_1_Url = items_enterpriseList['catagory_1_Url']
        catagory_2_Name = items_enterpriseList['catagory_2_Name']
        catagory_2_Url = items_enterpriseList['catagory_2_Url']
        catagory_3_Name = items_enterpriseList['catagory_3_Name']
        catagory_3_Url = items_enterpriseList['catagory_3_Url']
        baseInfo = [provinceName, cityName, catagory_1_Name, catagory_1_Url, catagory_2_Name, catagory_2_Url,
                    catagory_3_Name, catagory_3_Url]
        enterpriseContentList = []
        if enterpriseList.__len__() == 0:
            items_enterpriseList['enterpriseName'] = ''
            items_enterpriseList['contactPerson'] = ''
            items_enterpriseList['enterpriseFax'] = ''
            items_enterpriseList['enterprisePhone'] = ''
            items_enterpriseList['enterpriseMobile'] = ''
            items_enterpriseList['enterpriseAddr'] = ''
            items_enterpriseList['enterpriseUrl'] = ''
            #enterpriseContentDict=[(provinceName,cityName,catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,catagory_3_Name,catagory_3_Url,'','','','','','','')]
        for enterpriseInfo in enterpriseList:
            enterpriseUrl = enterpriseInfo.xpath('@href').extract()[0]
            enterpriseContent = enterpriseContentDetail(enterpriseUrl, baseInfo)
            # baseInfo occupies indexes 0-7; the detail fields start at index 8
            items_enterpriseList['enterpriseName'] = enterpriseContent[8]
            items_enterpriseList['contactPerson'] = enterpriseContent[9]
            items_enterpriseList['enterpriseFax'] = enterpriseContent[10]
            items_enterpriseList['enterprisePhone'] = enterpriseContent[11]
            items_enterpriseList['enterpriseMobile'] = enterpriseContent[12]
            items_enterpriseList['enterpriseAddr'] = enterpriseContent[13]
            items_enterpriseList['enterpriseUrl'] = enterpriseContent[14]
            yield items_enterpriseList

        # sql = "replace into youboy_enterprise(provinceName,cityName,catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,catagory_3_Name,catagory_3_Url" \
        #       ",enterpriseName,contactPerson,enterpriseFax,enterprisePhone,enterpriseMobile,enterpriseAddr,enterpriseUrl) " \
        #       "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # connMysql = connDB()
        # result = exeBath(connMysql[0], connMysql[1], sql, enterpriseContentList)
        # connClose(connMysql[0], connMysql[1])

    def parse_enterpriseFirstPage(self, response):
        '''Expand an enterprise list into all of its pages'''
        select_enterpriseList = Selector(response)
        baseInfo2 = response.meta['items_catagory_3']
        firstPage = baseInfo2['catagory_3_Url']
        pageList = getPage(firstPage)
        for pageurl in pageList:
            '''
            dont_filter=True: without it the duplicate filter drops requests in nested loops
            '''
            yield Request(pageurl, meta={'baseInfo2': copy.deepcopy(baseInfo2)}, callback=self.enterpriseContent, dont_filter=True)

    def parse_catagory_3(self, response):
        '''Handle third-level industry categories'''
        selector_catagory_3 = Selector(response)
        items_catagory_3 = response.meta['items_catagory_2']
        print('Second-level category', items_catagory_3['catagory_2_Name'])
        catagory_3_List = selector_catagory_3.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        data = []
        for catagory_3 in catagory_3_List:
            catagory_3_Name = catagory_3.xpath('text()').extract()[0]
            catagory_3_Url = catagory_3.xpath('@href').extract()[0]
            items_catagory_3['catagory_3_Name'] = catagory_3_Name
            items_catagory_3['catagory_3_Url'] = items_catagory_3['url'] + catagory_3_Url
            #print(items_catagory_3['provinceName'],items_catagory_3['cityName'],items_catagory_3['catagory_1_Name'],items_catagory_3['catagory_1_Url'],items_catagory_3['catagory_2_Name'],items_catagory_3['catagory_2_Url'],items_catagory_3['catagory_3_Name'],items_catagory_3['catagory_3_Url'])
            yield Request(items_catagory_3['catagory_3_Url'], meta={'items_catagory_3': copy.deepcopy(items_catagory_3)}
                          , callback=self.parse_enterpriseFirstPage)
            #data.append((items_catagory_3['provinceName'],items_catagory_3['cityName'],items_catagory_3['catagory_1_Name'],items_catagory_3['catagory_1_Url'],items_catagory_3['catagory_2_Name'],items_catagory_3['catagory_2_Url'],items_catagory_3['catagory_3_Name'],items_catagory_3['catagory_3_Url']))

    def parse_catagory_2(self, response):
        '''Handle second-level industry categories'''
        selector_catagory_2 = Selector(response)
        items_catagory_2 = response.meta['items_catagory_1']
        print('First-level category', items_catagory_2['catagory_1_Name'])
        catagory_2_List = selector_catagory_2.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        for catagory_2 in catagory_2_List:
            catagory_2_Name = catagory_2.xpath('text()').extract()[0]
            catagory_2_Url = catagory_2.xpath('@href').extract()[0]
            items_catagory_2['catagory_2_Name'] = catagory_2_Name
            items_catagory_2['catagory_2_Url'] = items_catagory_2['url'] + catagory_2_Url
            print(items_catagory_2['provinceName']
                  , items_catagory_2['cityName']
                  , items_catagory_2['catagory_1_Name']
                  , items_catagory_2['catagory_1_Url']
                  , items_catagory_2['catagory_2_Name']
                  , items_catagory_2['catagory_2_Url'])
            yield Request(items_catagory_2['catagory_2_Url'], meta={'items_catagory_2': copy.deepcopy(items_catagory_2)}, callback=self.parse_catagory_3)

    def parse_catagory_1(self, response):
        '''Handle first-level industry categories'''
        selector_catagory_1 = Selector(response)
        items_catagory_1 = response.meta['items']
        # Top-level categories for the current region
        print('Current region', items_catagory_1['provinceName'], items_catagory_1['cityName'])
        catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        if catagory_1_List.__len__() == 0:
            catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul/li/a')
        for catagory_1 in catagory_1_List:
            items_catagory_1['catagory_1_Name'] = catagory_1.xpath('text()').extract()[0]
            items_catagory_1['catagory_1_Url'] = items_catagory_1['url'] + catagory_1.xpath('@href').extract()[0]
            yield Request(items_catagory_1['catagory_1_Url'], meta={'items_catagory_1': copy.deepcopy(items_catagory_1)}, callback=self.parse_catagory_2)

    def parse(self, response):
        selector = Selector(response)
        url = 'http://book.youboy.com'
        # Get all <a> tags under class="ybs-bcTitle" (municipalities)
        diquUrl = []
        diqu1 = selector.xpath('//div[@class="ybs-bcTitle"]/a')
        for bg in diqu1:
            cityUrl = bg.xpath('@href').extract()[0]
            cityUrl = url + cityUrl
            cityName = bg.xpath('text()').extract()[0]
            #print(cityName,cityName,cityUrl)
            diquUrl.append((cityName, cityName, cityUrl, 'Y'))
        diqu2 = selector.xpath('//div[@class="ybs-bcBody"]/ul/li')
        for bg in diqu2:
            provinceName = bg.xpath('h3/a/text()').extract()[0]
            cityList = bg.xpath('span/a')
            for city in cityList:
                cityName = city.xpath('text()').extract()[0]
                cityUrl = city.xpath('@href').extract()[0]
                cityUrl = url + cityUrl
                diquUrl.append((provinceName, cityName, cityUrl, 'Y'))
        #print(diquUrl)
        '''Bulk-load the region list into MySQL'''
        sql = "replace into youboy_diqu(provinceName,cityName,url,flag) " \
              "values(%s,%s,%s,%s)"
        connMysql = connDB()
        result = exeBath(connMysql[0], connMysql[1], sql, diquUrl)
        #print('rows loaded:', result)
        connClose(connMysql[0], connMysql[1])
        #############################################################################################################
        # Read the region URLs back and process them province/city by province/city
        selectsql = "select provinceName,cityName,url from youboy_diqu where provinceName='上海' and cityName='上海' and flag='Y'"
        connMysql = connDB()
        results = exeQuery(connMysql[1], selectsql)
        # updatesql = "update youboy_diqu set flag='N' where provinceName='%s' and cityName='%s'" %(result[0],result[1])
        # updateresult = exeUpdate(connMysql[0],connMysql[1], updatesql)
        connClose(connMysql[0], connMysql[1])
        for result in results:
            print('Current region %s-%s' % (result[0], result[1]))
            items = {}
            items['provinceName'] = result[0]
            items['cityName'] = result[1]
            items['cityUrl'] = result[2]
            items['url'] = url
            #print('url',items['cityUrl'])
            yield Request(items['cityUrl'], meta={'items': copy.deepcopy(items)}, callback=self.parse_catagory_1)
4-2、items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class SpiderYouboyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    provinceName = scrapy.Field()
    cityName = scrapy.Field()
    catagory_1_Name = scrapy.Field()
    catagory_1_Url = scrapy.Field()
    catagory_2_Name = scrapy.Field()
    catagory_2_Url = scrapy.Field()
    catagory_3_Name = scrapy.Field()
    catagory_3_Url = scrapy.Field()
    enterpriseName = scrapy.Field()
    contactPerson = scrapy.Field()
    enterpriseFax = scrapy.Field()
    enterprisePhone = scrapy.Field()
    enterpriseMobile = scrapy.Field()
    enterpriseAddr = scrapy.Field()
    enterpriseUrl = scrapy.Field()
4-3、pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings  # deprecated in newer Scrapy; use crawler.settings via from_crawler instead

class SpiderYouboyPipeline(object):
    # def process_item(self, item, spider):
    #     return item
    def __init__(self):
        # Connect to MongoDB
        self.client = pymongo.MongoClient(
            host=settings['MONGODB_HOST']
            , port=settings['MONGODB_PORT'])
        # If the database requires authentication:
        # self.client.admin.authenticate(settings['MINGO_USER']
        # , settings['MONGO_PSW'])
        # Get a handle to the database
        self.db = self.client[settings['MONGODB_DB']]
        # Get a handle to the collection
        self.postItem = self.db[settings['MONGODB_COLL']]

    def process_item(self, item, spider):
        # Convert the item to a plain dict
        postItem = dict(item)
        print('postItem', postItem)
        # Insert one record into the collection
        self.postItem.insert_one(postItem)
        # Returning the item echoes it to the console; optional
        return item
4-4、settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for spider_youboy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spider_youboy'

SPIDER_MODULES = ['spider_youboy.spiders']
NEWSPIDER_MODULE = 'spider_youboy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spider_youboy (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
   'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'spider_youboy.middlewares.SpiderYouboySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'spider_youboy.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4-5、mysqldb.py
#coding=utf-8
#!/usr/bin/python
'''
Author:chenlun
Date:2017-04-10
'''
import pymysql

def connDB():
    # Connect to the MySQL database and return (connection, cursor)
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd='root', db='youboy', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)
    except Exception as e:
        return "connect Error!"

def exeUpdate(conn, cur, sql):
    '''Run an UPDATE or INSERT statement'''
    sta = cur.execute(sql)
    conn.commit()
    return (sta)

def exeBath(conn, cur, sql, data):
    '''Batch-insert rows'''
    #try:
    sta = cur.executemany(sql, data)
    conn.commit()
    return sta
    #except Exception as e:
    #    return pymysql.err

def exeQuery(cur, sql):
    # Run a SELECT and return all rows
    cur.execute(sql)
    result = cur.fetchall()
    return result

def connClose(conn, cur):
    # Close the cursor and the connection
    cur.close()
    conn.close()
5、Spider notes
5-1、When passing parameters through meta across several request levels stops working:
    add dont_filter=True so the duplicate filter does not silently drop the nested requests.
5-2、yield does not return immediately; the request runs asynchronously, and when it completes the callback is invoked with the response for that URL together with the meta dict.
5-3、Pay attention to XPath usage; the common expressions must be second nature.
5-4、The final step is yielding the item dict, which hands it over to the item/pipeline layer;
    each items['key'] corresponds to a key = scrapy.Field() declared in items.py.
5-5、Mind the difference between deep and shallow copies; in general use
    copy.deepcopy(Info)
5-6、Non-distributed version:
    class youboySpider(CrawlSpider):
    For the distributed Redis version, youboySpider inherits from RedisSpider instead:
    class youboySpider(RedisSpider):
    (A minimal sketch combining these points follows this list.)
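The following is a minimal, self-contained sketch of how points 5-1, 5-5 and 5-6 fit together: a RedisSpider that passes a deep-copied meta dict down two callback levels and disables duplicate filtering on the inner requests. The spider name "demoSpider" and the XPath expressions are placeholders, not the project code above.

# Sketch of the meta/deepcopy/dont_filter pattern from section 5 (placeholder spider).
import copy
from scrapy import Request
from scrapy_redis.spiders import RedisSpider

class demoSpider(RedisSpider):
    name = "demoSpider"
    # The start URL is pushed into this Redis list with lpush (see section 7)
    redis_key = "demoSpider:start_urls"

    def parse(self, response):
        for href in response.xpath('//ul[@class="cats"]/li/a/@href').extract():
            items = {'catagory_1_Url': response.urljoin(href)}
            # 5-5: deepcopy, otherwise every callback sees the same mutated dict
            yield Request(items['catagory_1_Url'],
                          meta={'items': copy.deepcopy(items)},
                          callback=self.parse_list)

    def parse_list(self, response):
        items = response.meta['items']
        for href in response.xpath('//div[@class="list"]//a/@href').extract():
            # 5-1: dont_filter=True so the dupefilter does not drop nested requests
            yield Request(response.urljoin(href),
                          meta={'items': copy.deepcopy(items)},
                          callback=self.parse_detail,
                          dont_filter=True)

    def parse_detail(self, response):
        items = response.meta['items']
        items['enterpriseUrl'] = response.url
        # 5-4: the yielded dict is handed to the item pipeline
        yield items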

6、Database configuration
    MongoDB settings:
ITEM_PIPELINES = {
   'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'
    
    Redis settings:
SCHEDULER="scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST=True
SCHEDULER_QUEUE_CLASS="scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST='127.0.0.1'
REDIS_PORT=6379
7、Start the crawler through main.py
    In distributed mode you also need to push the start URL with redis-cli; otherwise the Redis list stays empty and the spider will just sit and wait:
    lpush youboySpider:start_urls http://book.youboy.com/diqu.html
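Equivalently, the start URL can be seeded from Python with the redis package. This is a small sketch, assuming Redis runs on 127.0.0.1:6379 as configured in settings.py and that the redis package is installed (pip install redis):

# Seed the scrapy-redis start queue from Python instead of redis-cli.
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('youboySpider:start_urls', 'http://book.youboy.com/diqu.html')
print('queued start urls:', r.llen('youboySpider:start_urls'))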