Python: Building a Distributed Crawler with Scrapy and Redis

1、Create the Scrapy project
scrapy startproject youboy
2、Scrapy project layout
│  main.py    # crawler entry point: cmdline.execute("scrapy crawl youboySpider".split()) (see the minimal sketch after this listing)
│  scrapy.cfg
└─spider_youboy
    │  items.py # defines the fields to store; items are returned by the spider and receive the dict data coming from it
    │  middlewares.py
    │  pipelines.py # pipeline: takes the fields from items and writes them to storage, e.g. MySQL, MongoDB, JSON, CSV
    │  settings.py # configuration: database info, parameters, pipeline setup, and so on
    │  __init__.py
    │
    ├─spiders
    │  │  ddl.py
    │  │  mysqldb.py
    │  │  youboySpider.py # the core of the crawler: parses the pages and passes the data to items and the pipeline
    │  │  __init__.py
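A minimal main.py matching the entry point described above might look like the sketch below. It only assumes the file sits in the project root next to scrapy.cfg and that the spider is named youboySpider as in section 4-1.

# main.py - run the spider from an IDE instead of the command line
from scrapy import cmdline

cmdline.execute("scrapy crawl youboySpider".split())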
3、Install the MongoDB driver and scrapy-redis
pip install pymongo
pip install scrapy-redis
4、Crawler source code
MySQL table structure. If you store the data in MongoDB you do not need these tables; configuring items.py and settings.py is enough.
#coding=utf-8
#Version:python3.5.2
#Tools:Pycharm
#Date:
__author__ = "Colby"
'''
drop table youboy_diqu;
drop table youboy_enterprise;
CREATE TABLE
    youboy_diqu
    (
        provinceName VARCHAR(50) NOT NULL COMMENT 'province',
        cityName VARCHAR(50) NOT NULL COMMENT 'city',
        url VARCHAR(255) COMMENT 'url',
        flag VARCHAR(1),
        PRIMARY KEY (provinceName, cityName)
    )
    ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE TABLE youboy_enterprise
    (
        provinceName VARCHAR(50) comment 'province / municipality / autonomous region',
        cityName VARCHAR(50) comment 'city / autonomous prefecture',
        catagory_1_Name VARCHAR(50) comment 'first-level category name',
        catagory_1_Url VARCHAR(50) comment 'first-level category url',
        catagory_2_Name VARCHAR(50) comment 'second-level category name',
        catagory_2_Url VARCHAR(50) comment 'second-level category url',
        catagory_3_Name VARCHAR(50) comment 'third-level category name',
        catagory_3_Url VARCHAR(50) comment 'third-level category url',
        enterpriseName VARCHAR(125) comment 'enterprise name',
        contactPerson VARCHAR(50) comment 'contact person',
        enterpriseFax VARCHAR(50) comment 'fax',
        enterprisePhone VARCHAR(50) comment 'phone',
        enterpriseMobile VARCHAR(50) comment 'mobile',
        enterpriseAddr VARCHAR(255) comment 'address'
    )
    ENGINE=InnoDB DEFAULT CHARSET=utf8;
'''
4-1、spider.py
#coding=utf-8
'''
Tools:PyCharm 2017.1
Version:Python3.5
Author:colby_chen
Date:2017-09-26
'''
import copy
from scrapy import Request
from scrapy.selector import Selector
#from scrapy.spiders import CrawlSpider
from scrapy_redis.spiders import RedisSpider
from .mysqldb import connClose, connDB, exeBath, exeQuery, exeUpdate
import urllib.request
from lxml import etree
from ..items import SpiderYouboyItem

def gethtml(url):
    page = urllib.request.urlopen(url)
    html = page.read().decode('utf-8')
    return html

def getPage(url):
    '''
    Given the URL of the first list page, collect the URLs of all its pages
    and return them as a list.
    :param url:
    :return:
    '''
    urlList = []
    startUrl = url
    html = gethtml(startUrl)
    selector = etree.HTML(html)
    nextPageFlag = selector.xpath('//dl[@class="sheng_weizhi_next01"]/a[last()]/text()')
    print('nextPageFlag', nextPageFlag)
    maxPage = None
    if nextPageFlag.__len__() > 0:
        # Ask for a page number far past the end so the pager shows the last page
        endurl = url + '10000'
        endhtml = gethtml(endurl)
        selector = etree.HTML(endhtml)
        maxPage = selector.xpath('//dl[@class="sheng_weizhi_next01"]/strong/text()')[0]
        print('maxPage', maxPage)
        for i in range(1, int(maxPage) + 1):
            currentUrl = url + str(i)
            print('currentUrl', currentUrl)
            urlList.append(currentUrl)
    else:
        urlList.append(startUrl)
    print('urlList...............................................', urlList)
    return urlList

def enterpriseContentDetail(enterpriseUrl, *args, **kwargs):
    page = urllib.request.urlopen(enterpriseUrl)
    html = page.read().decode('utf-8')
    selector = etree.HTML(html)
    # enterpriseContent = selector.xpath('//div[@class="txl_content_con"]/ul[1]/')
    # print('enterpriseContent', enterpriseContent)
    enterpriseDetail = []
    enterpriseName = selector.xpath('//div[@class="txl_content_con"]/ul[1]/h1/text()')[0].replace('\t','').replace('\r\n','')
    contactPerson = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[2]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseFax = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[3]/text()')[0].replace('\t','').replace('\r\n','')
    enterprisePhone = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[4]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseMobile = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[5]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseAddr = selector.xpath('//div[@class="txl_content_con"]/ul[1]/li[6]/text()')[0].replace('\t','').replace('\r\n','')
    enterpriseUrl = enterpriseUrl
    base = list(*args)
    enterpriseDetail = [enterpriseName, contactPerson, enterpriseFax, enterprisePhone, enterpriseMobile, enterpriseAddr, enterpriseUrl]
    if enterpriseDetail.__len__() == 0:
        enterpriseDetail = ['', '', '', '', '', '', enterpriseUrl]
    base.extend(enterpriseDetail)
    return base

class youboySpider(RedisSpider):
    name = "youboySpider"
    # With RedisSpider the start URL is read from this Redis list (see section 7)
    redis_key = "youboySpider:start_urls"
    start_urls = ['http://book.youboy.com/diqu.html']
    def enterpriseContent(self, response):
        '''Process an enterprise list page'''
        select_enterpriseList = Selector(response)
        items_enterpriseList = response.meta['baseInfo2']
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
        enterpriseList = select_enterpriseList.xpath('//*[@id="content"]/ul/div/strong/a')
        provinceName = items_enterpriseList['provinceName']
        cityName = items_enterpriseList['cityName']
        catagory_1_Name = items_enterpriseList['catagory_1_Name']
        catagory_1_Url = items_enterpriseList['catagory_1_Url']
        catagory_2_Name = items_enterpriseList['catagory_2_Name']
        catagory_2_Url = items_enterpriseList['catagory_2_Url']
        catagory_3_Name = items_enterpriseList['catagory_3_Name']
        catagory_3_Url = items_enterpriseList['catagory_3_Url']
        baseInfo = [provinceName, cityName, catagory_1_Name, catagory_1_Url, catagory_2_Name, catagory_2_Url,
                    catagory_3_Name, catagory_3_Url]
        enterpriseContentList = []
        if enterpriseList.__len__() == 0:
            items_enterpriseList['enterpriseName'] = ''
            items_enterpriseList['contactPerson'] = ''
            items_enterpriseList['enterpriseFax'] = ''
            items_enterpriseList['enterprisePhone'] = ''
            items_enterpriseList['enterpriseMobile'] = ''
            items_enterpriseList['enterpriseAddr'] = ''
            items_enterpriseList['enterpriseUrl'] = ''
            #enterpriseContentDict=[(provinceName,cityName,catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,catagory_3_Name,catagory_3_Url,'','','','','','','')]
        for enterpriseInfo in enterpriseList:
            enterpriseUrl = enterpriseInfo.xpath('@href').extract()[0]
            enterpriseContent = enterpriseContentDetail(enterpriseUrl, baseInfo)
            # baseInfo occupies indexes 0-7; the detail fields start at index 8
            items_enterpriseList['enterpriseName'] = enterpriseContent[8]
            items_enterpriseList['contactPerson'] = enterpriseContent[9]
            items_enterpriseList['enterpriseFax'] = enterpriseContent[10]
            items_enterpriseList['enterprisePhone'] = enterpriseContent[11]
            items_enterpriseList['enterpriseMobile'] = enterpriseContent[12]
            items_enterpriseList['enterpriseAddr'] = enterpriseContent[13]
            items_enterpriseList['enterpriseUrl'] = enterpriseContent[14]
            yield items_enterpriseList

        # sql = "replace into youboy_enterprise(provinceName,cityName,catagory_1_Name,catagory_1_Url,catagory_2_Name,catagory_2_Url,catagory_3_Name,catagory_3_Url" \
        #       ",enterpriseName,contactPerson,enterpriseFax,enterprisePhone,enterpriseMobile,enterpriseAddr,enterpriseUrl) " \
        #       "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # connMysql = connDB()
        # result = exeBath(connMysql[0], connMysql[1], sql, enterpriseContentList)
        # connClose(connMysql[0], connMysql[1])

    def parse_enterpriseFirstPage(self, response):
        '''Expand an enterprise list into all of its pages'''
        select_enterpriseList = Selector(response)
        baseInfo2 = response.meta['items_catagory_3']
        firstPage = baseInfo2['catagory_3_Url']
        pageList = getPage(firstPage)
        for pageurl in pageList:
            '''
            dont_filter=True: without it the duplicate filter drops requests in nested loops
            '''
            yield Request(pageurl, meta={'baseInfo2': copy.deepcopy(baseInfo2)}, callback=self.enterpriseContent, dont_filter=True)

    def parse_catagory_3(self, response):
        '''Handle third-level industry categories'''
        selector_catagory_3 = Selector(response)
        items_catagory_3 = response.meta['items_catagory_2']
        print('Second-level category', items_catagory_3['catagory_2_Name'])
        catagory_3_List = selector_catagory_3.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        data = []
        for catagory_3 in catagory_3_List:
            catagory_3_Name = catagory_3.xpath('text()').extract()[0]
            catagory_3_Url = catagory_3.xpath('@href').extract()[0]
            items_catagory_3['catagory_3_Name'] = catagory_3_Name
            items_catagory_3['catagory_3_Url'] = items_catagory_3['url'] + catagory_3_Url
            #print(items_catagory_3['provinceName'],items_catagory_3['cityName'],items_catagory_3['catagory_1_Name'],items_catagory_3['catagory_1_Url'],items_catagory_3['catagory_2_Name'],items_catagory_3['catagory_2_Url'],items_catagory_3['catagory_3_Name'],items_catagory_3['catagory_3_Url'])
            yield Request(items_catagory_3['catagory_3_Url'], meta={'items_catagory_3': copy.deepcopy(items_catagory_3)}
                          , callback=self.parse_enterpriseFirstPage)
            #data.append((items_catagory_3['provinceName'],items_catagory_3['cityName'],items_catagory_3['catagory_1_Name'],items_catagory_3['catagory_1_Url'],items_catagory_3['catagory_2_Name'],items_catagory_3['catagory_2_Url'],items_catagory_3['catagory_3_Name'],items_catagory_3['catagory_3_Url']))

    def parse_catagory_2(self, response):
        '''Handle second-level industry categories'''
        selector_catagory_2 = Selector(response)
        items_catagory_2 = response.meta['items_catagory_1']
        print('First-level category', items_catagory_2['catagory_1_Name'])
        catagory_2_List = selector_catagory_2.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        for catagory_2 in catagory_2_List:
            catagory_2_Name = catagory_2.xpath('text()').extract()[0]
            catagory_2_Url = catagory_2.xpath('@href').extract()[0]
            items_catagory_2['catagory_2_Name'] = catagory_2_Name
            items_catagory_2['catagory_2_Url'] = items_catagory_2['url'] + catagory_2_Url
            print(items_catagory_2['provinceName']
                  , items_catagory_2['cityName']
                  , items_catagory_2['catagory_1_Name']
                  , items_catagory_2['catagory_1_Url']
                  , items_catagory_2['catagory_2_Name']
                  , items_catagory_2['catagory_2_Url'])
            yield Request(items_catagory_2['catagory_2_Url'], meta={'items_catagory_2': copy.deepcopy(items_catagory_2)}, callback=self.parse_catagory_3)

    def parse_catagory_1(self, response):
        '''Handle first-level industry categories'''
        selector_catagory_1 = Selector(response)
        items_catagory_1 = response.meta['items']
        # Top-level categories for the current region
        print('Current region', items_catagory_1['provinceName'], items_catagory_1['cityName'])
        catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul[2]/li/a')
        if catagory_1_List.__len__() == 0:
            catagory_1_List = selector_catagory_1.xpath('//div[@class="sheng_weizhi_con"]/ul/li/a')
        for catagory_1 in catagory_1_List:
            items_catagory_1['catagory_1_Name'] = catagory_1.xpath('text()').extract()[0]
            items_catagory_1['catagory_1_Url'] = items_catagory_1['url'] + catagory_1.xpath('@href').extract()[0]
            yield Request(items_catagory_1['catagory_1_Url'], meta={'items_catagory_1': copy.deepcopy(items_catagory_1)}, callback=self.parse_catagory_2)

    def parse(self, response):
        selector = Selector(response)
        url = 'http://book.youboy.com'
        # Get all <a> tags under class="ybs-bcTitle" (municipalities)
        diquUrl = []
        diqu1 = selector.xpath('//div[@class="ybs-bcTitle"]/a')
        for bg in diqu1:
            cityUrl = bg.xpath('@href').extract()[0]
            cityUrl = url + cityUrl
            cityName = bg.xpath('text()').extract()[0]
            #print(cityName,cityName,cityUrl)
            diquUrl.append((cityName, cityName, cityUrl, 'Y'))
        diqu2 = selector.xpath('//div[@class="ybs-bcBody"]/ul/li')
        for bg in diqu2:
            provinceName = bg.xpath('h3/a/text()').extract()[0]
            cityList = bg.xpath('span/a')
            for city in cityList:
                cityName = city.xpath('text()').extract()[0]
                cityUrl = city.xpath('@href').extract()[0]
                cityUrl = url + cityUrl
                diquUrl.append((provinceName, cityName, cityUrl, 'Y'))
        #print(diquUrl)
        '''Bulk-load the region list into MySQL'''
        sql = "replace into youboy_diqu(provinceName,cityName,url,flag) " \
              "values(%s,%s,%s,%s)"
        connMysql = connDB()
        result = exeBath(connMysql[0], connMysql[1], sql, diquUrl)
        #print('rows loaded:', result)
        connClose(connMysql[0], connMysql[1])
        #############################################################################################################
        # Read the region URLs back and process them province/city by province/city
        selectsql = "select provinceName,cityName,url from youboy_diqu where provinceName='上海' and cityName='上海' and flag='Y'"
        connMysql = connDB()
        results = exeQuery(connMysql[1], selectsql)
        # updatesql = "update youboy_diqu set flag='N' where provinceName='%s' and cityName='%s'" %(result[0],result[1])
        # updateresult = exeUpdate(connMysql[0],connMysql[1], updatesql)
        connClose(connMysql[0], connMysql[1])
        for result in results:
            print('Current region %s-%s' % (result[0], result[1]))
            items = {}
            items['provinceName'] = result[0]
            items['cityName'] = result[1]
            items['cityUrl'] = result[2]
            items['url'] = url
            #print('url',items['cityUrl'])
            yield Request(items['cityUrl'], meta={'items': copy.deepcopy(items)}, callback=self.parse_catagory_1)
4-2、items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class SpiderYouboyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    provinceName = scrapy.Field()
    cityName = scrapy.Field()
    catagory_1_Name = scrapy.Field()
    catagory_1_Url = scrapy.Field()
    catagory_2_Name = scrapy.Field()
    catagory_2_Url = scrapy.Field()
    catagory_3_Name = scrapy.Field()
    catagory_3_Url = scrapy.Field()
    enterpriseName = scrapy.Field()
    contactPerson = scrapy.Field()
    enterpriseFax = scrapy.Field()
    enterprisePhone = scrapy.Field()
    enterpriseMobile = scrapy.Field()
    enterpriseAddr = scrapy.Field()
    enterpriseUrl = scrapy.Field()
4-3、pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings  # deprecated in newer Scrapy; use crawler.settings via from_crawler instead

class SpiderYouboyPipeline(object):
    # def process_item(self, item, spider):
    #     return item
    def __init__(self):
        # Connect to MongoDB
        self.client = pymongo.MongoClient(
            host=settings['MONGODB_HOST']
            , port=settings['MONGODB_PORT'])
        # If the database requires authentication:
        # self.client.admin.authenticate(settings['MINGO_USER']
        # , settings['MONGO_PSW'])
        # Get a handle to the database
        self.db = self.client[settings['MONGODB_DB']]
        # Get a handle to the collection
        self.postItem = self.db[settings['MONGODB_COLL']]

    def process_item(self, item, spider):
        # Convert the item to a plain dict
        postItem = dict(item)
        print('postItem', postItem)
        # Insert one record into the collection
        self.postItem.insert_one(postItem)
        # Returning the item echoes it to the console; optional
        return item
4-4、settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for spider_youboy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spider_youboy'

SPIDER_MODULES = ['spider_youboy.spiders']
NEWSPIDER_MODULE = 'spider_youboy.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spider_youboy (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
   'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'spider_youboy.middlewares.SpiderYouboySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'spider_youboy.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4-5、mysqldb.py
#coding=utf-8
#!/usr/bin/python
'''
Author:chenlun
Date:2017-04-10
'''
import pymysql

def connDB():
    # Connect to the MySQL database and return (connection, cursor)
    try:
        conn = pymysql.connect(host='localhost', user='root', passwd='root', db='youboy', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)
    except Exception as e:
        return "connect Error!"

def exeUpdate(conn, cur, sql):
    '''Run an UPDATE or INSERT statement'''
    sta = cur.execute(sql)
    conn.commit()
    return (sta)

def exeBath(conn, cur, sql, data):
    '''Batch-insert rows'''
    #try:
    sta = cur.executemany(sql, data)
    conn.commit()
    return sta
    #except Exception as e:
    #    return pymysql.err

def exeQuery(cur, sql):
    # Run a SELECT and return all rows
    cur.execute(sql)
    result = cur.fetchall()
    return result

def connClose(conn, cur):
    # Close the cursor and the connection
    cur.close()
    conn.close()
5、Spider notes
5-1、When passing parameters through meta across several request levels stops working:
    add dont_filter=True so the duplicate filter does not silently drop the nested requests.
5-2、yield does not return immediately; the request runs asynchronously, and when it completes the callback is invoked with the response for that URL together with the meta dict.
5-3、Pay attention to XPath usage; the common expressions must be second nature.
5-4、The final step is yielding the item dict, which hands it over to the item/pipeline layer;
    each items['key'] corresponds to a key = scrapy.Field() declared in items.py.
5-5、Mind the difference between deep and shallow copies; in general use
    copy.deepcopy(Info)
5-6、Non-distributed version:
    class youboySpider(CrawlSpider):
    For the distributed Redis version, youboySpider inherits from RedisSpider instead:
    class youboySpider(RedisSpider):
    (A minimal sketch combining these points follows this list.)
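The following is a minimal, self-contained sketch of how points 5-1, 5-5 and 5-6 fit together: a RedisSpider that passes a deep-copied meta dict down two callback levels and disables duplicate filtering on the inner requests. The spider name "demoSpider" and the XPath expressions are placeholders, not the project code above.

# Sketch of the meta/deepcopy/dont_filter pattern from section 5 (placeholder spider).
import copy
from scrapy import Request
from scrapy_redis.spiders import RedisSpider

class demoSpider(RedisSpider):
    name = "demoSpider"
    # The start URL is pushed into this Redis list with lpush (see section 7)
    redis_key = "demoSpider:start_urls"

    def parse(self, response):
        for href in response.xpath('//ul[@class="cats"]/li/a/@href').extract():
            items = {'catagory_1_Url': response.urljoin(href)}
            # 5-5: deepcopy, otherwise every callback sees the same mutated dict
            yield Request(items['catagory_1_Url'],
                          meta={'items': copy.deepcopy(items)},
                          callback=self.parse_list)

    def parse_list(self, response):
        items = response.meta['items']
        for href in response.xpath('//div[@class="list"]//a/@href').extract():
            # 5-1: dont_filter=True so the dupefilter does not drop nested requests
            yield Request(response.urljoin(href),
                          meta={'items': copy.deepcopy(items)},
                          callback=self.parse_detail,
                          dont_filter=True)

    def parse_detail(self, response):
        items = response.meta['items']
        items['enterpriseUrl'] = response.url
        # 5-4: the yielded dict is handed to the item pipeline
        yield items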

6、Database configuration
    MongoDB settings:
ITEM_PIPELINES = {
   'spider_youboy.pipelines.SpiderYouboyPipeline': 300,
}
MONGODB_HOST = "127.0.0.1"
MONGODB_PORT = 27017
MONGODB_DB = 'youboy'
MONGODB_COLL = 'enterprise'
    
    Redis settings:
SCHEDULER="scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST=True
SCHEDULER_QUEUE_CLASS="scrapy_redis.queue.SpiderPriorityQueue"
REDIS_URL = None
REDIS_HOST='127.0.0.1'
REDIS_PORT=6379
7、Start the crawler through main.py
    In distributed mode you also need to push the start URL with redis-cli; otherwise the Redis list stays empty and the spider will just sit and wait:
    lpush youboySpider:start_urls http://book.youboy.com/diqu.html
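Equivalently, the start URL can be seeded from Python with the redis package. This is a small sketch, assuming Redis runs on 127.0.0.1:6379 as configured in settings.py and that the redis package is installed (pip install redis):

# Seed the scrapy-redis start queue from Python instead of redis-cli.
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('youboySpider:start_urls', 'http://book.youboy.com/diqu.html')
print('queued start urls:', r.llen('youboySpider:start_urls'))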