Implementing a Distributed Crawler with Redis

Convert the single-machine jobbole spider into a distributed crawler.

The Jobbole (伯乐在线) spider is as follows:

blog.py

 
# -*- coding: utf-8 -*-
import scrapy
from ..items import JobboleItem
from ..items import ArticleItemLoader


class BlogSpider(scrapy.Spider):
    name = 'blog'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    # Goal: for every article, collect the title, image URL, publish time,
    # detail-page URL, bookmark count, like count and comment count.
    def parse(self, response):
        item_list = response.xpath('//div[@class="post floated-thumb"]')
        for item in item_list:
            img = item.xpath('.//div[@class="post-thumb"]/a/img/@src').extract_first('')
            url = item.xpath('.//a[@class="archive-title"]/@href').extract_first('')
            yield scrapy.Request(url=url, meta={'img': img}, callback=self.get_detail_with_url)

        # next_url = response.xpath('//a[@class="next page-numbers"]/@href').extract()
        # if len(next_url) != 0:
        #     page_url = next_url[0]
        #     yield scrapy.Request(url=page_url, callback=self.parse)

    def get_detail_with_url(self, response):
        # img = response.meta['img']
        # # Title
        # title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first('')
        # # Publish time
        # date_time = response.xpath('//div[@class="entry-meta"]/p/text()').extract_first('')
        # time = date_time.split('·')[0].strip()
        #
        # # Detail-page URL
        # detail_url = response.url
        #
        # # Like count
        # dian_zan = response.xpath('//h10/text()').extract_first('')
        #
        # # Bookmark count
        # book_mark = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract_first('')
        #
        # book_mark_array = book_mark.split(' ')
        # book_mark_num = 0
        # if len(book_mark_array[1]) != 0:
        #     book_mark_num = int(book_mark_array[1])
        #
        # # Comment count
        # comment = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first('')
        # comment_arr = comment.split(' ')
        # comment_num = 0
        # if len(comment_arr[1]) != 0:
        #     comment_num = int(comment_arr[1])
        #
        # item = JobboleItem()
        # item['img'] = img
        # item['title'] = title
        # item['detail_url'] = detail_url
        # item['date_time'] = time
        # item['dian_zan'] = dian_zan
        # item['book_mark'] = book_mark_num
        # item['comment'] = comment_num

        # When instantiating an ItemLoader, two arguments are required:
        #   1. an item instance whose fields will receive the extracted data
        #   2. the response holding the page source
        item_loader = ArticleItemLoader(item=JobboleItem(), response=response)
        # add_xpath() sets the value of one field and takes two arguments:
        #   1. the name of the field to set
        #   2. the XPath expression used to extract it
        item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        item_loader.add_xpath('date_time', '//div[@class="entry-meta"]/p/text()')
        item_loader.add_xpath('dian_zan', '//div[@class="post-adds"]//h10/text()')
        item_loader.add_xpath('book_mark', '//span[contains(@class,"bookmark-btn")]/text()')
        item_loader.add_xpath('comment', '//a[@href="#article-comment"]/span/text()')
        item_loader.add_value('img', [response.meta['img']])
        item_loader.add_value('detail_url', response.url)
        # load_item() collects every field stored in the loader, assigns the
        # values to the item, and the item is then passed on to the pipelines.
        item = item_loader.load_item()

        yield item

items.py

 
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst
import re

# An ItemLoader is another way of separating the data handling.
# It has these advantages:
# 1. With plain xpath()/css() extraction, extracting and cleaning the data
#    happen inside a single function. With an ItemLoader, extraction and
#    processing are split into two parts, which keeps the code clearer
#    and tidier.
# 2. The processing functions can be defined separately, and several of
#    them can be applied to one field, which makes them easy to reuse.


def changeTitle(value):
    value = '标题:' + value
    return value


def getNewTime(value):
    newTime = value.split('·')[0]
    newTime = newTime.strip()
    return newTime


def getNum(value):
    pattern = re.compile(r'\d+')
    result = re.findall(pattern, value)
    if result:
        return int(result[0])
    else:
        return 0


# To use an ItemLoader, subclass ItemLoader first.
class ArticleItemLoader(ItemLoader):
    # default_output_processor controls how each field's output is produced.
    # TakeFirst returns the first of the extracted values; by default the
    # extracted data is a list containing the values.
    # default_output_processor = ItemLoader.default_output_processor
    default_output_processor = TakeFirst()

    # list = ['hello world']
    #
    # list = list
    #
    # list = list[0]


class JobboleItem(scrapy.Item):
    # define the fields for your item here like:
    img = scrapy.Field()
    title = scrapy.Field(
        # If a function name starts with Map..., it very likely works on an
        # iterable. MapCompose accepts any number of functions as arguments;
        # every extracted value is passed through each of them in turn.
        # title map-reduce
        input_processor=MapCompose(changeTitle, lambda x: x + '------------------')
    )
    date_time = scrapy.Field(
        input_processor=MapCompose(getNewTime)
    )
    detail_url = scrapy.Field(

    )
    dian_zan = scrapy.Field(

    )
    book_mark = scrapy.Field(
        input_processor=MapCompose(getNum)
    )
    comment = scrapy.Field(
        input_processor=MapCompose(getNum)
    )
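
To make the effect of these processors concrete, here is a quick check that can be run in a plain Python shell from the project directory; the input strings are made-up examples, not actual site data:

# Quick, standalone check of the field processors (hypothetical inputs).
from jobbole.items import changeTitle, getNewTime, getNum

print(changeTitle('Python'))             # '标题:Python'
print(getNewTime('2018/08/27 · 职场'))    # '2018/08/27'
print(getNum('收藏 8 次'))                # 8  (first run of digits)
print(getNum('评论'))                     # 0  (no digits found)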

pipelines.py

 
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql
from scrapy.pipelines.images import ImagesPipeline


class JobbolePipeline(object):
    # def __init__(self):
    #     # localhost
    #     self.connect = pymysql.connect(host='localhost',
    #                                    user='root',
    #                                    password='123456',
    #                                    db='jobbole',
    #                                    port=3306)
    #     self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # self.cursor.execute('insert into blog (img, title, detail_url, time, dian_zan, book_mark, comment) '
        #                     'VALUES ("{}","{}","{}","{}","{}","{}","{}")'.format(
        #                         item['img'], item['title'], item['detail_url'], item['date_time'],
        #                         item['dian_zan'], item['book_mark'], item['comment']))
        #
        # self.connect.commit()

        return item

    # def close_spider(self, spider):
    #     self.cursor.close()
    #     self.connect.close()


class jobboleDownImage(ImagesPipeline):
    def get_media_requests(self, item, info):
        pass

    # Downloads the images, using the image URLs.
    def file_path(self, request, response=None, info=None):
        path = ''
        return path

# def test(a=1, b=2):
#     print('123')
# test(1, 2)
# test(b=2, a=1)
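
get_media_requests() and file_path() above are left as empty stubs. Below is a minimal sketch of how these two ImagesPipeline hooks are usually filled in, assuming item['img'] holds a single image URL; for it to take effect, the commented-out jobboleDownImage entry and IMAGES_STORE in settings.py would also need to be enabled.

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class jobboleDownImage(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Ask the images pipeline to download the image referenced by the item.
        if item.get('img'):
            yield scrapy.Request(item['img'])

    def file_path(self, request, response=None, info=None):
        # Save each image under its original file name, inside IMAGES_STORE.
        return 'jobbole/' + request.url.split('/')[-1]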

settings.py

 
# -*- coding: utf-8 -*-

# Scrapy settings for jobbole project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jobbole'

SPIDER_MODULES = ['jobbole.spiders']
NEWSPIDER_MODULE = 'jobbole.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jobbole (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jobbole.middlewares.JobboleSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jobbole.middlewares.JobboleDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jobbole.pipelines.JobbolePipeline': 300,
    # 'jobbole.pipelines.jobboleDownImage': 1,
}
# IMAGES_STORE = ''
# scrapy crawl blog -o wenfeng.json -s FEED_EXPORT_ENCODING=utf-8


# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Steps:

1. On the machine that acts as the server, open a terminal and enter the command below to start the Redis service (this terminal must stay open for the entire crawl and must not be closed).
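
The command itself appears only as a screenshot in the original post; on a typical installation it is simply redis-server, optionally followed by the path to a redis.conf that has been edited so that other machines are allowed to connect (the bind address and protected-mode options).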

2. On the server, open a second terminal and enter the following command:
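
This screenshot is not reproduced either; it is presumably redis-cli, which opens the Redis client that the lpush command from step 6 is later typed into.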

3. Then open RedisDesktopManager, the Redis GUI tool, click "New Connection" in the lower-left corner, enter the name and host, and leave everything else unchanged.

4. Modify the code of the jobbole single-machine project; the changes to blog.py and settings.py are sketched below.

blog.py
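
The modified spider is only shown as a screenshot in the original post. Below is a minimal sketch of the standard scrapy-redis change, assuming the scrapy_redis package is installed: the spider inherits from RedisSpider, the hard-coded start_urls list is dropped, and redis_key names the Redis list that feeds the start URLs (the same key used in step 6).

# -*- coding: utf-8 -*-
# Sketch of the distributed blog.py; only the class header changes, while
# parse() and get_detail_with_url() stay exactly as in the single-machine
# spider shown at the top of this post.
from scrapy_redis.spiders import RedisSpider


class BlogSpider(RedisSpider):
    name = 'blog'
    allowed_domains = ['blog.jobbole.com']
    # start_urls is removed; every crawler node pops its start requests
    # from this Redis list instead.
    redis_key = 'blogspider:start_urls'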

 settings.py
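
The modified settings.py is likewise only a screenshot. The lines below are the settings that scrapy-redis conventionally adds on top of the file shown earlier; the Redis host and port are placeholders and must point at the server machine (the same host entered in RedisDesktopManager in step 3).

# Additions to settings.py for scrapy-redis (a sketch, not the author's exact screenshot).

# Use the scrapy-redis scheduler and duplicate filter so that the request
# queue and the fingerprint set live in Redis and are shared by all nodes.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the Redis queues when a spider closes, so the crawl can be resumed.
SCHEDULER_PERSIST = True

# Address of the Redis server machine (placeholder values).
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

# Optionally store the scraped items in Redis as well.
ITEM_PIPELINES = {
    'jobbole.pipelines.JobbolePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}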

5. Zip the modified project and send it to the machine(s) acting as clients.

Each client machine unzips the project, opens it in PyCharm, and runs the spider.

The server machine runs the spider as well.

6. In the second terminal opened above, enter: lpush blogspider:start_urls http://blog.jobbole.com/all-posts/
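
With the scrapy-redis scheduler and dupefilter configured as sketched above, every node that has been waiting on the blogspider:start_urls list pops this URL and starts crawling; because the request queue and the duplicate filter live in Redis, the machines share the work and each page is fetched only once.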
