# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider

from ..items import ShengshiItem


class ShengsSpider(RedisSpider):
    # The spider's identifying name; it must be unique across spiders.
    name = 'shengs'
    # The domain scope of the crawl: the spider only follows pages under these domains.
    # allowed_domains = ['stats.gov.cn']
    # The list of start URLs; the first downloads would begin from these.
    # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html']
    # With scrapy_redis, start URLs are read from this Redis list instead.
    redis_key = "ShengsSpider:start_urls"

    def parse(self, response):
        # Province level, e.g. 北京市
        first_cities = response.xpath('//tr[@class="provincetr"]/td/a')
        for city in first_cities:
            item = ShengshiItem()
            # extract_first() returns the first match of the selected nodes
            item['first_city'] = city.xpath('./text()').extract_first()
            second_href = city.xpath('./@href').extract_first()
            if second_href is not None:
                new_url = response.urljoin(second_href)
                yield scrapy.Request(new_url, callback=self.city2, meta={'meta1': item})

    def city2(self, response):
        # Prefecture level (city districts)
        second_citys = response.xpath('//tr[@class="citytr"]/td[2]/a')
        meta1 = response.meta['meta1']
        for second_city in second_citys:
            item = ShengshiItem()
            item['first_city'] = meta1['first_city']
            item['second_city'] = second_city.xpath('./text()').extract_first()
            third_href = second_city.xpath('./@href').extract_first()
            if third_href is not None:
                new_url = response.urljoin(third_href)
                yield scrapy.Request(new_url, meta={'meta2': item}, callback=self.city3)

    def city3(self, response):
        # County level
        meta2 = response.meta['meta2']
        third_citys = response.xpath('//tr[@class="countytr"]/td[2]/a')
        for third_city in third_citys:
            item = ShengshiItem()
            item['first_city'] = meta2['first_city']
            item['second_city'] = meta2['second_city']
            item['third_city'] = third_city.xpath('./text()').extract_first()
            fourth_href = third_city.xpath('./@href').extract_first()
            if fourth_href is not None:
                new_url = response.urljoin(fourth_href)
                yield scrapy.Request(new_url, meta={'meta3': item}, callback=self.city4)

    def city4(self, response):
        # Town level
        meta3 = response.meta['meta3']
        forth_citys = response.xpath('//tr[@class="towntr"]/td[2]/a')
        for forth_city in forth_citys:
            item = ShengshiItem()
            item['first_city'] = meta3['first_city']
            item['second_city'] = meta3['second_city']
            item['third_city'] = meta3['third_city']
            item['forth_city'] = forth_city.xpath('./text()').extract_first()
            fifth_href = forth_city.xpath('./@href').extract_first()
            if fifth_href is not None:
                new_url = response.urljoin(fifth_href)
                yield scrapy.Request(new_url, meta={'meta4': item}, callback=self.city5)

    def city5(self, response):
        # Village level: leaf pages, so extract() every name
        # (extract_first() would return a single string and the loop
        # below would iterate over its characters)
        meta4 = response.meta['meta4']
        fifth_citys = response.xpath('//tr[@class="villagetr"]/td[3]/text()').extract()
        for fifth_city in fifth_citys:
            item = ShengshiItem()
            item['first_city'] = meta4['first_city']
            item['second_city'] = meta4['second_city']
            item['third_city'] = meta4['third_city']
            item['forth_city'] = meta4['forth_city']
            item['fifth_city'] = fifth_city
            yield item
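Because the spider inherits from RedisSpider, it does not start crawling on its own: it blocks until a start URL is pushed onto the Redis list named by redis_key. A minimal sketch of seeding that list, assuming Redis runs locally on the default port (matching REDIS_HOST and REDIS_PORT in the settings below):

import redis

# The key must match the spider's redis_key: "ShengsSpider:start_urls".
r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('ShengsSpider:start_urls',
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')

The same thing can be done from redis-cli with a single LPUSH; as soon as the value lands in the list, every idle worker running this spider picks up work from the shared queue.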
import scrapy


class ShengshiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_city = scrapy.Field()
    second_city = scrapy.Field()
    third_city = scrapy.Field()
    forth_city = scrapy.Field()
    fifth_city = scrapy.Field()
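Each crawl depth fills in one more field, and the fields behave like dictionary keys. A quick illustration (the values here are made up for the example):

item = ShengshiItem()
item['first_city'] = '北京市'    # province level
item['second_city'] = '市辖区'   # prefecture level
print(dict(item))                # {'first_city': '北京市', 'second_city': '市辖区'}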
pipelines.py
import pymongo


class ShengshiPipeline(object):
    def __init__(self):
        # Connect to a local MongoDB instance and use the 'shengshi' database
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['shengshi']

    def process_item(self, item, spider):
        # Store each item in the 'liandong' collection
        # (insert_one replaces the deprecated insert)
        self.db['liandong'].insert_one(dict(item))
        return item
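The original file also imported openpyxl's Workbook without using it, which suggests an Excel export was planned. A minimal sketch of such a pipeline, with a hypothetical output file name (not part of the original code):

from openpyxl import Workbook


class ExcelPipeline(object):
    # Hypothetical pipeline: appends one row per item, saves on close.
    FIELDS = ('first_city', 'second_city', 'third_city',
              'forth_city', 'fifth_city')

    def open_spider(self, spider):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(list(self.FIELDS))  # header row

    def process_item(self, item, spider):
        self.ws.append([item.get(f) for f in self.FIELDS])
        return item

    def close_spider(self, spider):
        self.wb.save('shengshi.xlsx')  # assumed file name

To run it alongside the MongoDB pipeline, list both in ITEM_PIPELINES with priorities, e.g. ShengshiPipeline at 300 and ExcelPipeline at 400.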
settings.py
# Deduplication component: duplicate requests are filtered through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy_redis's scheduler, which distributes requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to flush the scheduler queue and dedup records on start (True = flush)
SCHEDULER_FLUSH_ON_START = False
# Max seconds to wait when fetching from an empty scheduler queue before closing
SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Whether to keep the scheduler queue and dedup records on close (True = keep)
SCHEDULER_PERSIST = True
# Store scraped items in Redis (enable to use the scrapy_redis pipeline)
# ITEM_PIPELINES = {
#     'scrapy_redis.pipelines.RedisPipeline': 300
# }
# Redis server address
REDIS_HOST = '127.0.0.1'
# Redis port
REDIS_PORT = 6379
# Encoding used when writing to Redis
# REDIS_ENCODING = "utf-8"
# Logging
LOG_FILE = '省市.log'   # log file
LOG_ENABLED = True      # enable logging
LOG_ENCODING = 'UTF-8'  # log encoding
LOG_LEVEL = 'DEBUG'     # log level
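If the commented-out RedisPipeline is enabled, every worker's items are also serialized as JSON into a Redis list, keyed by default as '<spider name>:items' (so 'shengs:items' here). A sketch of draining that list from a separate consumer process, assuming those defaults:

import json

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
while True:
    # Block up to 10 seconds waiting for the next serialized item
    entry = r.blpop('shengs:items', timeout=10)
    if entry is None:
        break  # queue drained, no new items arriving
    _, data = entry
    item = json.loads(data)
    print(item.get('fifth_city'))

This pattern keeps the crawl and the post-processing decoupled: any number of spider processes push items in, and the consumer can run on a different machine entirely.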