scrapy-redis framework

# -*- coding: utf-8 -*-
import scrapy
from ..items import ShengshiItem
from scrapy_redis.spiders import RedisSpider

class ShengsSpider(RedisSpider):
    # The spider's identifying name; it must be unique across spiders in the project.
    name = 'shengs'
    # allowed_domains restricts the crawl to pages under this domain.
    # allowed_domains = ['stats.gov.cn']
    # A normal spider would start from this static URL list; with RedisSpider the
    # start URLs are read from Redis instead, so the list is commented out.
    # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html']
    redis_key = "ShengsSpider:start_urls"

    def parse(self, response):
        # Province level (e.g. Beijing)
        first_citys = response.xpath('//tr[@class="provincetr"]/td/a')
        for city in first_citys:
            item = ShengshiItem()
            # extract_first() returns the first value matched by the selector
            f = city.xpath('./text()').extract_first()
            item['first_city'] = f
            second_href = city.xpath('./@href').extract_first()
            if second_href is not None:
                new_url = response.urljoin(second_href)
                yield scrapy.Request(new_url, callback=self.city2, meta={'meta1': item})

    def city2(self, response):
        # Prefecture / city level (municipal districts)
        second_citys = response.xpath('//tr[@class="citytr"]/td[2]/a')
        meta1 = response.meta['meta1']
        for second_city in second_citys:
            item = ShengshiItem()
            se = second_city.xpath('./text()').extract_first()
            item['first_city'] = meta1['first_city']
            item['second_city'] = se
            third_href = second_city.xpath('./@href').extract_first()
            if third_href is not None:
                new_url = response.urljoin(third_href)
                yield scrapy.Request(new_url, meta={'meta2': item}, callback=self.city3)

    def city3(self, response):
        # County / district level
        meta2 = response.meta['meta2']
        third_citys = response.xpath('//tr[@class="countytr"]/td[2]/a')
        for third_city in third_citys:
            item = ShengshiItem()
            item['first_city'] = meta2['first_city']
            item['second_city'] = meta2['second_city']
            th = third_city.xpath('./text()').extract_first()
            item['third_city'] = th
            fourth_href = third_city.xpath('./@href').extract_first()
            if fourth_href is not None:
                new_url = response.urljoin(fourth_href)
                yield scrapy.Request(new_url, meta={'meta3': item}, callback=self.city4)

    def city4(self, response):
        # Township / town level
        meta3 = response.meta['meta3']
        forth_citys = response.xpath('//tr[@class="towntr"]/td[2]/a')
        for forth_city in forth_citys:
            item = ShengshiItem()
            fo = forth_city.xpath('./text()').extract_first()
            item['first_city'] = meta3['first_city']
            item['second_city'] = meta3['second_city']
            item['third_city'] = meta3['third_city']
            item['forth_city'] = fo
            fifth_href = forth_city.xpath('./@href').extract_first()
            if fifth_href is not None:
                new_url = response.urljoin(fifth_href)
                yield scrapy.Request(new_url, meta={'meta4': item}, callback=self.city5)

    def city5(self, response):
        # Village level: extract() returns every village name in the table,
        # not just the first one
        meta4 = response.meta['meta4']
        fifth_citys = response.xpath('//tr[@class="villagetr"]/td[3]/text()').extract()
        for fifth_city in fifth_citys:
            item = ShengshiItem()
            item['first_city'] = meta4['first_city']
            item['second_city'] = meta4['second_city']
            item['third_city'] = meta4['third_city']
            item['forth_city'] = meta4['forth_city']
            item['fifth_city'] = fifth_city
            yield item
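
Because the spider inherits from RedisSpider, it starts out idle and waits for a start URL to appear under the redis_key defined above. A minimal sketch of seeding that key with redis-py, using the commented-out entry URL and the Redis host/port from the settings below (this helper script is not part of the original project):

import redis

# Connect to the same Redis instance configured in settings.py
r = redis.Redis(host='127.0.0.1', port=6379)

# Push the entry URL onto the list the spider polls (the redis_key above);
# every idle spider process will pop a URL from this key and start crawling.
r.lpush('ShengsSpider:start_urls',
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')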

items.py
import scrapy

class ShengshiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_city = scrapy.Field()
    second_city = scrapy.Field()
    third_city = scrapy.Field()
    forth_city = scrapy.Field()
    fifth_city = scrapy.Field()

pipelines.py

from openpyxl import Workbook
import pymongo
class ShengshiPipeline(object):
    def __init__(self):
        # Connect to the local MongoDB instance and select the 'shengshi' database
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['shengshi']

    def process_item(self, item, spider):
        # Write each scraped item to the 'liandong' collection
        self.db['liandong'].insert_one(dict(item))
        return item
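
The module also imports Workbook from openpyxl without using it in ShengshiPipeline, which suggests an Excel export was planned as well. A minimal sketch of such a pipeline, assuming an output file named shengshi.xlsx (the class and file name are not part of the original post):

class ExcelExportPipeline(object):
    def open_spider(self, spider):
        # Create a workbook (reusing the Workbook import above) with a header row
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['first_city', 'second_city', 'third_city',
                        'forth_city', 'fifth_city'])

    def process_item(self, item, spider):
        # Append one row per item; levels that were not scraped stay blank
        self.ws.append([item.get(key, '') for key in
                        ('first_city', 'second_city', 'third_city',
                         'forth_city', 'fifth_city')])
        return item

    def close_spider(self, spider):
        # The output file name is an assumption, not taken from the original post
        self.wb.save('shengshi.xlsx')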

settings.py

# Dedup component: request fingerprints are stored and checked in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy_redis scheduler, which queues requests in Redis so they can be
# shared among distributed spider processes
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to flush the scheduler queue and dedup records before starting
# (True = flush, False = keep)
SCHEDULER_FLUSH_ON_START = False
# Maximum number of seconds to wait when the scheduler queue is empty before
# the spider is closed
SCHEDULER_IDLE_BEFORE_CLOSE = 10

# Whether to keep the scheduler queue and dedup records in Redis when the
# spider closes (True = keep, False = flush)
SCHEDULER_PERSIST = True
# Optionally store scraped items in Redis as well
# ITEM_PIPELINES = {
#    'scrapy_redis.pipelines.RedisPipeline': 300
# }
# Redis server address
REDIS_HOST = '127.0.0.1'
# Redis port
REDIS_PORT = 6379
# Encoding used when writing to Redis
# REDIS_ENCODING = "utf-8"


# Logging
LOG_FILE = '省市.log'     # log file (the Chinese name means "provinces and cities")
LOG_ENABLED = True        # enable logging
LOG_ENCODING = 'UTF-8'    # log file encoding
LOG_LEVEL = 'DEBUG'       # log level
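
For items to reach the MongoDB pipeline above, it still has to be registered in ITEM_PIPELINES; the scrapy_redis RedisPipeline can be enabled alongside it, as in the commented block above. A sketch assuming the project package is named shengshi (the package name is an assumption):

# Lower numbers run earlier; 'shengshi' is an assumed package name.
ITEM_PIPELINES = {
    'shengshi.pipelines.ShengshiPipeline': 300,
    # 'scrapy_redis.pipelines.RedisPipeline': 400,  # optional: mirror items into Redis
}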

 

 
