# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider

from ..items import ShengshiItem


class ShengsSpider(RedisSpider):
    # The spider's identifying name; it must be unique across spiders.
    name = 'shengs'
    # The domain scope of the crawl: the spider only follows pages under these domains.
    # allowed_domains = ['stats.gov.cn']
    # The list of start URLs; the first downloads would begin from these.
    # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html']
    # With scrapy_redis, start URLs are read from this Redis list instead.
    redis_key = "ShengsSpider:start_urls"

    def parse(self, response):
        # Province level, e.g. 北京市
        first_cities = response.xpath('//tr[@class="provincetr"]/td/a')
        for city in first_cities:
            item = ShengshiItem()
            # extract_first() returns the first match of the selected nodes
            item['first_city'] = city.xpath('./text()').extract_first()
            second_href = city.xpath('./@href').extract_first()
            if second_href is not None:
                new_url = response.urljoin(second_href)
                yield scrapy.Request(new_url, callback=self.city2, meta={'meta1': item})

    def city2(self, response):
        # Prefecture level (city districts)
        second_citys = response.xpath('//tr[@class="citytr"]/td[2]/a')
        meta1 = response.meta['meta1']
        for second_city in second_citys:
            item = ShengshiItem()
            item['first_city'] = meta1['first_city']
            item['second_city'] = second_city.xpath('./text()').extract_first()
            third_href = second_city.xpath('./@href').extract_first()
            if third_href is not None:
                new_url = response.urljoin(third_href)
                yield scrapy.Request(new_url, meta={'meta2': item}, callback=self.city3)

    def city3(self, response):
        # County level
        meta2 = response.meta['meta2']
        third_citys = response.xpath('//tr[@class="countytr"]/td[2]/a')
        for third_city in third_citys:
            item = ShengshiItem()
            item['first_city'] = meta2['first_city']
            item['second_city'] = meta2['second_city']
            item['third_city'] = third_city.xpath('./text()').extract_first()
            fourth_href = third_city.xpath('./@href').extract_first()
            if fourth_href is not None:
                new_url = response.urljoin(fourth_href)
                yield scrapy.Request(new_url, meta={'meta3': item}, callback=self.city4)

    def city4(self, response):
        # Town level
        meta3 = response.meta['meta3']
        forth_citys = response.xpath('//tr[@class="towntr"]/td[2]/a')
        for forth_city in forth_citys:
            item = ShengshiItem()
            item['first_city'] = meta3['first_city']
            item['second_city'] = meta3['second_city']
            item['third_city'] = meta3['third_city']
            item['forth_city'] = forth_city.xpath('./text()').extract_first()
            fifth_href = forth_city.xpath('./@href').extract_first()
            if fifth_href is not None:
                new_url = response.urljoin(fifth_href)
                yield scrapy.Request(new_url, meta={'meta4': item}, callback=self.city5)

    def city5(self, response):
        # Village level: leaf pages, so extract() every name
        # (extract_first() would return a single string and the loop
        # below would iterate over its characters)
        meta4 = response.meta['meta4']
        fifth_citys = response.xpath('//tr[@class="villagetr"]/td[3]/text()').extract()
        for fifth_city in fifth_citys:
            item = ShengshiItem()
            item['first_city'] = meta4['first_city']
            item['second_city'] = meta4['second_city']
            item['third_city'] = meta4['third_city']
            item['forth_city'] = meta4['forth_city']
            item['fifth_city'] = fifth_city
            yield item
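Because the spider inherits from RedisSpider, it does not start crawling on its own: it blocks until a start URL is pushed onto the Redis list named by redis_key. A minimal sketch of seeding that list, assuming Redis runs locally on the default port (matching REDIS_HOST and REDIS_PORT in the settings below):

import redis

# The key must match the spider's redis_key: "ShengsSpider:start_urls".
r = redis.StrictRedis(host='127.0.0.1', port=6379)
r.lpush('ShengsSpider:start_urls',
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')

The same thing can be done from redis-cli with a single LPUSH; as soon as the value lands in the list, every idle worker running this spider picks up work from the shared queue.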
import scrapy


class ShengshiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_city = scrapy.Field()
    second_city = scrapy.Field()
    third_city = scrapy.Field()
    forth_city = scrapy.Field()
    fifth_city = scrapy.Field()
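Each crawl depth fills in one more field, and the fields behave like dictionary keys. A quick illustration (the values here are made up for the example):

item = ShengshiItem()
item['first_city'] = '北京市'    # province level
item['second_city'] = '市辖区'   # prefecture level
print(dict(item))                # {'first_city': '北京市', 'second_city': '市辖区'}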
pipelines.py
import pymongo


class ShengshiPipeline(object):
    def __init__(self):
        # Connect to a local MongoDB instance and use the 'shengshi' database
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['shengshi']

    def process_item(self, item, spider):
        # Store each item in the 'liandong' collection
        # (insert_one replaces the deprecated insert)
        self.db['liandong'].insert_one(dict(item))
        return item
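The original file also imported openpyxl's Workbook without using it, which suggests an Excel export was planned. A minimal sketch of such a pipeline, with a hypothetical output file name (not part of the original code):

from openpyxl import Workbook


class ExcelPipeline(object):
    # Hypothetical pipeline: appends one row per item, saves on close.
    FIELDS = ('first_city', 'second_city', 'third_city',
              'forth_city', 'fifth_city')

    def open_spider(self, spider):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(list(self.FIELDS))  # header row

    def process_item(self, item, spider):
        self.ws.append([item.get(f) for f in self.FIELDS])
        return item

    def close_spider(self, spider):
        self.wb.save('shengshi.xlsx')  # assumed file name

To run it alongside the MongoDB pipeline, list both in ITEM_PIPELINES with priorities, e.g. ShengshiPipeline at 300 and ExcelPipeline at 400.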
settings.py
# Deduplication component: duplicate requests are filtered through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use scrapy_redis's scheduler, which distributes requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to flush the scheduler queue and dedup records on start (True = flush)
SCHEDULER_FLUSH_ON_START = False
# Max seconds to wait when fetching from an empty scheduler queue before closing
SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Whether to keep the scheduler queue and dedup records on close (True = keep)
SCHEDULER_PERSIST = True
# Store scraped items in Redis (enable to use the scrapy_redis pipeline)
# ITEM_PIPELINES = {
#     'scrapy_redis.pipelines.RedisPipeline': 300
# }
# Redis server address
REDIS_HOST = '127.0.0.1'
# Redis port
REDIS_PORT = 6379
# Encoding used when writing to Redis
# REDIS_ENCODING = "utf-8"
# Logging
LOG_FILE = '省市.log'   # log file
LOG_ENABLED = True      # enable logging
LOG_ENCODING = 'UTF-8'  # log encoding
LOG_LEVEL = 'DEBUG'     # log level
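If the commented-out RedisPipeline is enabled, every worker's items are also serialized as JSON into a Redis list, keyed by default as '<spider name>:items' (so 'shengs:items' here). A sketch of draining that list from a separate consumer process, assuming those defaults:

import json

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379)
while True:
    # Block up to 10 seconds waiting for the next serialized item
    entry = r.blpop('shengs:items', timeout=10)
    if entry is None:
        break  # queue drained, no new items arriving
    _, data = entry
    item = json.loads(data)
    print(item.get('fifth_city'))

This pattern keeps the crawl and the post-processing decoupled: any number of spider processes push items in, and the consumer can run on a different machine entirely.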