# -*- coding: utf-8 -*-
import scrapy
from ..items import ShengshiItem
from scrapy_redis.spiders import RedisSpider


class ShengsSpider(RedisSpider):
    # The spider's identifying name; it must be unique, so each spider needs a different name.
    name = 'shengs'
    # The allowed domain scope: the spider is restricted to pages under this domain.
    # allowed_domains = ['stats.gov.cn']
    # The tuple/list of URLs the crawl starts from; the first pages downloaded come from these URLs.
    # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html']
    # With scrapy_redis the start URL is read from this Redis key instead of start_urls
    # (see the seeding example after the spider code).
    redis_key = "ShengsSpider:start_urls"
    def parse(self, response):
        # Province-level rows (e.g. 北京市)
        first_city = response.xpath('//tr[@class="provincetr"]/td/a')
        for city in first_city:
            item = ShengshiItem()
            # extract_first() returns the content of the first matching node
            f = city.xpath('./text()').extract_first()
            item['first_city'] = f
            second_href = city.xpath('./@href').extract_first()
            if second_href is not None:
                new_url = response.urljoin(second_href)
                yield scrapy.Request(new_url, callback=self.city2, meta={'meta1': item})
    def city2(self, response):
        # City-level rows (prefecture cities / municipal districts)
        second_citys = response.xpath('//tr[@class="citytr"]/td[2]/a')
        meta1 = response.meta['meta1']
        for second_city in second_citys:
            item = ShengshiItem()
            se = second_city.xpath('./text()').extract_first()
            item['first_city'] = meta1['first_city']
            item['second_city'] = se
            third_href = second_city.xpath('./@href').extract_first()
            if third_href is not None:
                new_url = response.urljoin(third_href)
                yield scrapy.Request(new_url, meta={'meta2': item}, callback=self.city3)
    def city3(self, response):
        # County-level rows
        meta2 = response.meta['meta2']
        third_citys = response.xpath('//tr[@class="countytr"]/td[2]/a')
        for third_city in third_citys:
            item = ShengshiItem()
            item['first_city'] = meta2['first_city']
            item['second_city'] = meta2['second_city']
            th = third_city.xpath('./text()').extract_first()
            item['third_city'] = th
            fourth_href = third_city.xpath('./@href').extract_first()
            if fourth_href is not None:
                new_url = response.urljoin(fourth_href)
                yield scrapy.Request(new_url, meta={'meta3': item}, callback=self.city4)
    def city4(self, response):
        # Town-level rows
        meta3 = response.meta['meta3']
        forth_citys = response.xpath('//tr[@class="towntr"]/td[2]/a')
        for forth_city in forth_citys:
            item = ShengshiItem()
            fo = forth_city.xpath('./text()').extract_first()
            item['first_city'] = meta3['first_city']
            item['second_city'] = meta3['second_city']
            item['third_city'] = meta3['third_city']
            item['forth_city'] = fo
            fifth_href = forth_city.xpath('./@href').extract_first()
            if fifth_href is not None:
                new_url = response.urljoin(fifth_href)
                yield scrapy.Request(new_url, meta={'meta4': item}, callback=self.city5)
    def city5(self, response):
        # Village-level rows: td[3] holds the village name, so extract() returns the full list
        meta4 = response.meta['meta4']
        fifth_citys = response.xpath('//tr[@class="villagetr"]/td[3]/text()').extract()
        for fifth_city in fifth_citys:
            item = ShengshiItem()
            item['first_city'] = meta4['first_city']
            item['second_city'] = meta4['second_city']
            item['third_city'] = meta4['third_city']
            item['forth_city'] = meta4['forth_city']
            item['fifth_city'] = fifth_city
            yield item
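
Because ShengsSpider inherits from RedisSpider, it sits idle until a start URL is pushed onto the redis_key defined above. A minimal seeding sketch using redis-py (the same thing can be done from the command line with redis-cli LPUSH); the host and port match the Redis settings shown further down:

import redis

# Connect to the same Redis instance the spider's scheduler uses
r = redis.StrictRedis(host='127.0.0.1', port=6379)
# Push the 2017 index page onto the spider's start_urls key; any idle worker will pick it up
r.lpush('ShengsSpider:start_urls',
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')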
items.py
import scrapy


class ShengshiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_city = scrapy.Field()
    second_city = scrapy.Field()
    third_city = scrapy.Field()
    forth_city = scrapy.Field()
    fifth_city = scrapy.Field()
pipelines.py
from openpyxl import Workbook  # not used by the MongoDB pipeline; see the optional Excel sketch below
import pymongo


class ShengshiPipeline(object):
    def __init__(self):
        # Connect to the local MongoDB instance and select the shengshi database
        self.d = pymongo.MongoClient('localhost')
        self.db = self.d['shengshi']

    def process_item(self, item, spider):
        # Write each item into the liandong collection
        self.db['liandong'].insert_one(dict(item))
        return item
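
The Workbook import above is not used by the MongoDB pipeline; it suggests an Excel export was also intended. A minimal sketch of a separate pipeline that writes every item to an .xlsx file (the class name, file name and field order are assumptions, not part of the original code):

class ExcelExportPipeline(object):
    # Hypothetical companion pipeline that appends each item to an Excel sheet

    def open_spider(self, spider):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Header row matching the ShengshiItem fields
        self.ws.append(['first_city', 'second_city', 'third_city', 'forth_city', 'fifth_city'])

    def process_item(self, item, spider):
        self.ws.append([item.get('first_city'), item.get('second_city'),
                        item.get('third_city'), item.get('forth_city'),
                        item.get('fifth_city')])
        return item

    def close_spider(self, spider):
        self.wb.save('shengshi.xlsx')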
settings.py
# Dedup component: request fingerprints are stored and checked in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy_redis scheduler, which distributes requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to flush the scheduler queue and dedup records before starting: True = flush, False = keep
SCHEDULER_FLUSH_ON_START = False
# Maximum time (seconds) to wait when the scheduler queue is empty before closing
SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Whether to keep the scheduler queue and dedup records when the spider closes: True = keep, False = clear
SCHEDULER_PERSIST = True
# Optionally store scraped items in Redis as well
# ITEM_PIPELINES = {
#     'scrapy_redis.pipelines.RedisPipeline': 300
# }
# Redis server address
REDIS_HOST = '127.0.0.1'
# Redis port
REDIS_PORT = 6379
# Encoding used when writing to Redis
# REDIS_ENCODING = "utf-8"
## Logging
LOG_FILE = '省市.log'   # log file
LOG_ENABLED = True      # enable logging
LOG_ENCODING = 'UTF-8'  # log encoding
LOG_LEVEL = 'DEBUG'     # log level
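
For the MongoDB pipeline above to run, it also has to be registered in ITEM_PIPELINES. A sketch, assuming the Scrapy project package is named shengshi (the actual package name is not shown in the post):

ITEM_PIPELINES = {
    # The custom MongoDB pipeline from pipelines.py
    'shengshi.pipelines.ShengshiPipeline': 300,
    # Optional: also mirror items into Redis via scrapy_redis
    # 'scrapy_redis.pipelines.RedisPipeline': 400,
}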
This post walks through a multi-level administrative-division crawler built on Scrapy's RedisSpider. Starting from the province level, the spider descends through the city, county and town pages and finally collects village-level data; it parses the HTML with XPath, extracts the place names, and stores the results in MongoDB.
