Continued: the spider code
import json
import os.path
import time

import scrapy
from pyquery import PyQuery as pq
from scrapy import Request

from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    year = ''
    dataDirPath = ''
    dirSep = '/'
    # the five administrative levels; the site's row classes are
    # 'provincetr', 'citytr', 'countytr', 'towntr', 'villagetr'
    attrClassItems = ['province', 'city', 'county', 'town', 'village']
    uItems = {}   # code -> item dict, keyed by 12-digit division code
    cItems = {}   # per-level row counters, e.g. {'province': 31, ...}
    tIimes = []   # distinct whole-second timestamps, a crude progress heartbeat
    rItems = {}   # href -> True once its response was parsed (False = requested only)

    def errback(self, failure):
        print('errback', failure)

    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        # print(u, ua, t)
        # print(SPIDER_ROOT_DATA_DIR_PATH)
        # .center_list_contlist a[href]
        # .center_list_contlist a .cont_tit03
        a = _('.center_list_contlist a').eq(0)  # the first <a> tag is the latest release
        href = a.attr('href')  # link to that year's index page
        year = a.find('.cont_tit03').text()[:-1]  # the year text ends with "年", strip it
        # self.dataDirPath = SPIDER_ROOT_DATA_DIR_PATH + year + self.dirSep
        # if not os.path.exists(self.dataDirPath):
        #     os.makedirs(self.dataDirPath)
        # if os.path.exists(self.dataDirPath):
        yield Request(url=href, callback=self.parse_item)

    def parse_item(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        s = u.split(self.dirSep)
        ud = self.dirSep.join(s[:-1]) + self.dirSep  # directory part of the URL
        # parent code from the page's file name, e.g. '.../1101.html' -> '110100000000'
        pc = str(s[-1][:-5]).ljust(12, '0')
        # match rows by class suffix: tag[class$="*"] hits 'provincetr' ... 'villagetr'
        each = _('table[class$="table"] tr[class$="tr"]')
        if u in self.rItems:
            self.rItems[u] = True  # mark this page as actually parsed
        if len(each):
            attrClass = each.eq(0).attr('class')
            attrClass = list(filter(None, attrClass.split(' ')))
            attrClass = [_.replace('tr', '') for _ in attrClass]  # 'citytr' -> 'city'
            if len(attrClass):
                attrClass = attrClass[0]
                if attrClass in self.attrClassItems:
                    attrClassIndex = self.attrClassItems.index(attrClass)
                    if attrClass not in self.cItems:
                        self.cItems.setdefault(attrClass, 0)
                    # print('-' * 80)
                    if attrClassIndex == 0:
                        # the province page lists plain links, not code/name cells
                        each = each.find('a')
                    for k, _ in enumerate(each):
                        k = str(k).zfill(3)
                        _ = pq(_)
                        href = code = text = ''
                        self.cItems[attrClass] += 1
                        if attrClassIndex == 0:
                            pc = ''.ljust(12, '0')  # provinces get an all-zero parent code
                            href = ud + _.attr('href')
                            text = _.text()
                            code = href.split(self.dirSep)[-1][:-5].ljust(12, '0')
                        elif attrClassIndex > 0:
                            td0 = _.find('td').eq(0)
                            td1 = _.find('td').eq(1)
                            a = td0.find('a')
                            hasA = a.text()
                            if hasA:  # leaf rows (villages) carry no link
                                href = ud + a.attr('href')
                            code = td0.text()
                            if attrClassIndex == 4:
                                # village rows are: code / urban-rural code / name
                                text = _.find('td').eq(2).text()
                            else:
                                text = td1.text()
                        # print(k, href, text, code)
                        self.uItems.setdefault(code, {
                            'ic': self.cItems[attrClass],  # per-level counter
                            'pc': pc,    # parent code
                            'k': k,      # row index on the page
                            'h': href,   # child page URL ('' for leaf rows)
                            't': text,   # division name
                            'c': code    # 12-digit division code
                        })
                        if href:
                            self.rItems.setdefault(href, False)
                            # print(code, text, attrClass, href)
                            try:
                                yield Request(url=href, callback=self.parse_item, errback=self.errback)
                            except (Exception, BaseException) as e:
                                print(e)
        int_time = int(time.time())
        if int_time not in self.tIimes:
            self.tIimes.append(int_time)
            print('%d' % int_time)
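Two small tricks carry the whole spider: the class$= suffix selector lets one callback recognize rows at any of the five levels, and the page's file name, zero-padded to 12 digits, doubles as the division code. A minimal standalone check of both, on made-up HTML shaped like the real pages:

from pyquery import PyQuery as pq

# Hypothetical sample HTML mirroring the row structure of a
# stats.gov.cn city-level page (real pages use the same classes).
html = '''
<table class="citytable">
  <tr class="citytr">
    <td><a href="1101.html">110100000000</a></td>
    <td><a href="1101.html">市辖区</a></td>
  </tr>
</table>
'''

doc = pq(html)
rows = doc('table[class$="table"] tr[class$="tr"]')  # suffix match on the class
level = rows.eq(0).attr('class').replace('tr', '')   # 'citytr' -> 'city'
code = '1101.html'[:-5].ljust(12, '0')               # '1101' -> '110100000000'
print(level, code)                                   # -> city 110100000000

This is why parse_item never needs to know in advance which level a page belongs to.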
Storing the results: pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class StatsPipeline:
    def process_item(self, item, spider):
        print('process_item')
        return item

    def close_spider(self, spider):
        print('close_spider')
        dItems = []
        cItems = {}  # code -> name, used to deduplicate
        for k, _ in enumerate(spider.uItems):
            _ = spider.uItems.get(_)
            # print(_)
            code = _.get('c')
            if code not in cItems:
                cItems.setdefault(code, _.get('t'))
                dItems.append(_)
        dItems.sort(key=lambda _: int(_.get('c')))  # order by division code
        sumCi = sum([spider.cItems.get(_) for _ in spider.cItems])  # total rows seen
        print(33, '-' * 80)
        with open('data.txt', 'w', encoding='utf-8') as w:
            for k, _ in enumerate(dItems):
                k = str(k).zfill(5)
                # a pc of all zeros marks a top-level (province) entry
                w.write(','.join([
                    k, _.get('pc'), _.get('c'), _.get('t'), _.get('h')
                ]) + '\n')
                print(k, _.get('pc'), _.get('c'), len(_.get('h')), _.get('t'))
        # dump the request log: href -> whether its response was ever parsed
        with open('log.txt', 'w', encoding='utf-8') as w:
            for k, _ in enumerate(spider.rItems):
                k = str(k).zfill(5)
                print(k, _, spider.rItems.get(_))
                w.write(','.join([
                    k,
                    _,
                    str(spider.rItems.get(_))
                ]) + '\n')
        """
        print(k, str(_.get('ic')).zfill(5), _.get('pc'), _.get('k'), _.get('c'),
              cItems.get(_.get('pc')), _.get('t'))
        """
        print(sumCi, spider.cItems, len(spider.tIimes))
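data.txt comes out flat, one index,pc,code,name,href line per division, with an all-zero pc marking the provinces. A hedged sketch of reading it back and regrouping children under their parent codes (it assumes names contain no commas, which holds for these pages):

from collections import defaultdict

children = defaultdict(list)  # parent code -> list of (code, name)
TOP = '0' * 12                # an all-zero pc marks a top-level (province) row

with open('data.txt', encoding='utf-8') as f:
    for line in f:
        # columns written by StatsPipeline: index, pc, code, name, href
        k, pc, code, name, href = line.rstrip('\n').split(',', 4)
        children[pc].append((code, name))

# print each province with the count of its direct children
for code, name in children[TOP]:
    print(code, name, len(children[code]))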
I found that some pages were never crawled, and I don't yet know why; I'll deal with it later.
This attempt feels like a failure, but I'll keep digging into this approach!
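If the missing pages turn out to be the server throttling or dropping requests (a guess; hrefs still marked False in log.txt would point that way), Scrapy's stock retry and throttling settings are the obvious first thing to try. A sketch for settings.py, with numbers that would need tuning:

# settings.py -- a sketch; these are standard Scrapy settings, not project-specific ones.
RETRY_ENABLED = True
RETRY_TIMES = 5                      # retry failed requests a few more times
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

DOWNLOAD_DELAY = 0.5                 # be gentler with the server
CONCURRENT_REQUESTS_PER_DOMAIN = 4

AUTOTHROTTLE_ENABLED = True          # back off automatically under load
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10

DOWNLOAD_TIMEOUT = 30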