CY3761 | 2021-11-18 10:18
Administrative division codes - 5 levels / 12 digits - Scrapy version
Navigation on the site: 统计数据 (Statistical Data) -> 统计标准 (Statistical Standards) -> 统计用区划和城乡划分代码 (Statistical Division and Urban-Rural Classification Codes)
Start URL for the crawl: used to obtain the link to the latest year's data.
The data pages use 5 kinds of layout:
| No. | Code | Count | CSS class prefix |
| --- | --- | --- | --- |
| 1 | 11 00 00 000 000 | 00000 | province |
| 2 | 11 01 00 000 000 | 00000 | city |
| 3 | 11 01 01 000 000 | 00000 | county |
| 4 | 11 01 01 001 000 | 00000 | town |
| 5 | 11 01 01 001 001 | 00000 | village |
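The 12-digit code therefore breaks down as 2+2+2+3+3 digits across the five levels in the table. As a small standalone illustration (my own sketch, not part of the original project), splitting a full code into its level segments looks like this:

def split_division_code(code):
    """Split a 12-digit statistical division code into its five level segments.

    Segment widths are 2+2+2+3+3: province, city, county, town, village.
    """
    assert len(code) == 12 and code.isdigit()
    widths = [2, 2, 2, 3, 3]
    names = ['province', 'city', 'county', 'town', 'village']
    parts = {}
    pos = 0
    for name, width in zip(names, widths):
        parts[name] = code[pos:pos + width]
        pos += width
    return parts


# 110101001001 -> {'province': '11', 'city': '01', 'county': '01', 'town': '001', 'village': '001'}
print(split_division_code('110101001001'))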
Create the project
scrapy startproject stats
Enter the project directory
cd stats
Edit settings.py
Whether to obey the robots.txt rules (change the default True to False):
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
Enable the downloader middleware and specify which one to call
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'stats.middlewares.StatsDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
'stats.middlewares.StatsDownloaderMiddleware': 543,
}
Enable the item pipeline and specify which one handles the data
#ITEM_PIPELINES = {
# 'stats.pipelines.StatsPipeline': 300,
#}
ITEM_PIPELINES = {
'stats.pipelines.StatsPipeline': 300,
}
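pipelines.py itself is not changed in this part; the generated stats/pipelines.py stays roughly as scrapy startproject creates it (a sketch of the template, shown here only so the enabled pipeline has something to point at):

# stats/pipelines.py (as generated by scrapy startproject; unchanged in this part)
from itemadapter import ItemAdapter


class StatsPipeline:
    def process_item(self, item, spider):
        # Items pass through untouched; output is handled in the spider for now
        return item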
Enable the HTTP cache
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Turn off log output (comment this out while testing)
LOG_ENABLED = False
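An alternative to switching logging off entirely (my own suggestion, not from the original post) is to raise the log level so only errors are printed, which keeps failures visible during long crawls:

# LOG_ENABLED = False
LOG_LEVEL = 'ERROR'  # show only error messages instead of silencing all logging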
Edit the downloader middleware middlewares.py
Import UserAgent from fake_useragent
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from fake_useragent import UserAgent
Find the process_request method of the StatsDownloaderMiddleware class and add the following inside it
def process_request(self, request, spider):
    # Set a random User-Agent on every outgoing request
    request.headers['User-Agent'] = UserAgent().random
    # (the rest of the generated method body, ending in return None, stays as-is)
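A small aside (my own note, not from the original post): constructing UserAgent() fresh on every request adds avoidable overhead, since fake_useragent loads its browser data each time it is built. Creating it once on the middleware and reusing it is cheaper; a sketch, keeping the rest of the generated class unchanged:

class StatsDownloaderMiddleware:
    def __init__(self):
        # Build the User-Agent pool once instead of once per request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random
        return None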
Create a new start.py to launch the crawl (in the same directory as settings.py); a sketch follows, along with the spider code.
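The post doesn't show start.py's contents; a minimal launcher sketch, assuming the usual scrapy.cmdline approach and the spider name code defined below:

# start.py (sketch; its contents are not shown in the original post)
from scrapy import cmdline

cmdline.execute('scrapy crawl code'.split())

The spider itself, spiders/code.py (presumably created with scrapy genspider code www.stats.gov.cn), is edited to: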
import scrapy
from pyquery import PyQuery as pq


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']

    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        print(u, ua, t)
        pass
If output like the following appears, the request handling is working correctly.
Add directory-creation code to make the later processing easier.
Configuration file settings.py
SPIDER_ROOT_DATA_DIR_PATH = '行政区划代码/'  # root data directory ("行政区划代码" = administrative division codes)
Downloader middleware middlewares.py
Import the settings
import os

from .settings import *  # relative import: settings.py lives in the same stats package
Print the corresponding setting value
@classmethod
def from_crawler(cls, crawler):
    print(cls.__name__ + '.from_crawler')
    print(SPIDER_ROOT_DATA_DIR_PATH)
    if not os.path.exists(SPIDER_ROOT_DATA_DIR_PATH):
        os.makedirs(SPIDER_ROOT_DATA_DIR_PATH)
    print(os.path.exists(SPIDER_ROOT_DATA_DIR_PATH))
    # (the rest of the generated from_crawler body, ending in return s, stays as-is)
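A side note (my own suggestion, not from the original): Scrapy already passes the settings into from_crawler via the crawler object, so the custom value can be read with crawler.settings.get instead of the wildcard import. A sketch:

@classmethod
def from_crawler(cls, crawler):
    data_dir = crawler.settings.get('SPIDER_ROOT_DATA_DIR_PATH')
    if data_dir and not os.path.exists(data_dir):
        os.makedirs(data_dir)
    s = cls()
    crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    return s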
Spider file code.py
import scrapy
from pyquery import PyQuery as pq

from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']

    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        print(u, ua, t)
        print(SPIDER_ROOT_DATA_DIR_PATH)
        pass
Run it:
Add the directory-creation code and run it again:
Continue modifying the spider file
import os.path

import scrapy
from pyquery import PyQuery as pq
from scrapy import Request

from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    year = ''
    dataDirPath = ''

    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        # print(u, ua, t)
        # print(SPIDER_ROOT_DATA_DIR_PATH)
        # .center_list_contlist a[href]
        # .center_list_contlist a .cont_tit03
        a = _('.center_list_contlist a').eq(0)  # the first <a> tag is the latest year
        href = a.attr('href')  # link to the latest year's data
        year = a.find('.cont_tit03').text()[:-1]  # year text; strip the trailing "年"
        self.dataDirPath = SPIDER_ROOT_DATA_DIR_PATH + year + '/'

        if not os.path.exists(self.dataDirPath):
            os.makedirs(self.dataDirPath)

        if os.path.exists(self.dataDirPath):
            yield Request(url=href, callback=self.parse_item)
        pass

    def parse_item(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        print(u, ua, t)
        print(self.dataDirPath)
Result:
After a long stretch of debugging, the spider code ends up as follows
import json
import os.path

import scrapy
from pyquery import PyQuery as pq
from scrapy import Request

from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    year = ''
    dataDirPath = ''
    dirSep = '/'
    attrClassItems = ['province', 'city']  # 'county', 'town', 'village'
    uItems = {}

    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        # print(u, ua, t)
        # print(SPIDER_ROOT_DATA_DIR_PATH)
        # .center_list_contlist a[href]
        # .center_list_contlist a .cont_tit03
        a = _('.center_list_contlist a').eq(0)  # the first <a> tag is the latest year
        href = a.attr('href')  # link to the latest year's data
        year = a.find('.cont_tit03').text()[:-1]  # year text; strip the trailing "年"
        self.dataDirPath = SPIDER_ROOT_DATA_DIR_PATH + year + self.dirSep

        if not os.path.exists(self.dataDirPath):
            os.makedirs(self.dataDirPath)

        if os.path.exists(self.dataDirPath):
            yield Request(url=href, callback=self.parse_item)
        pass

    def parse_item(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        ud = self.dirSep.join(u.split(self.dirSep)[:-1]) + self.dirSep

        each = _('table[class$="table"] tr[class$="tr"]')  # match by class suffix: tag[class$="*"]
        if len(each):
            attrClass = each.eq(0).attr('class')
            attrClass = list(filter(None, attrClass.split(' ')))
            attrClass = [_.replace('tr', '') for _ in attrClass]

            if len(attrClass):
                attrClass = attrClass[0]

                if attrClass in self.attrClassItems:
                    attrClassIndex = self.attrClassItems.index(attrClass)
                    # print('-' * 80)
                    if attrClassIndex == 0:
                        each = each.find('a')

                    items = []
                    for k, _ in enumerate(each):
                        k = str(k).zfill(3)
                        _ = pq(_)
                        href = code = text = ''

                        if attrClassIndex == 0:
                            href = ud + _.attr('href')
                            text = _.text()
                            code = href.split(self.dirSep)[-1][:-5]
                        elif attrClassIndex > 0:
                            td0 = _.find('td').eq(0)
                            td1 = _.find('td').eq(1)
                            a = td0.find('a')
                            hasA = a.text()
                            if hasA:
                                href = ud + a.attr('href')
                            code = td0.text()
                            text = td1.text()

                        # print(k, href, text, code)
                        items.append((href, text, code))

                        if href:
                            self.uItems.setdefault(href, {
                                'k': k,
                                'h': href,
                                't': text,
                                'c': code
                            })
                            yield Request(url=href, callback=self.parse_item)

        print(112, '-' * 80)
        dItems = []
        for k, _ in enumerate(self.uItems):
            _ = self.uItems.get(_)
            # print(_)
            dItems.append(_)
        dItems.sort(key=lambda _: int(_.get('c')))

        for k, _ in enumerate(dItems):
            k = str(k).zfill(5)
            print(_.get('h'))
            print(k, _.get('k'), _.get('c'), _.get('t'))

        print()
        # print(attrClass)
        # print(u, ud, t)
        # print(self.dataDirPath)
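One note on the URL handling above (my own observation, not from the original post): parse_item rebuilds the parent directory URL by hand via ud, whereas Scrapy's response.urljoin performs the same relative-to-absolute resolution and is the more idiomatic choice. The substitution would be roughly:

# Instead of: href = ud + _.attr('href')
href = response.urljoin(_.attr('href'))  # resolve the relative link against the current page URL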
Result:
This is the first part; the level hierarchy is fairly fiddly, and so far this only covers the first two levels (the first four digits of the code).