Administrative Division Codes - 5 Levels, 12 Digits - Scrapy Version (Part 1)

CY3761 | 2021-11-18 10:18

Navigation on stats.gov.cn: 统计数据 (Statistical Data) -> 统计标准 (Statistical Standards) -> 统计用区划和城乡划分代码 (statistical division and urban-rural classification codes)
Initial URL to crawl: the listing page, from which the link to the latest year's data is obtained

The data pages use 5 row layouts, one per level, identified by the class prefixes below (the full class names on the pages are provincetr, citytr, and so on):

No.   Code (12 digits)     class prefix
1     11 00 00 000 000     province
2     11 01 00 000 000     city
3     11 01 01 000 000     county
4     11 01 01 001 000     town
5     11 01 01 001 001     village
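
To make the 12-digit layout concrete, here is a small illustrative helper (not part of the project code; split_code is a hypothetical name used only for this example) that splits a code into its five segments:

def split_code(code):
    # split a 12-digit statistical division code into its five levels
    return {
        'province': code[0:2],   # e.g. '11'
        'city':     code[2:4],   # e.g. '01'
        'county':   code[4:6],   # e.g. '01'
        'town':     code[6:9],   # e.g. '001'
        'village':  code[9:12],  # e.g. '001'
    }

print(split_code('110101001001'))
# -> {'province': '11', 'city': '01', 'county': '01', 'town': '001', 'village': '001'}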

Create the project

scrapy startproject stats

Enter the project directory

cd stats

Terminal screenshot 001

Edit settings.py

Whether to obey the robots.txt rules; switch it to False:

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

Enable the downloader middleware and point it at our middleware class:

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'stats.middlewares.StatsDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
   'stats.middlewares.StatsDownloaderMiddleware': 543,
}

Enable the item pipeline and point it at our pipeline class:

#ITEM_PIPELINES = {
#    'stats.pipelines.StatsPipeline': 300,
#}
ITEM_PIPELINES = {
   'stats.pipelines.StatsPipeline': 300,
}

Enable the HTTP cache:

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Disable logging (comment this line out while testing):

LOG_ENABLED = False
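
If some output is still wanted while the crawl runs, a possible middle ground (not used in this post) is to keep logging enabled in settings.py but raise the log level so only warnings and errors are shown:

# alternative: keep logging on, but only show warnings and errors
LOG_ENABLED = True
LOG_LEVEL = 'WARNING'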

Edit the downloader middleware, middlewares.py

Import UserAgent from fake_useragent:

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from fake_useragent import UserAgent

Find the process_request method of the StatsDownloaderMiddleware class and add this line to it:

    def process_request(self, request, spider):
        request.headers['User-Agent'] = UserAgent().random
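
A small optional tweak, not part of the original code: UserAgent() is constructed on every request above, which repeats its setup work each time. A sketch that keeps one shared instance on the middleware class instead:

    # sketch: build the UserAgent once and reuse it for every request
    ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random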

Create start.py in the same directory as settings.py (used to launch the crawl).
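
A minimal sketch of such a launcher, assuming it does nothing more than run the spider named code defined below:

# start.py - hypothetical launcher (its contents are not shown in the original post)
from scrapy import cmdline

# equivalent to running "scrapy crawl code" from the project root
cmdline.execute('scrapy crawl code'.split())

The spider itself lives in spiders/code.py (presumably generated with scrapy genspider code www.stats.gov.cn) and is modified as follows: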

import scrapy

from pyquery import PyQuery as pq


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    
    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        
        print(u, ua, t)
        pass
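
Run the crawl with scrapy crawl code from the project root (or via the start.py launcher sketched earlier).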

Output like the following (URL, randomized User-Agent, page title) means the request handling works:

Terminal screenshot 002

Next, add code that creates the data directories, to make later processing easier.

In the settings file settings.py:

SPIDER_ROOT_DATA_DIR_PATH = '行政区划代码/'

In the downloader middleware middlewares.py, import the setting (os is also needed for the directory handling below):

import os

from .settings import *

Print the corresponding setting value and create the directory inside from_crawler:

    @classmethod
    def from_crawler(cls, crawler):
        # ... keep the generated template body (s = cls(), signal hookup, return s) ...
        print(cls.__name__ + '.from_crawler')
        print(SPIDER_ROOT_DATA_DIR_PATH)

        # create the root data directory if it does not exist yet
        if not os.path.exists(SPIDER_ROOT_DATA_DIR_PATH):
            os.makedirs(SPIDER_ROOT_DATA_DIR_PATH)

        print(os.path.exists(SPIDER_ROOT_DATA_DIR_PATH))
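
A more idiomatic alternative (just a sketch, not what this post does) is to read the value through crawler.settings instead of importing the settings module directly; spider_opened here is the handler from the generated middleware template:

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # read the custom setting via Scrapy's settings object
        data_dir = crawler.settings.get('SPIDER_ROOT_DATA_DIR_PATH')
        os.makedirs(data_dir, exist_ok=True)
        return s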

In the spider file spiders/code.py:

import scrapy

from pyquery import PyQuery as pq
from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    
    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        
        print(u, ua, t)
        print(SPIDER_ROOT_DATA_DIR_PATH)
        pass

Run the spider.
Terminal screenshot 003

Add the directory-creation code and run again.
Terminal screenshot 004

Continue modifying the spider file:

import os.path

import scrapy

from pyquery import PyQuery as pq
from scrapy import Request

from ..settings import *

class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    year = ''
    dataDirPath = ''
    
    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        
        # print(u, ua, t)
        # print(SPIDER_ROOT_DATA_DIR_PATH)
        
        # .center_list_contlist a[href]
        # .center_list_contlist a .cont_tit03
        a = _('.center_list_contlist a').eq(0)  # the first <a> is the latest release
        href = a.attr('href')  # link to that year's data
        year = a.find('.cont_tit03').text()[:-1]  # year text ends with "年"; strip the last character

        self.dataDirPath = SPIDER_ROOT_DATA_DIR_PATH + year + '/'
        
        if not os.path.exists(self.dataDirPath):
            os.makedirs(self.dataDirPath)
            
        if os.path.exists(self.dataDirPath):
            yield Request(url=href, callback=self.parse_item)
        
        pass
    
    def parse_item(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()

        print(u, ua, t)
        print(self.dataDirPath)

Result:
Terminal screenshot 005

After a long round of debugging, the spider code ends up as follows:

import json
import os.path

import scrapy

from pyquery import PyQuery as pq
from scrapy import Request

from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    year = ''
    dataDirPath = ''
    dirSep = '/'
    attrClassItems = ['province', 'city']  # 'county', 'town', 'village'
    uItems = {}
    
    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        
        # print(u, ua, t)
        # print(SPIDER_ROOT_DATA_DIR_PATH)
        
        # .center_list_contlist a[href]
        # .center_list_contlist a .cont_tit03
        a = _('.center_list_contlist a').eq(0)  # the first <a> is the latest release
        href = a.attr('href')  # link to that year's data
        year = a.find('.cont_tit03').text()[:-1]  # year text ends with "年"; strip the last character
        
        self.dataDirPath = SPIDER_ROOT_DATA_DIR_PATH + year + self.dirSep
        
        if not os.path.exists(self.dataDirPath):
            os.makedirs(self.dataDirPath)
        
        if os.path.exists(self.dataDirPath):
            yield Request(url=href, callback=self.parse_item)
        
        pass
    
    def parse_item(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        
        ud = self.dirSep.join(u.split(self.dirSep)[:-1]) + self.dirSep
        
        each = _('table[class$="table"] tr[class$="tr"]')  # match by class suffix tag[class$="..."], e.g. provincetr/citytr rows
        
        if len(each):
            attrClass = each.eq(0).attr('class')
            attrClass = list(filter(None, attrClass.split(' ')))
            attrClass = [_.replace('tr', '') for _ in attrClass]
            
            if len(attrClass):
                attrClass = attrClass[0]
                
                if attrClass in self.attrClassItems:
                    attrClassIndex = self.attrClassItems.index(attrClass)
                    
                    # print('-' * 80)
                    
                    if attrClassIndex == 0:
                        each = each.find('a')
                    
                    items = []
                    
                    for k, _ in enumerate(each):
                        k = str(k).zfill(3)
                        _ = pq(_)
                        href = code = text = ''
                        
                        if attrClassIndex == 0:
                            href = ud + _.attr('href')
                            text = _.text()
                            code = href.split(self.dirSep)[-1][:-5]
                        
                        elif attrClassIndex > 0:
                            td0 = _.find('td').eq(0)
                            td1 = _.find('td').eq(1)
                            a = td0.find('a')
                            hasA = a.text()
                            
                            if hasA:
                                href = ud + a.attr('href')
                            
                            code = td0.text()
                            text = td1.text()
                        
                        # print(k, href, text, code)
                        
                        items.append((href, text, code))
                        
                        if href:
                            self.uItems.setdefault(href, {
                                'k': k,
                                'h': href,
                                't': text,
                                'c': code
                            })
                            
                            yield Request(url=href, callback=self.parse_item)
            
            print(112, '-' * 80)
            
            dItems = []
            
            for k, _ in enumerate(self.uItems):
                _ = self.uItems.get(_)
                # print(_)

                dItems.append(_)

            dItems.sort(key=lambda _: int(_.get('c')))
            
            for k, _ in enumerate(dItems):
                k = str(k).zfill(5)
                
                print(_.get('h'))
                print(k, _.get('k'), _.get('c'), _.get('t'))
                print()
            
            # print(attrClass)
        
        # print(u, ud, t)
        # print(self.dataDirPath)
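
A side note on the URL handling above: absolute links are rebuilt by hand through ud (everything before the last slash of response.url). Scrapy's response.urljoin resolves relative hrefs against the current page URL and would be a less fragile alternative; a minimal sketch of that variant, inside parse_item:

# sketch: resolve a relative href with response.urljoin instead of string surgery
absolute = response.urljoin(a.attr('href'))  # e.g. '11.html' resolved against the page URL
yield Request(url=absolute, callback=self.parse_item)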

Result:
Terminal screenshot 006
Terminal screenshot 007
This is the first part. Handling the deeper levels is the fiddly bit; so far only the first and second levels (the first 4 digits of each code) are covered.
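
The json import at the top of the spider is not used yet. A hypothetical sketch of how the collected self.uItems could be persisted when the spider finishes (my assumption about a possible next step, added as a method of CodeSpider, not something this part actually does):

    def closed(self, reason):
        # hypothetical: dump everything collected in self.uItems into one JSON file
        out = os.path.join(self.dataDirPath, 'items.json')
        with open(out, 'w', encoding='utf-8') as f:
            json.dump(list(self.uItems.values()), f, ensure_ascii=False, indent=2)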
