Continued: the spider code
import json
import os.path
import time

import scrapy
from pyquery import PyQuery as pq
from scrapy import Request

from ..settings import *


class CodeSpider(scrapy.Spider):
    name = 'code'
    allowed_domains = ['www.stats.gov.cn']
    # start_urls = ['http://www.stats.gov.cn/']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm']
    year = ''
    dataDirPath = ''
    dirSep = '/'
    # the five administrative levels; the site's row classes are
    # 'provincetr', 'citytr', 'countytr', 'towntr', 'villagetr'
    attrClassItems = ['province', 'city', 'county', 'town', 'village']
    uItems = {}   # code -> item dict, keyed by 12-digit division code
    cItems = {}   # per-level row counters, e.g. {'province': 31, ...}
    tIimes = []   # distinct whole-second timestamps, a crude progress heartbeat
    rItems = {}   # href -> True once its response was parsed (False = requested only)

    def errback(self, failure):
        print('errback', failure)

    def parse(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        # print(u, ua, t)
        # print(SPIDER_ROOT_DATA_DIR_PATH)
        # .center_list_contlist a[href]
        # .center_list_contlist a .cont_tit03
        a = _('.center_list_contlist a').eq(0)  # the first <a> tag is the latest release
        href = a.attr('href')  # link to that year's index page
        year = a.find('.cont_tit03').text()[:-1]  # the year text ends with "年", strip it
        # self.dataDirPath = SPIDER_ROOT_DATA_DIR_PATH + year + self.dirSep
        # if not os.path.exists(self.dataDirPath):
        #     os.makedirs(self.dataDirPath)
        # if os.path.exists(self.dataDirPath):
        yield Request(url=href, callback=self.parse_item)

    def parse_item(self, response, **kwargs):
        u = response.url
        ua = response.request.headers['User-Agent']
        _ = pq(response.text)
        t = _('title').text()
        s = u.split(self.dirSep)
        ud = self.dirSep.join(s[:-1]) + self.dirSep  # directory part of the URL
        # parent code from the page's file name, e.g. '.../1101.html' -> '110100000000'
        pc = str(s[-1][:-5]).ljust(12, '0')
        # match rows by class suffix: tag[class$="*"] hits 'provincetr' ... 'villagetr'
        each = _('table[class$="table"] tr[class$="tr"]')
        if u in self.rItems:
            self.rItems[u] = True  # mark this page as actually parsed
        if len(each):
            attrClass = each.eq(0).attr('class')
            attrClass = list(filter(None, attrClass.split(' ')))
            attrClass = [_.replace('tr', '') for _ in attrClass]  # 'citytr' -> 'city'
            if len(attrClass):
                attrClass = attrClass[0]
                if attrClass in self.attrClassItems:
                    attrClassIndex = self.attrClassItems.index(attrClass)
                    if attrClass not in self.cItems:
                        self.cItems.setdefault(attrClass, 0)
                    # print('-' * 80)
                    if attrClassIndex == 0:
                        # the province page lists plain links, not code/name cells
                        each = each.find('a')
                    for k, _ in enumerate(each):
                        k = str(k).zfill(3)
                        _ = pq(_)
                        href = code = text = ''
                        self.cItems[attrClass] += 1
                        if attrClassIndex == 0:
                            pc = ''.ljust(12, '0')  # provinces get an all-zero parent code
                            href = ud + _.attr('href')
                            text = _.text()
                            code = href.split(self.dirSep)[-1][:-5].ljust(12, '0')
                        elif attrClassIndex > 0:
                            td0 = _.find('td').eq(0)
                            td1 = _.find('td').eq(1)
                            a = td0.find('a')
                            hasA = a.text()
                            if hasA:  # leaf rows (villages) carry no link
                                href = ud + a.attr('href')
                            code = td0.text()
                            if attrClassIndex == 4:
                                # village rows are: code / urban-rural code / name
                                text = _.find('td').eq(2).text()
                            else:
                                text = td1.text()
                        # print(k, href, text, code)
                        self.uItems.setdefault(code, {
                            'ic': self.cItems[attrClass],  # per-level counter
                            'pc': pc,    # parent code
                            'k': k,      # row index on the page
                            'h': href,   # child page URL ('' for leaf rows)
                            't': text,   # division name
                            'c': code    # 12-digit division code
                        })
                        if href:
                            self.rItems.setdefault(href, False)
                            # print(code, text, attrClass, href)
                            try:
                                yield Request(url=href, callback=self.parse_item, errback=self.errback)
                            except (Exception, BaseException) as e:
                                print(e)
        int_time = int(time.time())
        if int_time not in self.tIimes:
            self.tIimes.append(int_time)
            print('%d' % int_time)
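Two small tricks carry the whole spider: the class$= suffix selector lets one callback recognize rows at any of the five levels, and the page's file name, zero-padded to 12 digits, doubles as the division code. A minimal standalone check of both, on made-up HTML shaped like the real pages:

from pyquery import PyQuery as pq

# Hypothetical sample HTML mirroring the row structure of a
# stats.gov.cn city-level page (real pages use the same classes).
html = '''
<table class="citytable">
  <tr class="citytr">
    <td><a href="1101.html">110100000000</a></td>
    <td><a href="1101.html">市辖区</a></td>
  </tr>
</table>
'''

doc = pq(html)
rows = doc('table[class$="table"] tr[class$="tr"]')  # suffix match on the class
level = rows.eq(0).attr('class').replace('tr', '')   # 'citytr' -> 'city'
code = '1101.html'[:-5].ljust(12, '0')               # '1101' -> '110100000000'
print(level, code)                                   # -> city 110100000000

This is why parse_item never needs to know in advance which level a page belongs to.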
Storing the results: pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class StatsPipeline:
    def process_item(self, item, spider):
        print('process_item')
        return item

    def close_spider(self, spider):
        print('close_spider')
        dItems = []
        cItems = {}  # code -> name, used to deduplicate
        for k, _ in enumerate(spider.uItems):
            _ = spider.uItems.get(_)
            # print(_)
            code = _.get('c')
            if code not in cItems:
                cItems.setdefault(code, _.get('t'))
                dItems.append(_)
        dItems.sort(key=lambda _: int(_.get('c')))  # order by division code
        sumCi = sum([spider.cItems.get(_) for _ in spider.cItems])  # total rows seen
        print(33, '-' * 80)
        with open('data.txt', 'w', encoding='utf-8') as w:
            for k, _ in enumerate(dItems):
                k = str(k).zfill(5)
                # a pc of all zeros marks a top-level (province) entry
                w.write(','.join([
                    k, _.get('pc'), _.get('c'), _.get('t'), _.get('h')
                ]) + '\n')
                print(k, _.get('pc'), _.get('c'), len(_.get('h')), _.get('t'))
        # dump the request log: href -> whether its response was ever parsed
        with open('log.txt', 'w', encoding='utf-8') as w:
            for k, _ in enumerate(spider.rItems):
                k = str(k).zfill(5)
                print(k, _, spider.rItems.get(_))
                w.write(','.join([
                    k,
                    _,
                    str(spider.rItems.get(_))
                ]) + '\n')
        """
        print(k, str(_.get('ic')).zfill(5), _.get('pc'), _.get('k'), _.get('c'),
              cItems.get(_.get('pc')), _.get('t'))
        """
        print(sumCi, spider.cItems, len(spider.tIimes))
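data.txt comes out flat, one index,pc,code,name,href line per division, with an all-zero pc marking the provinces. A hedged sketch of reading it back and regrouping children under their parent codes (it assumes names contain no commas, which holds for these pages):

from collections import defaultdict

children = defaultdict(list)  # parent code -> list of (code, name)
TOP = '0' * 12                # an all-zero pc marks a top-level (province) row

with open('data.txt', encoding='utf-8') as f:
    for line in f:
        # columns written by StatsPipeline: index, pc, code, name, href
        k, pc, code, name, href = line.rstrip('\n').split(',', 4)
        children[pc].append((code, name))

# print each province with the count of its direct children
for code, name in children[TOP]:
    print(code, name, len(children[code]))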
I found that some pages were never crawled, and I don't yet know why; I'll deal with it later.
This attempt feels like a failure, but I'll keep digging into this approach!
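If the missing pages turn out to be the server throttling or dropping requests (a guess; hrefs still marked False in log.txt would point that way), Scrapy's stock retry and throttling settings are the obvious first thing to try. A sketch for settings.py, with numbers that would need tuning:

# settings.py -- a sketch; these are standard Scrapy settings, not project-specific ones.
RETRY_ENABLED = True
RETRY_TIMES = 5                      # retry failed requests a few more times
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

DOWNLOAD_DELAY = 0.5                 # be gentler with the server
CONCURRENT_REQUESTS_PER_DOMAIN = 4

AUTOTHROTTLE_ENABLED = True          # back off automatically under load
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10

DOWNLOAD_TIMEOUT = 30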