After a full day of running, plus part of last night, the results are finally out.
Code
# CY3761 | 2021-11-19 17:55
# Administrative division codes - 5 levels, 12 digits - single-threaded, object-oriented version
# Site navigation: 统计数据 -> 统计标准 -> 统计用区划和城乡划分代码
# Initial crawl URL: [latest data links](http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm)
## The site uses 5 page layouts
# md table: the row under the header needs at least 3 dashes per cell
# md table: in the row under the header, colons set the column alignment
"""
No. | Code | Count | class prefix | Page
---:|:---:|:---:|:---:|:---
1 | 11 00 00 000 000 | 00000 | province | province page
2 | 11 01 00 000 000 | 00000 | city | city page
3 | 11 01 01 000 000 | 00000 | county | county page
4 | 11 01 01 001 000 | 00000 | town | town page
5 | 11 01 01 001 001 | 00000 | village | village page
"""
import os.path
import time

from fake_useragent import UserAgent
from pyquery import PyQuery as pq
from requests import get


class CodeSpider:
    @staticmethod
    def pathIsExists(path):
        return os.path.exists(path)

    def makedirs(self, dirPath):
        if not self.pathIsExists(dirPath):
            os.makedirs(dirPath)
    fileDataEncoding = 'utf-8'  # encoding for all files written/read by the spider
    htmlPos = '.html'           # extension of cached pages
    dataPos = '.txt'            # extension of parsed data files
    fileSep = ','               # field separator within a data line
    joinStr = '\n'              # line separator when writing files
    def setFileData(self, data, filePath):
        # Write only if the file is not there yet, so cached results survive restarts.
        if not self.pathIsExists(filePath):
            with open(filePath, 'w', encoding=self.fileDataEncoding) as w:
                if isinstance(data, list):
                    # A list of tuples/lists becomes one comma-separated line per item.
                    for k, _ in enumerate(data):
                        if isinstance(_, (tuple, list)):
                            data[k] = self.fileSep.join(_)
                    data = self.joinStr.join(map(str, data))
                if isinstance(data, str) and len(data):
                    w.write(data + self.joinStr)
    def getFileData(self, filePath):
        if self.pathIsExists(filePath):
            with open(filePath, 'r', encoding=self.fileDataEncoding) as r:
                fileData = ''
                if filePath.endswith(self.htmlPos):
                    fileData = r.read()
                elif filePath.endswith(self.dataPos):
                    # Data files come back as a list of field lists.
                    fileData = r.readlines()
                    fileData = filter(None, fileData)
                    fileData = [_.replace(self.joinStr, '').split(self.fileSep) for _ in fileData]
                # print('reading data %s' % filePath)
                return fileData
    requestsEncoding = 'utf-8'
    requestsItems = []    # every URL seen so far; its length drives the throttling below
    requestsSleep = 4     # extra pause (seconds) taken once every requestsPart requests
    requestsPart = 128    # also reused as the per-request delay: 128 * 1e-3 = 0.128 s
    def getRequests(self, url):
        headers = {
            'User-Agent': UserAgent().random  # rotate the UA on every request
        }
        requestsItemsLen = len(self.requestsItems)
        time.sleep(self.requestsPart * 1e-3)
        # print(self.requestsPart * 1e-3)
        if requestsItemsLen % self.requestsPart == 0:
            time.sleep(self.requestsSleep)
        resp = get(url, headers=headers)
        resp.encoding = self.requestsEncoding
        text = resp.text
        if resp.status_code == 200 and text:
            secs = time.time()
            secs_frac = (secs - int(secs)) * 8e3
            print('response %s %s %8d' % (url, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(secs)), secs_frac))
            return text
        raise Exception('095 %s %s' % (resp.status_code, url))
    def getRequestsData(self, url, filePath):
        # Cache-first fetch: hit the network only when the file is missing.
        if not self.pathIsExists(filePath):
            text = self.getRequests(url)
            if text:
                self.setFileData(text, filePath)
        if self.pathIsExists(filePath):
            return self.getFileData(filePath)
        raise Exception('105 %s' % filePath)
    dirSep = '/'

    def getUrlPre(self, url):
        # URL prefix up to and including the last '/'
        return url[0: url.rindex(self.dirSep) + 1]

    eachClassItems = ['province', 'city', 'county', 'town', 'village']
    parseItems = {}     # items parsed per level
    continueItem = {}   # leaf rows per level (no link to descend into)
    def parsePage(self, url, pageCode, pageText, level=1):
        htmlDirPath = self.htmlDirPath + str(level) + '/'
        self.makedirs(htmlDirPath)
        dataDirPath = self.dataDirPath + str(level) + '/'
        self.makedirs(dataDirPath)
        self.requestsEncoding = 'gbk'  # the division pages are GBK-encoded
        key = level - 1
        if len(self.eachClassItems) <= key:
            return
        key = self.eachClassItems[key]
        if key not in self.parseItems:
            self.parseItems.setdefault(key, 0)
            self.continueItem.setdefault(key, 0)
        htmlFilePath = htmlDirPath + pageCode + self.htmlPos
        dataFilePath = dataDirPath + pageCode + self.dataPos
        if url not in self.requestsItems:
            self.requestsItems.append(url)
        htmlFileIsExists = self.pathIsExists(htmlFilePath)
        dataFileIsExists = self.pathIsExists(dataFilePath)
        print(str(len(self.requestsItems)).zfill(5), pageCode, pageText, htmlFileIsExists, dataFileIsExists)
        if not htmlFileIsExists or not dataFileIsExists:
            text = self.getRequestsData(url, htmlFilePath)
            _ = pq(text)
            t = _('title').text()  # page title (unused below)
            up = self.getUrlPre(url)
            each = _('table[class$="table"] tr[class$="tr"]')  # falsy when absent
            if each:
                eachClass = each.eq(0).attr('class').split(' ')[0].replace('tr', '')
                if eachClass not in self.eachClassItems:
                    print(154, level, key, eachClass, pageText)
                    # print(htmlFilePath, dataFilePath, sep=self.joinStr)
                    return
                eachClassIndex = self.eachClassItems.index(eachClass)
                if eachClassIndex == 0:
                    # Province rows pack several links per <tr>; iterate the links instead.
                    each = each.find('a')
                items = []
                for _ in each:
                    _ = pq(_)
                    h = t = c = ''
                    if eachClassIndex == 0:
                        h = up + _.attr('href')
                        t = _.text()
                        # '11.html' -> '11' + 10 zeros = the 12-digit province code
                        c = h.split(self.dirSep)[-1][:-5] + ''.zfill(10)
                    elif eachClassIndex > 0:
                        td0 = _.find('td').eq(0)
                        td1 = _.find('td').eq(1)
                        a = td0.find('a')
                        if a.text():
                            h = up + a.attr('href')
                        if eachClass == 'village':
                            # Village rows carry an extra urban-rural column; the name is in td 2.
                            t = _.find('td').eq(2).text()
                        else:
                            t = td1.text()
                        c = td0.text()
                    items.append((h, t, pageCode, c))
                # print(195, dataFilePath)
                if items:
                    # print(193, dataFilePath)
                    self.setFileData(items, dataFilePath)
        # Re-check on disk: the data file may have just been written above.
        if self.pathIsExists(dataFilePath):
            items = self.getFileData(dataFilePath)
            # print(self.getFileData(dataFilePath))
            self.parseItems[key] += len(items)
            for _ in items:
                h = _[0]
                t = _[1]
                c = _[3]
                # print(_, bool(h))
                if h:
                    self.parsePage(h, c, t, level + 1)
                else:
                    # No link: a leaf row, so do not descend further.
                    self.continueItem[key] += 1
    # .........................................................................
    rootDirPath = '行政区划代码/'
    htmlDirPath = 'html/'
    dataDirPath = 'data/'
    initUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm'
    def main(self):
        self.makedirs(self.rootDirPath)
        # Fetch the index page and pick the newest dataset link.
        text = self.getRequestsData(self.initUrl, self.rootDirPath + 'index' + self.htmlPos)
        _ = pq(text)
        _ = _('.center_list_contlist a').eq(0)
        href = _.attr('href')
        text = _.find('.cont_tit03').text()[:-1]
        self.htmlDirPath = self.rootDirPath + text + '/' + self.htmlDirPath
        self.makedirs(self.htmlDirPath)
        self.dataDirPath = self.rootDirPath + text + '/' + self.dataDirPath
        self.makedirs(self.dataDirPath)
        # Crawl the data proper, starting from the 12-zero root code.
        self.parsePage(href, ''.zfill(12), text)
        print(self.parseItems)
        print(self.continueItem)
        print(len(self.requestsItems))


if __name__ == '__main__':
    i = 0
    while True:
        time.sleep(16)
        print('-' * 80)
        print(i)
        print('-' * 80)
        i += 1
        try:
            CodeSpider().main()
        except Exception as e:
            print('except %s' % e)
        else:
            # Failures are usually request timeouts; a run that finishes
            # without an error means the whole crawl is complete.
            break
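If you want to poke at the output afterwards: each level's data files are plain comma-separated text, so `getFileData` can read them back directly. A minimal sketch (the year folder below is hypothetical; use whatever directory the crawl actually created):

spider = CodeSpider()
# Level-1 file: the provinces listed under the all-zero root code (path is hypothetical).
rows = spider.getFileData('行政区划代码/2021/data/1/000000000000.txt')
for href, name, parentCode, code in rows:
    print(code, name)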
This crawl takes some patience and time: later on, the sheer number of requests often gets the run aborted and restarted. Fortunately the caching mechanism means a restart never refetches a page that is already on disk.
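That cache-first idea is what makes restarting cheap. A minimal standalone sketch of the pattern (the names here are hypothetical, for illustration only, not the spider's actual API):

import os

def fetch_cached(url, cache_path, fetch):
    # Serve from disk when a previous run already saved this URL.
    if os.path.exists(cache_path):
        with open(cache_path, encoding='utf-8') as f:
            return f.read()
    # Otherwise fetch once and persist, so a crash costs at most one page.
    text = fetch(url)
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return text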
I've wanted this data for a long time. People sell it online for outrageous prices, and now I've built it myself.
Very happy about that, though I won't be publishing the data. Haha.