Preface
Finals are keeping me busy, so I am simply posting a resource that may be useful: all the Excel tables of the China Financial Yearbook (《中国金融年鉴》, 1986–2019) and the China Statistical Yearbook (《中国统计年鉴》, 1981–2020). The data were crawled from CNKI (the exact source is mentioned in the main text below). So far nobody online seems to offer the complete yearbook data from 1986 to the present; what is available usually covers only particular years, and is paywalled.
Link: https://pan.baidu.com/s/13fjrInmjjxaNQRgS_Jv91w
Extraction code: k5ir
If all you need is the data, grab it from the link above; the rambling in the postscript can safely be skipped.
1 Crawling the Excel tables of the China Statistical Yearbook and China Financial Yearbook
- The fish has already been handed out above; what follows is the fishing. A word of warning: the fishing is not easy to learn, and I would suggest crawling the site once yourself to see where the pitfalls are.
- That said, Part 2 below briefly explains the details of the crawler.
I am in the middle of the hardest finals ever, so here is the crawler script for all the Excel tables of the China Statistical Yearbook and China Financial Yearbook across the years:
# -*- coding: utf-8 -*-
# @author: caoyang
# @email: caoyang@163.sufe.edu.cn
import os
import re
import time
import requests
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
def get_cookie(url):
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
driver.get(url)
cookies = driver.get_cookies()
driver.quit()
def _cookie_to_string(cookies):
string = ''
for cookie in cookies:
string += '{}={}; '.format(cookie['name'], cookie['value'])
return string.strip()
return _cookie_to_string(cookies)
def download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020', save_root='csyb', is_initial=True, ignore_caj=True):
with open('system_csyb.log', 'w') as f:
pass
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
excel_url = 'https://data.cnki.net/{}'.format
caj_url = 'https://data.cnki.net/download/GetCajUrl'
regex = r'<[^>]+>'
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
compiler = re.compile(regex, re.S)
regular_interval = 15
reset_interval = 300
if not os.path.exists(save_root):
os.mkdir(save_root)
# year = ybcode[1:5]
target_path = os.path.join(save_root, year)
if not os.path.exists(target_path):
os.mkdir(target_path)
with open(os.path.join(target_path, 'log.txt'), 'w') as f:
pass
formdata = {
'ybcode': ybcode,
'entrycode': '',
'page': '1',
'pagerow': '20'
}
response = requests.post(query_url, data=formdata, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'lxml')
span = soup.find('span', class_='s_p_listl')
for link in span.find_all('a'):
onclick = link.attrs.get('onclick')
if onclick is not None:
lindex = onclick.find('\'')
rindex = onclick.find('\'', lindex + 1)
n_pages = int(onclick[lindex + 1:rindex])
break
with open('system_csyb.log', 'a') as f:
f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
print('正在处理{}年...'.format(year))
with open('system_csyb.log', 'a') as f:
f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
print('共计{}页'.format(n_pages))
for page in range(1, n_pages + 1):
with open('system_csyb.log', 'a') as f:
f.write(' - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' - 第{}页..'.format(page))
if page != 1:  # the formdata for page 1 was already built above
formdata = {
'ybcode': ybcode,
'entrycode': '',
'page': str(page),
'pagerow': '20'
}
while True:
try:
response = requests.post(query_url, data=formdata, headers=headers)
break
except:
with open('system_csyb.log', 'a') as f:
f.write(' 页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 页面访问失败...')
time.sleep(reset_interval)
html = response.text
soup = BeautifulSoup(html, 'lxml')
table = soup.find('table')
for tr in table.find_all('tr'):
tds = tr.find_all('td')
assert len(tds) == 3
title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
for _link in tds[2].find_all('a'):
href = _link.attrs['href']
if href.startswith('/download/excel'): # excel
filecode = href[href.find('=')+1:]
while True:
_headers = headers.copy()
_headers['Cookie'] = cookies
try:
with open('system_csyb.log', 'a') as f:
f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' + 下载{}...'.format(title))
response = requests.get(excel_url(href), headers=_headers)
print(' ' + str(response.status_code))
try:
html = response.text
soup = BeautifulSoup(html, 'lxml')
if str(soup.find('title').string)=='中国经济社会大数据研究平台':
with open('system_csyb.log', 'a') as f:
f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 重置cookie...')
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
else:
break
except:
break
except:
with open('system_csyb.log', 'a') as f:
f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 失败...')
time.sleep(reset_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
time.sleep(regular_interval)
with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
f.write(response.content)
with open(os.path.join(target_path, 'log.txt'), 'a') as f:
f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
else: # caj
if ignore_caj:
continue
filecode = _link.attrs['fn']
pagerange = _link.attrs['pg']
disk = _link.attrs['disk']
_formdata = {
'filecode': filecode,
'pagerange': pagerange,
'disk': disk,
}
while True:
_headers = headers.copy()
_headers['Cookie'] = cookies
try:
with open('system_csyb.log', 'a') as f:
f.write(' + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' + 下载{}的资源链接...'.format(title))
response = requests.post(caj_url, headers=_headers, data=_formdata)
break
except:
with open('system_csyb.log', 'a') as f:
f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 失败...')
time.sleep(reset_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
resource_url = response.json()['url']
while True:
try:
with open('system_csyb.log', 'a') as f:
f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' + 下载{}...'.format(title))
response = requests.get(resource_url, headers=headers)
if str(response.status_code) == '200':
break
else:
with open('system_csyb.log', 'a') as f:
f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 重置cookie...')
time.sleep(reset_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
except:
with open('system_csyb.log', 'a') as f:
f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 失败...')
time.sleep(regular_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
time.sleep(regular_interval)
with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
f.write(response.content)
with open(os.path.join(target_path, 'log.txt'), 'a') as f:
f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))
# Find urls of year
if is_initial:
url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
response = requests.get(url, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', class_='s_year clearfix')
links = []
ybcodes = []
for link in div.find_all('a'):
class_ = link.attrs.get('class')
if class_ is None: # not current
href = link.attrs.get('href')
ybcode = href.split('/')[-1].split('?')[0]
links.append(href)
ybcodes.append(ybcode)
with open('ybcode_csyb.txt', 'w') as f:
for ybcode in ybcodes:
f.write(f'{ybcode}\n')
# for ybcode in ybcodes:
# download_chinese_statistical_yearbook(ybcode=ybcode, is_initial=False)
def download_chinese_financial_yearbook(ybcode='N2020070552', year='2019', save_root='cfyb', is_initial=True, ignore_caj=True):
with open('system_cfyb.log', 'w') as f:
pass
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
excel_url = 'https://data.cnki.net/{}'.format
caj_url = 'https://data.cnki.net/download/GetCajUrl'
regex = r'<[^>]+>'
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
compiler = re.compile(regex, re.S)
regular_interval = 15
reset_interval = 300
if not os.path.exists(save_root):
os.mkdir(save_root)
# year = ybcode[1:5]
target_path = os.path.join(save_root, year)
if not os.path.exists(target_path):
os.mkdir(target_path)
with open(os.path.join(target_path, 'log.txt'), 'w') as f:
pass
formdata = {
'ybcode': ybcode,
'entrycode': '',
'page': '1',
'pagerow': '20'
}
response = requests.post(query_url, data=formdata, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'lxml')
span = soup.find('span', class_='s_p_listl')
for link in span.find_all('a'):
onclick = link.attrs.get('onclick')
if onclick is not None:
lindex = onclick.find('\'')
rindex = onclick.find('\'', lindex + 1)
n_pages = int(onclick[lindex + 1:rindex])
break
with open('system_cfyb.log', 'a') as f:
f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
print('正在处理{}年...'.format(year))
with open('system_cfyb.log', 'a') as f:
f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
print('共计{}页'.format(n_pages))
for page in range(1, n_pages + 1):
with open('system_cfyb.log', 'a') as f:
f.write(' - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' - 第{}页..'.format(page))
if page != 1:  # the formdata for page 1 was already built above
formdata = {
'ybcode': ybcode,
'entrycode': '',
'page': str(page),
'pagerow': '20'
}
while True:
try:
response = requests.post(query_url, data=formdata, headers=headers)
break
except:
with open('system_cfyb.log', 'a') as f:
f.write(' 页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 页面访问失败...')
time.sleep(reset_interval)
html = response.text
soup = BeautifulSoup(html, 'lxml')
table = soup.find('table')
for tr in table.find_all('tr'):
tds = tr.find_all('td')
assert len(tds) == 3
title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
for _link in tds[2].find_all('a'):
href = _link.attrs['href']
if href.startswith('/download/excel'): # excel
filecode = href[href.find('=')+1:]
while True:
_headers = headers.copy()
_headers['Cookie'] = cookies
try:
with open('system_cfyb.log', 'a') as f:
f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' + 下载{}...'.format(title))
response = requests.get(excel_url(href), headers=_headers)
print(' ' + str(response.status_code))
try:
html = response.text
soup = BeautifulSoup(html, 'lxml')
if str(soup.find('title').string)=='中国经济社会大数据研究平台':
with open('system_cfyb.log', 'a') as f:
f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 重置cookie...')
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
else:
break
except:
break
except:
with open('system_cfyb.log', 'a') as f:
f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 失败...')
time.sleep(reset_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
time.sleep(regular_interval)
with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
f.write(response.content)
with open(os.path.join(target_path, 'log.txt'), 'a') as f:
f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
else: # caj
if ignore_caj:
continue
filecode = _link.attrs['fn']
pagerange = _link.attrs['pg']
disk = _link.attrs['disk']
_formdata = {
'filecode': filecode,
'pagerange': pagerange,
'disk': disk,
}
while True:
_headers = headers.copy()
_headers['Cookie'] = cookies
try:
with open('system_cfyb.log', 'a') as f:
f.write(' + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' + 下载{}的资源链接...'.format(title))
response = requests.post(caj_url, headers=_headers, data=_formdata)
break
except:
with open('system_cfyb.log', 'a') as f:
f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 失败...')
time.sleep(reset_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
resource_url = response.json()['url']
while True:
try:
with open('system_cfyb.log', 'a') as f:
f.write(' + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
print(' + 下载{}...'.format(title))
response = requests.get(resource_url, headers=headers)
if str(response.status_code) == '200':
break
else:
with open('system_cfyb.log', 'a') as f:
f.write(' 重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 重置cookie...')
time.sleep(reset_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
except:
with open('system_cfyb.log', 'a') as f:
f.write(' 失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
print(' 失败...')
time.sleep(regular_interval)
cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
time.sleep(regular_interval)
with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
f.write(response.content)
with open(os.path.join(target_path, 'log.txt'), 'a') as f:
f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))
# Find urls of year
if is_initial:
url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
response = requests.get(url, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'lxml')
div = soup.find('div', class_='s_year clearfix')
links = []
ybcodes = []
for link in div.find_all('a'):
class_ = link.attrs.get('class')
if class_ is None: # not current
href = link.attrs.get('href')
ybcode = href.split('/')[-1].split('?')[0]
links.append(href)
ybcodes.append(ybcode)
with open('ybcode_cfyb.txt', 'w') as f:
for ybcode in ybcodes:
f.write(f'{ybcode}\n')
for ybcode in ybcodes:
download_chinese_financial_yearbook(ybcode=ybcode, is_initial=False)
if __name__ == '__main__':
'''
with open('ybcode_csyb.txt', 'r') as f:
lines = f.read().splitlines()
for line in lines:
ybcode, year = line.split()
# if int(year) > 1999:
# continue
download_chinese_statistical_yearbook(ybcode=ybcode, year=year, is_initial=False)
'''
with open('ybcode_cfyb.txt', 'r') as f:
lines = f.read().splitlines()
for line in lines:
ybcode, year = line.split()
#if int(year) > 1994:
# continue
download_chinese_financial_yearbook(ybcode=ybcode, year=year, save_root='cfyb', is_initial=False, ignore_caj=True)
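One practical note on running the script (this is my reading of the code above, not something stated explicitly): the is_initial=True branch writes one ybcode per line into ybcode_csyb.txt / ybcode_cfyb.txt, whereas the __main__ block splits each line into a (ybcode, year) pair, so a year column has to be added to those files before the batch run. A hypothetical helper for that, guessing the year from the code itself along the lines of the commented-out year = ybcode[1:5]:
# Hypothetical helper (not part of the original script): rewrite ybcode_cfyb.txt
# into the "ybcode year" format expected by the __main__ block; ybcode_csyb.txt
# is analogous. The guessed year may need manual correction against the volume year.
with open('ybcode_cfyb.txt') as f:
    ybcodes = [line.strip() for line in f if line.strip()]
with open('ybcode_cfyb.txt', 'w') as f:
    for ybcode in ybcodes:
        f.write('{} {}\n'.format(ybcode, ybcode[1:5]))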
The data come from CNKI; the crawler targets CNKI's yearbook platform at https://data.cnki.net/ (the 中国经济社会大数据研究平台).
Because my campus network has CNKI download privileges, this crawler is of little use to anyone without them. Note that before each crawl the script drives a browser via Selenium to fetch cookies, which are what grant the download permission. CNKI cookies expire quickly, and once a cookie goes stale the "download" simply returns the HTML of the CNKI landing page, so the script needs logic to monitor cookie validity and call Selenium again to refresh the cookie whenever it has expired. Also, crawl as slowly as you can: the code sleeps 15 seconds between two downloads. Taking the China Statistical Yearbook as an example, there are 39 volumes as of this writing, with roughly 600 Excel tables per volume, so downloading everything takes about two to three days.
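As a minimal sketch of that validity check (my own illustration, not part of the script above; ensure_valid_cookie, probe_url, and refresh are hypothetical names): probe any Excel download link, and if the response is the platform's landing page, whose title is 中国经济社会大数据研究平台, ask for a fresh cookie:
import requests
from bs4 import BeautifulSoup

# Sketch only: `refresh` is any zero-argument callable returning a fresh cookie string,
# e.g. lambda: get_cookie('https://data.cnki.net/trade/Yearbook/Single/N2020100004?z=Z016').
def ensure_valid_cookie(cookies, probe_url, headers, refresh):
    headers_with_cookie = dict(headers, Cookie=cookies)
    response = requests.get(probe_url, headers=headers_with_cookie)
    title = BeautifulSoup(response.text, 'lxml').find('title')
    # A stale cookie makes the server return its landing page instead of the file.
    if title is not None and str(title.string) == '中国经济社会大数据研究平台':
        return refresh()
    return cookies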
2 Some notes on crawler details, and how to process the downloaded Excel tables with a script
Given limited time I will not go into the overall design of the crawler; instead I mainly record a few details (pitfalls) here:
- ignore_caj=True in the code skips downloading the caj links. Downloading a caj is in fact somewhat more involved: a POST request has to be made first to obtain the resource URL, whereas the Excel download links are written directly in the page source. There are two reasons for skipping the caj files. One is that they are not very useful anyway, since all the useful data are in the Excel tables. The more important reason is that the POST request for the caj resource URL easily invalidates the cookie and forces frequent cookie switches, which costs too much time; the Excel download links, by contrast, are stable, and a cookie normally lasts a whole day without expiring.
- The crawler itself only uses requests; the Selenium browser driver is used solely to refresh the cookie, so resource consumption stays fairly low.
- Note that the page source behind the link above contains an <iframe> tag, so the download links cannot be found directly in the top-level page source; you have to inspect the network traffic to obtain the URL of the content loaded inside the <iframe>.
- One remaining issue is that all Excel tables obtained from this source are protected. In practice this only means they cannot be edited; Office or WPS can still open and read them normally (in read-only mode), and removing the protection would require entering a password under the Review menu. This does not really hinder use, since there is no need to modify these tables, but both pandas.read_excel and xlrd.open_workbook raise an error on them:
xlrd.biffh.XLRDError: Workbook is encrypted
I have not yet tested how openpyxl.load_workbook handles the protected files, because openpyxl can only read .xlsx files anyway. The yearbook Excel tables are far too numerous to copy data by hand whenever you want to collect one kind of data in bulk, so a script is the only sensible option. The workable solution I finally found is the DispatchEx method of the win32com.client module:
from win32com.client import DispatchEx
excel = DispatchEx('Excel.Application')                # start a dedicated Excel process
demo = excel.Workbooks.Open('N2018070031000595.xls')   # open the protected workbook
sheet = demo.WorkSheets(1)                             # first worksheet
print(sheet.Cells(1, 1).Value)                         # read a single cell
demo.Close(True)                                       # close the workbook when done
In essence this drives the Office application itself to do the reading, so an Excel process appears in Task Manager. Closing the workbook with demo.Close (and quitting Excel once everything is done) is therefore very important; otherwise the machine can easily blow up its memory from too many open Excel instances (a safer wrapper is sketched below). The drawback is that this module provides no ready-made structured-data handling, so values can only be pulled out cell by cell via sheet.Cells, which makes the code rather verbose.
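As a small illustration of that point (my own sketch, not from the original post; read_first_cell is a hypothetical name, and it assumes pywin32 plus a local Office installation): wrapping the open/read/close cycle in try/finally and quitting the Excel application at the end keeps stray EXCEL.EXE processes from piling up:
import os
from win32com.client import DispatchEx

def read_first_cell(path):
    """Sketch: open a protected .xls via Excel COM, read one cell, and always clean up."""
    excel = DispatchEx('Excel.Application')                      # dedicated Excel process
    try:
        workbook = excel.Workbooks.Open(os.path.abspath(path))  # COM prefers absolute paths
        try:
            return workbook.Worksheets(1).Cells(1, 1).Value
        finally:
            workbook.Close(False)                                # close without saving
    finally:
        excel.Quit()                                             # end the EXCEL.EXE process

# print(read_first_cell('N2018070031000595.xls'))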
My main use was to pull the per-province financial and economic tables, extracting provincial loan balances, deposit balances, GDP, price indices, and so on. The script is as follows:
# -*- coding: utf-8 -*-
# @author: caoyang
# @email: caoyang@163.sufe.edu.cn
import re
import os
import sys
import time
import xlrd
import numpy
import pandas
import openpyxl
from win32com.client import DispatchEx
# 全局变量
CURRENT_DIR = os.getcwd()
CFYB_DIR = 'cfyb'
CSYB_DIR = 'csyb'
TEMP_DIR = 'temp'
LOG_FILE = 'log.txt'
PROVINCE = [
'北京', '天津', '上海', '重庆', '河北', '山西',
'辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽',
'福建', '江西', '山东', '河南', '湖北', '湖南',
'广东', '海南', '四川', '贵州', '云南', '陕西',
'甘肃', '青海', '台湾', '内蒙古', '广西', '西藏',
'宁夏', '新疆', '香港', '澳门',
]
INT_COMPILER = re.compile(r'[^\d]') # 整型数正则
FLOAT_COMPILER = re.compile(r'[^\d.]') # 浮点数正则
# 获取分省份的金融经济信息:以贷款余额为主
def get_loan_by_province():
def _get_province_by_title(_title): # 根据数据表名称提取对应省份名
for _province in PROVINCE:
if _province in _title:
return _province
def _format_cell_value(_cell_value, dtype=str): # 标准化单元格的值
if str(_cell_value) == 'None':
return None
if dtype == int:
return INT_COMPILER.sub('', str(_cell_value).replace(' ', ''))
if dtype == float:
return FLOAT_COMPILER.sub('', str(_cell_value).replace(' ', ''))
return str(_cell_value).replace(' ', '')
def _get_dataframe_from_sheet(_sheet):
_data_dict = {
'year': [], # 年份
'gdp': [], # 国内生产总值
'cpi': [], # 价格指数
'deposit': [], # 总存款余额
'individual_deposit': [], # 个人存款余额
'unit_deposit': [], # 单位存款余额
'finance_deposit': [], # 财政存款余额
'loan': [], # 总贷款余额
'short_loan': [], # 短期贷款余额
'long_loan': [], # 长期贷款余额
}
_flags = { # 对应_data_dict中的每一个字段用一个flag记录它是否被找到
'year': True, # 年份
'gdp': True, # 国内生产总值
'cpi': True, # 价格指数
'deposit': True, # 总存款余额
'individual_deposit': True, # 个人存款余额
'unit_deposit': True, # 单位存款余额
'finance_deposit': True, # 财政存款余额
'loan': True, # 总贷款余额
'short_loan': True, # 短期贷款余额
'long_loan': True, # 长期贷款余额
}
_row = 0
_MAX_ROW = 100
while _row < _MAX_ROW: # 遍历每一行
_row += 1 # 行号递进
_cell_value = _format_cell_value(_sheet.Cells(_row, 1)) # 提取每一行第一列的单元格值
if _cell_value is None: # 跳过空值的单元格
continue
if _flags['year'] and '项目' in _cell_value: # year: 【项目】所在行可以提取年份
print('year: ' + _cell_value)
_flags['year'] = False # 已经找到【项目】所在行
_column = 1
while True: # 遍历【项目】所在行的每一列:取得年份
_column += 1 # 列号递进
_year_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=int)
if _year_string is None: # 遇到空单元格值即可退出搜索
break
_data_dict['year'].append(_year_string) # 将当前年份添加到字典中
_num_year = len(_data_dict['year']) # 记录一共有多少年份
continue
if _flags['gdp'] and '生产总值' in _cell_value: # gdp: 国内生产总值
if _flags['year']: # 偶然发现有的表里没有【项目】,用国内生产总值的上一行修正
_flags['year'] = False # 已经找到【项目】所在行
_column = 1
while True: # 遍历【项目】所在行的每一列:取得年份
_column += 1 # 列号递进
_year_string = _format_cell_value(_sheet.Cells(_row - 1, _column), dtype=float)
if _year_string is None: # 遇到空单元格值即可退出搜索
break
_data_dict['year'].append(_year_string) # 将当前年份添加到字典中
_num_year = len(_data_dict['year']) # 记录一共有多少年份
print('gdp: ' + _cell_value)
_flags['gdp'] = False # 已经找到【国内生产总值】所在行
for _column in range(2, 2 + _num_year):
_gdp_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['gdp'].append(_gdp_string)
continue
if _flags['cpi'] and '价格指数' in _cell_value: # cpi: 消费者价格指数
print('cpi: ' + _cell_value)
_flags['cpi'] = False # 已经找到【消费者价格指数】所在行
for _column in range(2, 2 + _num_year):
_cpi_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['cpi'].append(_cpi_string)
continue
if _flags['deposit'] and ('存款' in _cell_value and \
'银行' in _cell_value): # deposit: 总存款余额
print('deposit: ' + _cell_value)
_flags['deposit'] = False # 已经找到【总存款余额】所在行
for _column in range(2, 2 + _num_year):
_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['deposit'].append(_deposit_string)
continue
if _flags['individual_deposit'] and ('存款' in _cell_value and \
('城乡' in _cell_value or '储蓄' in _cell_value)): # individual_deposit:个人存款余额
print('individual deposit: ' + _cell_value)
_flags['individual_deposit'] = False # 已经找到【个人存款余额】所在行
for _column in range(2, 2 + _num_year):
_individual_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['individual_deposit'].append(_individual_deposit_string)
continue
if _flags['unit_deposit'] and ('存款' in _cell_value and \
('企' in _cell_value or '单位' in _cell_value)): # unit_deposit: 单位存款余额
print('unit deposit: ' + _cell_value)
_flags['unit_deposit'] = False # 已经找到【单位存款余额】所在行
for _column in range(2, 2 + _num_year):
_unit_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['unit_deposit'].append(_unit_deposit_string)
continue
if _flags['finance_deposit'] and ('存款' in _cell_value and
('财政' in _cell_value)): # finance_deposit: 财政存款余额
print('finance deposit: ' + _cell_value)
_flags['finance_deposit'] = False # 已经找到【财政存款余额】所在行
for _column in range(2, 2 + _num_year):
_finance_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['finance_deposit'].append(_finance_deposit_string)
continue
if _flags['loan'] and ('贷款' in _cell_value and \
'银行' in _cell_value): # loan: 总贷款余额
print('loan: ' + _cell_value)
_flags['loan'] = False # 已经找到【总贷款余额】所在行
for _column in range(2, 2 + _num_year):
_loan_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['loan'].append(_loan_string)
continue
if _flags['short_loan'] and ('贷款' in _cell_value and \
('短期' in _cell_value or '流动' in _cell_value)):# short_loan: 短期贷款余额
print('short loan: ' + _cell_value)
_flags['short_loan'] = False # 已经找到【短期贷款余额】所在行
for _column in range(2, 2 + _num_year):
_short_loan_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['short_loan'].append(_short_loan_string)
continue
if _flags['long_loan'] and ('贷款' in _cell_value and \
('长期' in _cell_value or '固定' in _cell_value)):# long_loan: 长期贷款余额
print('long loan: ' + _cell_value)
_flags['long_loan'] = False # 已经找到【长期贷款余额】所在行
for _column in range(2, 2 + _num_year):
_long_loan_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
_data_dict['long_loan'].append(_long_loan_string)
continue
for key in _flags: # 没有找到数据的字段置空
if _flags[key]:
for _ in range(_num_year):
_data_dict[key].append(None)
_dataframe = pandas.DataFrame(_data_dict, columns=list(_data_dict.keys()))
print(_flags)
# print(_data_dict)
# print(_dataframe)
return _dataframe
application = DispatchEx('Excel.Application') # 启动Excel程序
for year in os.listdir(os.path.join(CURRENT_DIR, CFYB_DIR)): # 遍历中国金融年鉴下所有年份
print(f'======={year}=======')
# if year < 2017:
# continue
log_df = pandas.read_csv(os.path.join(CFYB_DIR, year, LOG_FILE), header=None, sep='\t', encoding='gbk') # 读取爬虫时的log表
log_df.columns = ['title', 'pagerange', 'filename'] # 为log表添加表头
result_df = log_df[log_df['title'].map(lambda x: sum([province in x for province in PROVINCE]) > 0 \
and ('主要金融经济统计' in x or '主要经济金融统计' in x))].reset_index(drop=True) # 筛选分地区的Excel表
dataframes = []
if result_df.shape[0] == 0:
continue
for i in range(result_df.shape[0]): # 遍历每一行
title = result_df.loc[i, 'title'] # 获取Excel表的名称
filename = result_df.loc[i, 'filename'] # 获取Excel表的文件名
province = _get_province_by_title(title) # 根据Excel表的名称提取对应的省份
print(title, filename)
excel = application.Workbooks.Open(os.path.join(CURRENT_DIR, CFYB_DIR, year, filename)) # 打开Excel表
sheet = excel.WorkSheets(1) # 取出其中第一张sheet
dataframe = _get_dataframe_from_sheet(sheet)
dataframe['province'] = province
excel.Close(True)
dataframes.append(dataframe)
df_concat = pandas.concat(dataframes, ignore_index=True)
df_concat.to_csv(f'loan_by_area_{year}.csv', index=False, header=True, sep='\t')
if __name__ == '__main__':
get_loan_by_province()
This script is very much hard-coded, and the logic is rigid in many places, because the table layout differs from year to year, and even within the same year the format differs across provinces, so the data are genuinely hard to locate. Take it as a reference only.
Postscript (some recent thoughts)
Counting up my offences, I have done a lot of things wrong lately.
After talking a bit with my mom, I felt that since we had run into each other yesterday I really should have just said hello, openly and naturally. But when we actually met face to face it still felt a little awkward, and the greeting would not quite come out; I have never been good at saying hello, least of all in a situation like that.
To be honest, S really has changed a lot: longer hair, dyed a little. If I had not seen them head-on, I probably would not have recognized them at a glance.
Two years from now, everyone who was going to leave this campus will have left, and by then it will no longer be up to me to keep dwelling on anything. The road through a PhD is a long night; perhaps it is better walked somewhat alone, otherwise it only grows longer. Who knows?
Keeping a diary and running are the two most important faiths in my life: the former lets me remember myself, the latter lets me forget myself. Drifting between memory and forgetting is itself a way of living.
This year marks my tenth year of keeping a diary; all the entries together run to something over a million characters, a dozen or so notebooks lined up neatly on my desk.
When I picked up the pen and wrote the first entry on December 31, 2011, I probably could not have imagined I would keep it up this long. But for me, once I choose to persist in something, I will very likely persist until it is beyond my strength, because I feel nothing is worth giving up, so long as it can still be held onto.
But perhaps I simply no longer want to hold onto anything; or, more precisely, I simply no longer dare to.