[Daily] A scraper for the data tables of the China Statistical Yearbook and the China Financial Yearbook (with all Excel files, 1985-2020)

Preface

Finals have kept me busy lately, so I am just posting a resource that may be useful: all of the Excel tables from the China Financial Yearbook (1986-2019) and the China Statistical Yearbook (1981-2020). The data were scraped from CNKI (the exact URLs are given in the main text below). So far nobody online provides the complete yearbook data from 1986 to the present; what is available covers only particular years, and all of it is paywalled.

Link: https://pan.baidu.com/s/13fjrInmjjxaNQRgS_Jv91w
Extraction code: k5ir

If all you need is the data, help yourself to the link above; feel free to skip the rambling in the afterword.



1 A scraper for the Excel tables of the China Statistical Yearbook and the China Financial Yearbook

  • The fish is provided above; what follows is the fishing. Fair warning: the fishing is not easy to learn, and I suggest running a crawl yourself to find out where the pitfalls are.
  • That said, Section 2 below also gives a brief account of the crawler's details.

In the middle of the hardest finals ever, here is the crawler script for all the Excel tables of every edition of the China Statistical Yearbook and the China Financial Yearbook:

# -*- coding: utf-8 -*-
# @author: caoyang
# @email: caoyang@163.sufe.edu.cn

import os
import re
import time
import requests

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains

from bs4 import BeautifulSoup

def get_cookie(url):
	options = webdriver.FirefoxOptions()								
	options.add_argument("--headless")									
	driver = webdriver.Firefox(options=options)							
	driver.get(url)
	cookies = driver.get_cookies()
	driver.quit()
	def _cookie_to_string(cookies):
		string = ''
		for cookie in cookies:
			string += '{}={}; '.format(cookie['name'], cookie['value'])
		return string.strip()
	return _cookie_to_string(cookies)
	

def download_chinese_statistical_yearbook(ybcode='N2020100004', year='2020', save_root='csyb', is_initial=True, ignore_caj=True):
	with open('system_csyb.log', 'w') as f:
		pass
	headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
	query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
	excel_url = 'https://data.cnki.net/{}'.format						 # str.format bound as a callable: excel_url(href) builds the full download URL
	caj_url = 'https://data.cnki.net/download/GetCajUrl'
	regex = r'<[^>]+>'
	cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
	compiler = re.compile(regex, re.S)
	regular_interval = 15
	reset_interval = 300
	

	if not os.path.exists(save_root):
		os.mkdir(save_root)
	# year = ybcode[1:5]
	target_path = os.path.join(save_root, year)
	if not os.path.exists(target_path):
		os.mkdir(target_path)

	with open(os.path.join(target_path, 'log.txt'), 'w') as f:
		pass

	formdata = {
		'ybcode': ybcode,
		'entrycode': '',
		'page': '1',
		'pagerow': '20'
	}
	response = requests.post(query_url, data=formdata, headers=headers)
	html = response.text
	soup = BeautifulSoup(html, 'lxml')
	span = soup.find('span', class_='s_p_listl')
	for link in span.find_all('a'):
		onclick = link.attrs.get('onclick')
		if onclick is not None:
			lindex = onclick.find('\'')
			rindex = onclick.find('\'', lindex + 1)
			n_pages = int(onclick[lindex + 1:rindex])
			break
	with open('system_csyb.log', 'a') as f:
		f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
	print('正在处理{}年...'.format(year))
	with open('system_csyb.log', 'a') as f:
		f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
	print('共计{}页'.format(n_pages))
	for page in range(1, n_pages + 1):
		with open('system_csyb.log', 'a') as f:
			f.write('  - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
		print('  - 第{}页..'.format(page))
		if page != 1:												 # page 1 was already fetched above the loop
			formdata = {
				'ybcode': ybcode,
				'entrycode': '',
				'page': str(page),
				'pagerow': '20'
			}
			while True:
				try:
					response = requests.post(query_url, data=formdata, headers=headers)
					break
				except:
					with open('system_csyb.log', 'a') as f:
						f.write('    页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
					print('    页面访问失败...')
					time.sleep(reset_interval)				
			html = response.text
			soup = BeautifulSoup(html, 'lxml')
		table = soup.find('table')
		for tr in table.find_all('tr'):
			tds = tr.find_all('td')
			assert len(tds) == 3
			title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
			page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
			for _link in tds[2].find_all('a'):
				href = _link.attrs['href']
				if href.startswith('/download/excel'):					 # excel
					filecode = href[href.find('=')+1:]
					while True:
						_headers = headers.copy()
						_headers['Cookie'] = cookies
						try:
							with open('system_csyb.log', 'a') as f:
								f.write('    + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}...'.format(title))
							response = requests.get(excel_url(href), headers=_headers)
							print('      ' + str(response.status_code))
							try:
								html = response.text
								soup = BeautifulSoup(html, 'lxml')
								if str(soup.find('title').string)=='中国经济社会大数据研究平台':
									with open('system_csyb.log', 'a') as f:
										f.write('      重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
									print('      重置cookie...')
									cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
								else:
									break
							except:
								break
						except:
							with open('system_csyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(reset_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
					time.sleep(regular_interval)
					
					with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
						f.write(response.content)
					with open(os.path.join(target_path, 'log.txt'), 'a') as f:
						f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
				else:													 # caj
					if ignore_caj:
						continue
					filecode = _link.attrs['fn']
					pagerange = _link.attrs['pg']
					disk = _link.attrs['disk']
					_formdata = {
						'filecode': filecode,
						'pagerange': pagerange,
						'disk': disk,
					}
					while True:
						_headers = headers.copy()
						_headers['Cookie'] = cookies	
						try:		
							with open('system_csyb.log', 'a') as f:
								f.write('    + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}的资源链接...'.format(title))		
							response = requests.post(caj_url, headers=_headers, data=_formdata)
							break
						except:
							with open('system_csyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(reset_interval)		
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))					
					resource_url = response.json()['url']
					while True:
						try:
							with open('system_csyb.log', 'a') as f:
								f.write('    + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}...'.format(title))
							response = requests.get(resource_url, headers=headers)
							if str(response.status_code) == '200':
								break 
							else:
								with open('system_csyb.log', 'a') as f:
									f.write('      重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
								print('      重置cookie...')
								time.sleep(reset_interval)
								cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
						except:
							with open('system_csyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(regular_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
					time.sleep(regular_interval)
					with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
						f.write(response.content)
					with open(os.path.join(target_path, 'log.txt'), 'a') as f:
						f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))

			
	# Find urls of year
	if is_initial:
		url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
		response = requests.get(url, headers=headers)
		html = response.text
		soup = BeautifulSoup(html, 'lxml')
		div = soup.find('div', class_='s_year clearfix')
		links = []
		ybcodes = []
		for link in div.find_all('a'):
			class_ = link.attrs.get('class')
			if class_ is None:											 # not current
				href = link.attrs.get('href')
				ybcode = href.split('/')[-1].split('?')[0]
				links.append(href)
				ybcodes.append(ybcode)
		with open('ybcode_csyb.txt', 'w') as f:
			for ybcode in ybcodes:
				f.write(f'{ybcode}\n')
		# for ybcode in ybcodes:
		#	download_chinese_statistical_yearbook(ybcode=ybcode, is_initial=False)
	

def download_chinese_financial_yearbook(ybcode='N2020070552', year='2019', save_root='cfyb', is_initial=True, ignore_caj=True):
	with open('system_cfyb.log', 'w') as f:
		pass
	headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0'}
	query_url = 'https://data.cnki.net/Yearbook/PartialGetCatalogResult'
	excel_url = 'https://data.cnki.net/{}'.format						 # str.format bound as a callable: excel_url(href) builds the full download URL
	caj_url = 'https://data.cnki.net/download/GetCajUrl'
	regex = r'<[^>]+>'
	# Cookies are obtained fresh on each run via the headless browser; see get_cookie above
	cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
	compiler = re.compile(regex, re.S)
	regular_interval = 15
	reset_interval = 300
	

	if not os.path.exists(save_root):
		os.mkdir(save_root)
	# year = ybcode[1:5]
	target_path = os.path.join(save_root, year)
	if not os.path.exists(target_path):
		os.mkdir(target_path)

	with open(os.path.join(target_path, 'log.txt'), 'w') as f:
		pass

	formdata = {
		'ybcode': ybcode,
		'entrycode': '',
		'page': '1',
		'pagerow': '20'
	}
	response = requests.post(query_url, data=formdata, headers=headers)
	html = response.text
	soup = BeautifulSoup(html, 'lxml')
	span = soup.find('span', class_='s_p_listl')
	for link in span.find_all('a'):
		onclick = link.attrs.get('onclick')
		if onclick is not None:
			lindex = onclick.find('\'')
			rindex = onclick.find('\'', lindex + 1)
			n_pages = int(onclick[lindex + 1:rindex])
			break
	with open('system_cfyb.log', 'a') as f:
		f.write('正在处理{}年...\t{}\n'.format(year, time.strftime('%Y-%m-%d %H:%M:%S')))
	print('正在处理{}年...'.format(year))
	with open('system_cfyb.log', 'a') as f:
		f.write('共计{}页\t{}\n'.format(n_pages, time.strftime('%Y-%m-%d %H:%M:%S')))
	print('共计{}页'.format(n_pages))
	for page in range(1, n_pages + 1):
		with open('system_cfyb.log', 'a') as f:
			f.write('  - 第{}页..\t{}\n'.format(page, time.strftime('%Y-%m-%d %H:%M:%S')))
		print('  - 第{}页..'.format(page))
		if page != 1:												 # page 1 was already fetched above the loop
			formdata = {
				'ybcode': ybcode,
				'entrycode': '',
				'page': str(page),
				'pagerow': '20'
			}
			while True:
				try:
					response = requests.post(query_url, data=formdata, headers=headers)
					break
				except:
					with open('system_cfyb.log', 'a') as f:
						f.write('    页面访问失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
					print('    页面访问失败...')
					time.sleep(reset_interval)				
			html = response.text
			soup = BeautifulSoup(html, 'lxml')
		table = soup.find('table')
		for tr in table.find_all('tr'):
			tds = tr.find_all('td')
			assert len(tds) == 3
			title = compiler.sub('', str(tds[0])).replace('\n', '').replace('\t', '').replace(' ', '').replace('\r', '')
			page_range = compiler.sub('', str(tds[1])).replace('\n', '').replace('\t', '').replace(' ', '')
			for _link in tds[2].find_all('a'):
				href = _link.attrs['href']
				if href.startswith('/download/excel'):					 # excel
					filecode = href[href.find('=')+1:]
					while True:
						_headers = headers.copy()
						_headers['Cookie'] = cookies
						try:
							with open('system_cfyb.log', 'a') as f:
								f.write('    + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}...'.format(title))
							response = requests.get(excel_url(href), headers=_headers)
							print('      ' + str(response.status_code))
							try:
								html = response.text
								soup = BeautifulSoup(html, 'lxml')
								if str(soup.find('title').string)=='中国经济社会大数据研究平台':
									with open('system_cfyb.log', 'a') as f:
										f.write('      重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
									print('      重置cookie...')
									cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
								else:
									break
							except:
								break
						except:
							with open('system_cfyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(reset_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
					time.sleep(regular_interval)
					with open(os.path.join(target_path, '{}.xls'.format(filecode)), 'wb') as f:
						f.write(response.content)
					with open(os.path.join(target_path, 'log.txt'), 'a') as f:
						f.write('{}\t{}\t{}.xls\n'.format(title, page_range, filecode))
				else:													 # caj
					if ignore_caj:
						continue
					filecode = _link.attrs['fn']
					pagerange = _link.attrs['pg']
					disk = _link.attrs['disk']
					_formdata = {
						'filecode': filecode,
						'pagerange': pagerange,
						'disk': disk,
					}
					while True:
						_headers = headers.copy()
						_headers['Cookie'] = cookies	
						try:		
							with open('system_cfyb.log', 'a') as f:
								f.write('    + 下载{}的资源链接...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}的资源链接...'.format(title))		
							response = requests.post(caj_url, headers=_headers, data=_formdata)
							break
						except:
							with open('system_cfyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(reset_interval)		
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))					
					resource_url = response.json()['url']
					while True:
						try:
							with open('system_cfyb.log', 'a') as f:
								f.write('    + 下载{}...\t{}\n'.format(title, time.strftime('%Y-%m-%d %H:%M:%S')))
							print('    + 下载{}...'.format(title))
							response = requests.get(resource_url, headers=headers)
							if str(response.status_code) == '200':
								break 
							else:
								with open('system_cfyb.log', 'a') as f:
									f.write('      重置cookie...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
								print('      重置cookie...')
								time.sleep(reset_interval)
								cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
						except:
							with open('system_cfyb.log', 'a') as f:
								f.write('      失败...\t{}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S')))
							print('      失败...')
							time.sleep(regular_interval)
							cookies = get_cookie('https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode))
							
					time.sleep(regular_interval)
					with open(os.path.join(target_path, '{}.caj'.format(filecode)), 'wb') as f:
						f.write(response.content)
					with open(os.path.join(target_path, 'log.txt'), 'a') as f:
						f.write('{}\t{}\t{}.caj\n'.format(title, page_range, filecode))

			
	# Find urls of year
	if is_initial:
		url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
		response = requests.get(url, headers=headers)
		html = response.text
		soup = BeautifulSoup(html, 'lxml')
		div = soup.find('div', class_='s_year clearfix')
		links = []
		ybcodes = []
		for link in div.find_all('a'):
			class_ = link.attrs.get('class')
			if class_ is None:											 # not current
				href = link.attrs.get('href')
				ybcode = href.split('/')[-1].split('?')[0]
				links.append(href)
				ybcodes.append(ybcode)
		with open('ybcode_cfyb.txt', 'w') as f:
			for ybcode in ybcodes:
				f.write(f'{ybcode}\n')

		for ybcode in ybcodes:
			download_chinese_financial_yearbook(ybcode=ybcode, is_initial=False)	



if __name__ == '__main__':
	'''
	with open('ybcode_csyb.txt', 'r') as f:
	
		lines = f.read().splitlines()	
	for line in lines:
		ybcode, year = line.split()
		# if int(year) > 1999:
			# continue
		download_chinese_statistical_yearbook(ybcode=ybcode, year=year, is_initial=False)
	'''
	with open('ybcode_cfyb.txt', 'r') as f:
		lines = f.read().splitlines()	
	for line in lines:
		ybcode, year = line.split()
		#if int(year) > 1994:
			# continue
		download_chinese_financial_yearbook(ybcode=ybcode, year=year, save_root='cfyb', is_initial=False, ignore_caj=True)
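	# Note: with is_initial=True the functions above write only the ybcode on each line of
	# ybcode_csyb.txt / ybcode_cfyb.txt, while this block splits each line into "ybcode year";
	# presumably the year column was appended to those files by hand before this block was run.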

The data source is CNKI; the crawler works against the yearbook catalog pages at data.cnki.net (the exact URLs appear in the script above).

My campus network has CNKI download privileges, so this crawler is of little use to anyone without them. Note that before each crawl the script drives a browser with Selenium to obtain cookies, which is what grants download access. CNKI cookies expire quickly, and once they do, every download returns the HTML of the CNKI home page instead of a file, so the script needs logic to check whether the cookies are still valid and to call Selenium again to refresh them once they are not. In addition, crawl as slowly as you can: the code waits 15 seconds between downloads. Taking the China Statistical Yearbook as an example, there are 39 editions as of this writing and roughly 600 Excel tables per edition, so downloading everything takes about two or three days.
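To make that retry logic easier to follow in isolation, here is a condensed sketch of the cookie-validity check. It assumes the same environment as the script (a CNKI-licensed network and the get_cookie helper defined above); the URL pattern and the portal page title it tests for are taken from the script, while the function name, retry limit, and sleep interval are illustrative only.

import time
import requests
from bs4 import BeautifulSoup

def fetch_with_cookie_refresh(file_url, ybcode, headers, max_retries=5):
    # Yearbook page whose visit yields fresh cookies (same URL pattern as in the script above)
    seed_url = 'https://data.cnki.net/trade/Yearbook/Single/{}?z=Z016'.format(ybcode)
    cookies = get_cookie(seed_url)  # Selenium helper defined in the script above
    for _ in range(max_retries):
        response = requests.get(file_url, headers=dict(headers, Cookie=cookies))
        title = BeautifulSoup(response.text, 'lxml').find('title')
        # An expired cookie returns the CNKI portal page instead of a file; its <title>
        # is the platform name that the script above also checks for.
        if title is not None and str(title.string) == '中国经济社会大数据研究平台':
            cookies = get_cookie(seed_url)  # refresh the cookie and try again
            time.sleep(15)                  # keep the crawl slow
            continue
        return response.content            # looks like an actual file
    raise RuntimeError('cookie still invalid after {} refresh attempts'.format(max_retries))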

2 Some details of the crawler, and how to process the downloaded Excel tables with a script

Time being short, I will not go into the crawler's overall design; instead I will record a few details (pitfalls):

  1. ignore_caj=True in the code skips the CAJ downloads. Fetching a CAJ is somewhat more involved: you must first issue a POST request to obtain the resource URL, whereas the Excel download links are written directly in the page source. There are two reasons for skipping the CAJ files. First, they are not very useful, since all the useful data are already in the Excel tables. The more important reason is that the POST request for the CAJ resource URL easily invalidates the cookie, forcing frequent cookie changes and wasting a lot of time; the Excel download links, by contrast, are very stable, and a cookie normally lasts a whole day without expiring.

  2. The crawler proper uses only requests; the Selenium browser driver is used solely to refresh cookies, so resource usage is fairly modest.

  3. Note that the page source at the URLs above contains an <iframe> tag, so the download links cannot be found directly in the page source; you need to capture the network traffic to obtain the URL behind the <iframe> content.

  4. One catch is that every Excel table obtained from this source is protected. In practice this only means the files cannot be edited; Office or WPS can still open and read them (read-only mode). To remove the protection you would have to enter a password under the Review menu:

Figure 1
This does not really affect normal use, since there is no need to modify these tables anyway, but both pandas.read_excel and xlrd.open_workbook raise an error on them:

xlrd.biffh.XLRDError: Workbook is encrypted

I have not yet tested how openpyxl.load_workbook behaves on these protected files, because openpyxl can only read .xlsx files. The yearbooks contain far too many Excel tables to copy by hand when you want to collect one kind of data in bulk, so doing it without a script would be foolish. The workable approach I eventually found is the DispatchEx method from the win32com.client module:

from win32com.client import DispatchEx		
excel = DispatchEx('Excel.Application')
demo = excel.Workbooks.Open('N2018070031000595.xls')
sheet = demo.WorkSheets(1)
print(sheet.Cells(1,1).Value)
demo.Close(True)

This essentially drives the Office application to do the reading, and an Excel process shows up in Task Manager, so calling demo.Close to end that process is crucial; otherwise the machine can easily exhaust its memory from too many open Excel instances. The drawback is that the module provides no ready-made structured-data handling, so values can only be pulled out raw via sheet.Cells, which makes the code rather long.
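Since forgetting to release those Excel processes is the main hazard, below is a small sketch of one way to wrap DispatchEx so that the workbook and the process are always closed, together with a helper that reads a rectangular block via sheet.Cells. The helper names (open_workbook, read_range) are mine rather than the original script's, and the sketch assumes Windows with Excel and the pywin32 package installed; the file name in the usage comment is the same example file as above.

from contextlib import contextmanager
from win32com.client import DispatchEx

@contextmanager
def open_workbook(path):
    # Start one Excel process and guarantee that both the workbook and the process
    # are released even if an exception is raised while reading.
    application = DispatchEx('Excel.Application')
    workbook = application.Workbooks.Open(path)
    try:
        yield workbook
    finally:
        workbook.Close(False)   # read-only use, nothing to save
        application.Quit()      # remove the background Excel process from Task Manager

def read_range(sheet, n_rows, n_cols):
    # Pull an n_rows x n_cols block cell by cell (sheet.Cells is 1-indexed) into a list of rows
    return [[sheet.Cells(row, column).Value for column in range(1, n_cols + 1)]
            for row in range(1, n_rows + 1)]

# Example usage (an absolute path is usually required when opening via COM):
# with open_workbook(r'C:\data\N2018070031000595.xls') as workbook:
#     print(read_range(workbook.Worksheets(1), n_rows=5, n_cols=5))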

My main goal was to pull each province's financial and economic tables: per-province loan balances, deposit balances, GDP, price indices, and the like. The script is as follows:

# -*- coding: utf-8 -*-
# @author: caoyang
# @email: caoyang@163.sufe.edu.cn

import re
import os
import sys

import time
import xlrd
import numpy
import pandas																
import openpyxl															
from win32com.client import DispatchEx									

# 全局变量
CURRENT_DIR = os.getcwd()
CFYB_DIR = 'cfyb'
CSYB_DIR = 'csyb'
TEMP_DIR = 'temp'
LOG_FILE = 'log.txt'
PROVINCE = [
	'北京', '天津', '上海', '重庆', '河北', '山西',
	'辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽',
	'福建', '江西', '山东', '河南', '湖北', '湖南',
	'广东', '海南', '四川', '贵州', '云南', '陕西',
	'甘肃', '青海', '台湾', '内蒙古', '广西', '西藏',
	'宁夏', '新疆', '香港', '澳门',
]

INT_COMPILER = re.compile(r'[^\d]')										 # 整型数正则
FLOAT_COMPILER = re.compile(r'[^\d | .]')								 # 浮点数正则

# 获取分省份的金融经济信息:以贷款余额为主
def get_loan_by_province():
	
	def _get_province_by_title(_title):									 # 根据数据表名称提取对应省份名
		for _province in PROVINCE:
			if _province in _title:
				return _province
	
	def _format_cell_value(_cell_value, dtype=str):						 # 标准化单元格的值
		if str(_cell_value) == 'None':
			return None
		if dtype == int:
			return INT_COMPILER.sub('', str(_cell_value).replace(' ', ''))
		if dtype == float:
			return FLOAT_COMPILER.sub('', str(_cell_value).replace(' ', ''))
		return str(_cell_value).replace(' ', '')
	
	def _get_dataframe_from_sheet(_sheet):
		_data_dict = {
			'year': [],													 # 年份
			'gdp': [],													 # 国内生产总值
			'cpi': [],													 # 价格指数
			'deposit': [],												 # 总存款余额
			'individual_deposit': [],									 # 个人存款余额
			'unit_deposit': [],											 # 单位存款余额
			'finance_deposit': [],										 # 财政存款余额
			'loan': [],													 # 总贷款余额
			'short_loan': [],											 # 短期贷款余额
			'long_loan': [],											 # 长期贷款余额
		}
		_flags = {														 # 对应_data_dict中的每一个字段用一个flag记录它是否被找到
			'year': True,												 # 年份
			'gdp': True,												 # 国内生产总值
			'cpi': True,												 # 价格指数
			'deposit': True,											 # 总存款余额
			'individual_deposit': True,									 # 个人存款余额
			'unit_deposit': True,										 # 单位存款余额
			'finance_deposit': True,									 # 财政存款余额
			'loan': True,												 # 总贷款余额
			'short_loan': True,											 # 短期贷款余额
			'long_loan': True,											 # 长期贷款余额
		}
		_row = 0
		_MAX_ROW = 100
		while _row < _MAX_ROW:											 # 遍历每一行
			_row += 1													 # 行号递进
			_cell_value = _format_cell_value(_sheet.Cells(_row, 1))		 # 提取每一行第一列的单元格值
			if _cell_value is None:										 # 跳过空值的单元格
				continue
			if _flags['year'] and '项目' in _cell_value:					 # year: 【项目】所在行可以提取年份
				print('year: ' + _cell_value)
				_flags['year'] = False									 # 已经找到【项目】所在行
				_column = 1
				while True:												 # 遍历【项目】所在行的每一列:取得年份
					_column += 1										 # 列号递进
					_year_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=int)
					if _year_string is None:							 # 遇到空单元格值即可退出搜索
						break
					_data_dict['year'].append(_year_string)				 # 将当前年份添加到字典中
				_num_year = len(_data_dict['year'])						 # 记录一共有多少年份
				continue
			if _flags['gdp'] and '生产总值' in _cell_value:				 # gdp: 国内生产总值
				if _flags['year']:										 # 偶然发现有的表里没有【项目】,用国内生产总值的上一行修正
					_flags['year'] = False								 # 已经找到【项目】所在行
					_column = 1
					while True:											 # 遍历【项目】所在行的每一列:取得年份
						_column += 1									 # 列号递进
						_year_string = _format_cell_value(_sheet.Cells(_row - 1, _column), dtype=float)
						if _year_string is None:						 # 遇到空单元格值即可退出搜索
							break
						_data_dict['year'].append(_year_string)			 # 将当前年份添加到字典中
					_num_year = len(_data_dict['year'])					 # 记录一共有多少年份					
				print('gdp: ' + _cell_value)
				_flags['gdp'] = False									 # 已经找到【国内生产总值】所在行
				for _column in range(2, 2 + _num_year):
					_gdp_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['gdp'].append(_gdp_string)
				continue 
			
			if _flags['cpi'] and '价格指数' in _cell_value:				 # cpi: 消费者价格指数
				print('cpi: ' + _cell_value)
				_flags['cpi'] = False									 # 已经找到【消费者价格指数】所在行
				for _column in range(2, 2 + _num_year):
					_cpi_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['cpi'].append(_cpi_string)
				continue
				
			if _flags['deposit'] and ('存款' in _cell_value and \
										'银行' in _cell_value):			 # deposit: 总存款余额
				print('deposit: ' + _cell_value)
				_flags['deposit'] = False								 # 已经找到【总存款余额】所在行
				for _column in range(2, 2 + _num_year):
					_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['deposit'].append(_deposit_string)
				continue

			if _flags['individual_deposit'] and ('存款' in _cell_value and \
					('城乡' in _cell_value or '储蓄' in _cell_value)):	 # individual_deposit:个人存款余额
				print('individual deposit: ' + _cell_value)
				_flags['individual_deposit'] = False					 # 已经找到【个人存款余额】所在行
				for _column in range(2, 2 + _num_year):
					_individual_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['individual_deposit'].append(_individual_deposit_string)
				continue	
		
			if _flags['unit_deposit'] and ('存款' in _cell_value and \
					('企' in _cell_value or '单位' in _cell_value)):		 # unit_deposit: 单位存款余额
				print('unit deposit: ' + _cell_value)
				_flags['unit_deposit'] = False							 # 已经找到【单位存款余额】所在行
				for _column in range(2, 2 + _num_year):
					_unit_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['unit_deposit'].append(_unit_deposit_string)
				continue		
				
			if _flags['finance_deposit'] and ('存款' in _cell_value and 
										('财政' in _cell_value)):		 # finance_deposit: 财政存款余额
				print('finance deposit: ' + _cell_value)
				_flags['finance_deposit'] = False						 # 已经找到【财政存款余额】所在行
				for _column in range(2, 2 + _num_year):
					_finance_deposit_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['finance_deposit'].append(_finance_deposit_string)
				continue					
				
			if _flags['loan'] and ('贷款' in _cell_value and \
										'银行' in _cell_value):			 # loan: 总贷款余额
				print('loan: ' + _cell_value)
				_flags['loan'] = False									 # 已经找到【总贷款余额】所在行
				for _column in range(2, 2 + _num_year):
					_loan_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['loan'].append(_loan_string)
				continue		
				
			if _flags['short_loan'] and ('贷款' in _cell_value and \
						('短期' in _cell_value or '流动' in _cell_value)):# short_loan: 短期贷款余额
				print('short loan: ' + _cell_value)
				_flags['short_loan'] = False							 # 已经找到【短期贷款余额】所在行
				for _column in range(2, 2 + _num_year):
					_short_loan_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['short_loan'].append(_short_loan_string)
				continue	

			if _flags['long_loan'] and ('贷款' in _cell_value and \
						('长期' in _cell_value or '固定' in _cell_value)):# long_loan: 长期贷款余额
				print('long loan: ' + _cell_value)
				_flags['long_loan'] = False								 # 已经找到【长期贷款余额】所在行
				for _column in range(2, 2 + _num_year):
					_long_loan_string = _format_cell_value(_sheet.Cells(_row, _column), dtype=float)
					_data_dict['long_loan'].append(_long_loan_string)
				continue
			
		for key in _flags:												 # 没有找到数据的字段置空
			if _flags[key]:
				for _ in range(_num_year):
					_data_dict[key].append(None)
					
		_dataframe = pandas.DataFrame(_data_dict, columns=list(_data_dict.keys()))
		print(_flags)
		# print(_data_dict)
		# print(_dataframe)
		return _dataframe
	
	
	
	application = DispatchEx('Excel.Application')						 # 启动Excel程序
	for year in os.listdir(os.path.join(CURRENT_DIR, CFYB_DIR)):		 # 遍历中国金融年鉴下所有年份
		print(f'======={year}=======')
		# if year < 2017:
			# continue
		log_df = pandas.read_csv(os.path.join(CFYB_DIR, year, LOG_FILE), header=None, sep='\t', encoding='gbk')		# 读取爬虫时的log表
		log_df.columns = ['title', 'pagerange', 'filename']															# 为log表添加表头
		result_df = log_df[log_df['title'].map(lambda x: sum([province in x for province in PROVINCE]) > 0 \
									and ('主要金融经济统计' in x or '主要经济金融统计' in x))].reset_index(drop=True)	# 筛选分地区的Excel表
		dataframes = []
		if result_df.shape[0] == 0:
			continue
		for i in range(result_df.shape[0]):																			# 遍历每一行
			title = result_df.loc[i, 'title']																		# 获取Excel表的名称
			filename = result_df.loc[i, 'filename']																	# 获取Excel表的文件名
			province = _get_province_by_title(title)																# 根据Excel表的名称提取对应的省份
			print(title, filename)
			excel = application.Workbooks.Open(os.path.join(CURRENT_DIR, CFYB_DIR, year, filename))					# 打开Excel表
			sheet = excel.WorkSheets(1)																				# 取出其中第一张sheet
			dataframe = _get_dataframe_from_sheet(sheet)
			dataframe['province'] = province
			excel.Close(True)
			dataframes.append(dataframe)
		df_concat = pandas.concat(dataframes, ignore_index=True)
		df_concat.to_csv(f'loan_by_area_{year}.csv', index=False, header=True, sep='\t')

if __name__ == '__main__':
	get_loan_by_province()

This script is very rigid; the logic is hard-coded in many places, because the table layout differs from year to year, and even across provinces within the same year, so the data are genuinely hard to locate. Take it only as a reference.


Afterword (some recent thoughts)

If my faults were tallied up, I have done a lot of things wrong lately.

After talking it over a bit with my mother, I felt that since we had run into each other yesterday I should have said hello, openly and naturally. But face to face it still felt a little awkward and the greeting would not quite come out; I have never been good at greetings, least of all in a situation like that.

Honestly, S really has changed a lot: the hair is longer now and partly dyed. Had I not seen the face straight on, I doubt I would have recognized S at a glance.

Two years from now, everyone at this school who is going to leave will have left, and by then it will no longer be for me to keep turning things over. The road of a PhD is a long, long night; perhaps it is better walked a little alone, or it only grows longer. Who knows?

Keeping a diary and running are the two most important faiths in my life: the former lets me remember myself, the latter lets me forget myself. Rising and sinking between memory and forgetting is itself a way of living.

This is my tenth year of keeping a diary; all the entries together come to roughly a million characters, a dozen or so notebooks lined up neatly on my desk.

When I put pen to paper for the first entry on December 31, 2011, I probably could not have imagined writing for this long. But for me, once I choose to stick with something, I will likely stick with it until it is beyond my strength, because I feel nothing is worth giving up, so long as it can still be held onto.

But perhaps I simply no longer want to hold onto anything; or, to be precise, I simply no longer dare to.
