效果预览
如图,目的是爬取2014-2018年国家税务总局各地纳税信用A级纳税人信息。
基础代码
import pandas as pd
import requests

# Query endpoint of the State Taxation Administration's A-level taxpayer
# credit lookup service; it accepts form-encoded POST parameters and
# returns the matching records.
URL = 'http://hd.chinatax.gov.cn/service/findCredit.do'

# Session cookie and browser User-Agent copied from a real browser session,
# so the server treats this script like an ordinary page request.
# NOTE(review): the JSESSIONID cookie is session-bound and will expire —
# refresh it from the browser if requests start failing.
HEADER = {
'Cookie':'yfx_c_g_u_id_10003701=_ck20010211232618635509545356418; yfx_f_l_v_t_10003701=f_t_1577935406837__r_t_1577935406837__v_t_1577935406837__r_c_0; _Jo0OQK=21D020D4328410D73BDFA09A917AEB40E7167BB9651465E23A1380D81E8442706A3AA19408E6AD7127D826C47C034D0B2FB18F11F307B478FB63F657E29B5865DD71B918CCA8FE3BB9470EE0D297309F84070EE0D297309F840F2431DD92007ED637E4DF76A79A067B4GJ1Z1OA==; JSESSIONID=F82521F2DFB764BB975730AD95DEE54B',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}

# Form parameters for one query:
#   page     - zero-based result page index
#   location - administrative region code (presumably a province/city
#              division code; '110000' — confirm against the site's codes)
#   code/name - optional taxpayer id / name filters (empty = no filter)
#   evalyear - evaluation year to query
findCredit = {
'page': 0,
'location': '110000',
'code': '',
'name': '',
'evalyear': '2018',
}

# timeout prevents the script from hanging indefinitely if the server
# stops responding (the original call had no timeout at all).
r = requests.post(URL, data=findCredit, headers=HEADER, timeout=30)
print(r)           # status line, e.g. <Response [200]>
print(r.text)      # raw response body for inspection
进一步完善
方法一——根据输入的地方名称、年份搜索下载
import requests
import csv
import pandas as pd
def getData(pageNum,placecode,year,return_total_count=False):
URL='http://hd.chinatax.gov.cn/service/findCredit.do'
HEADER = {
'Cookie':'yfx_c_g_u_id_10003701=_ck20010211232618635509545356418; yfx_f_l_v_t_10003701=f_t_1577935406837__r_t_1577935406837__v_t_1577935406837__r_c_0; _Jo0OQK=21D020D4328410D73BDFA09A917AEB40E7167BB9651465E23A1380D81E8442706A3AA19408E6AD7127D826C47C034D0B2FB18F11F307B478FB63F657E29B5865DD71B918CCA8FE3BB9470EE0D297309F84070EE0D297309F840F2431DD92007ED637E4DF76A79A067B4GJ1Z1OA==; JSESSIONID=F82521F2DFB764BB975730AD95DEE54B',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}
findCredit = {'page': pageNum,
'location': placecode,
# 'cPage':5,
'code': '',
'name': '',
'evalyear':year
}
r = requests.post(URL ,data=findCredit, headers=HEADER)
print(r)
# print(r.text)
if r.status_code == requests.codes.ok:
my_query = r.json()
if return_total_count:
tpage=my_query['totalPages']
return tpage
else:
data=pd.DataFrame({'code':[],'name':[], 'evalyear':[], 'location':[] })
for each in my_query['content']:
d1=str(each['code'])
d2=str(each['name'])
d3 = str(each['evalyear'])