页面介绍
专家查查:https://zjchacha.cn/
外页面接口:
![image.png](attachment:image.png)
https://api.zjchacha.cn/api/s?callback=resultcallback&q=%E5%8D%8E%E4%B8%AD%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&range=all&honor=&h=-1&domain=&org=&city=&from=20&size=11&page=21&ajax_flag=true&withMap=0&token=r6-8hHywrlpwrFmYco4FqGd3rY9V4h0ur3XKaZCRcYlDc67vr5Gg4X4dqGF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-81m4sY9Cc6uu4Y7XrlOEd12w0XKc5F3O&_=1623506284773
'https://api.zjchacha.cn/api/s?callback=resultcallback&q=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&range=all&honor=&h=-1&domain=&org=&city=&from=30&size=10&page=3&ajax_flag=true&withMap=0&token=7Dm531GId4tuIxUZt3O_gYyW3195snK58FUpIhihc1tWcyG4eo0nIksWcXF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-81mI0YG4dHcwqn8CzniZqntn053_84lO&_=1623753648961'
合作伙伴接口:
![image.png](attachment:image.png)
https://api.zjchacha.cn/api/co-authors?uri=0e70bc79fa3567ad5a9b60b0aa8d63ab&level=1&callback=resultcallbackUserAuthors&token=r6-8hHywrlpwrFmYco4FqGd3rY9V4h0ur3XKaZCRcYlDc67vr5Gg4X4dqGF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-81m4sY9Cc6uu4Y7XrlOEd12w0XKc5F3O&_=1623644916243
合作企业接口:
![image.png](attachment:image.png)
https://api.zjchacha.cn/api/co-coms?uri=0e70bc79fa3567ad5a9b60b0aa8d63ab&callback=resultcallbackUserCompany&token=r6-8hHywrlpwrFmYco4FqGd3rY9V4h0ur3XKaZCRcYlDc67vr5Gg4X4dqGF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-81m4sY9Cc6uu4Y7XrlOEd12w0XKc5F3O&_=1623645194599
一个接口文件的爬虫
import json
import random
import re
import time

import numpy as np
import pandas as pd
import requests
from fake_useragent import UserAgent  # library for generating random User-Agent strings
# H-index buckets accepted by the zjchacha.cn search API: the `h` query
# parameter is an integer 1-6 that selects one of these ranges.
# NOTE(review): '20-25' overlaps '16-20' — presumably meant '21-25';
# label kept as-is to match the data already crawled with it.
h_factor_list = ['0-5', '6-10', '11-15', '16-20', '20-25', '>25']
# Map bucket number (1-6) -> human-readable H-index range label.
h_factor_dict = {z: h_factor_list[z - 1] for z in range(1, 7)}
构造微信访问信息
# Pool of rotating API access tokens (presumably captured from the
# WeChat/browser client — see the request URLs above; TODO confirm).
# Each token ends in '&_'; request_zjcc appends '=<timestamp>' to it to
# complete the final query parameter.
# NOTE(review): tokens appear to expire server-side and must be
# refreshed by hand when requests start failing.
token1=['hY0JIwNv5Ec_s3mYe6m0t6XLI6KD35Jvs1tyg1mhIy--PZK586yXqlGG8GF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-817DekG13mtwdmdZdYKY7xNwgh2usHsO&_'
,'dX9T3y8eqGdq03deIGt03H-Ge6GCIos-IFv15E4j7nmS3oCTg5mSgIT-dXF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-8H-IdntGh6KEzl4FzFCh5GKus50lrHNO&_'
,'7Dm531GId4tuIxUZt3O_gYyW3195snK58FUpIhihc1tWcyG4eo0nIksWcXF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-81mI0YG4dHcwqn8CzniZqntn053_84lO&_'
,'4XJY7YUog5-VsDl_dIUi0Xiu3nik5ITu3XmFdXpZ7ZdZdZBDIYuv4nmD3mF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-8HyShHBSrh44aYXG3o47dECKrY44rllO&_'
,'84tcIyOHe6UHPXins197co7D4194aYUyhh4Jt3v5tyCTcHBY0ZmG4YKZPXF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-8HOa3nm736OPzHOFr34ic3GD7YUdd4FO&_'
,'r6-8hHywrlpwrFmYco4FqGd3rY9V4h0ur3XKaZCRcYlDc67vr5Gg4X4dqGF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-81m4sY9Cc6uu4Y7XrlOEd12w0XKc5F3O&_'
,'0kgWq6K_rHGKdh3-sD0CzE03qk0W8l0WhmdKaDdlry80sjOjqltnd18ddXF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-818H5H9m53mChy9khmia3olS4H4_5hBO&_'
,'rZtxd5vKg14VrH9at548sYXqIk4ishCZs14WIo878kmgg10I3oi3z5_WqGF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-8HO_cm2DhGpW0n4WtEmCPZ7DekigqE2O&_'
,'rH4YsDC37ndEt1KeIXmudk3y34FSgD9DgZdXzFXg3G40dXK-zoGR0G3SdXF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-8104qkp-3E85hYO6d12-8FOSIEcvgy3O&_'
,'znKmI4KcdosugH93tEmm4m045os-I1KfclOfsYm9cX4ydlUvgYFYaXiV5mF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-8HFWIXpY5H9Vd44dqEt_sZigPX7vc6cO&_'
,'dGKy53dx3o933X4irHdC4FUygYOJsH9n83-6rhGhsZKG4YdR4l0grXFvImF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-810g3h4p31i-I3-ihG045FvldDi9t6NO&_']
def request_zjcc(university, h, fm, token):
    """Scrape one result page (up to 10 experts) from the zjchacha.cn API.

    Parameters:
        university: school name used as the search query ``q``.
        h: H-index bucket, 1-6 (label looked up in ``h_factor_dict``).
        fm: page index; the API ``from`` offset is ``fm * 10``.
        token: API token ending in '&_'; a timestamp is appended after it.

    Returns:
        List of 10-field tuples:
        (name, tags, hIndex, paperCounts, cited, patentCounts,
         projectCounts, titles, h_factor_range, university)

    Note:
        Deliberately indexes ``zjchacha2[i]`` for i in 0..9 so that a
        short (last) page raises IndexError — zjcc_tuple_list catches
        that to stop paging. Do not "fix" this by iterating the list.
    """
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
        'cookie': 'JSESSIONID=B34935612229D1ED91B7B72EB5FCBEB0',
    }
    zjchacha = requests.get(
        'https://api.zjchacha.cn/api/s?callback=resultcallback&q=' + university
        + '&range=all&honor=&h=' + str(h) + '&domain=&org=&city=&from=' + str(fm * 10)
        + '&size=10&page=1&ajax_flag=true&withMap=0&token=' + token + '=1623676853529',
        headers=header)
    # Response is JSONP: strip the 'resultcallback(...)' wrapper, parse the rest.
    zjchacha2 = json.loads(zjchacha.text.split('resultcallback(')[1][:-1])['data']['talents']
    zjchacha_df = []
    for i in range(10):  # IndexError on a short last page is intentional (see docstring)
        name = zjchacha2[i]['name'][0]    # expert name
        tags = zjchacha2[i]['tags'][0]    # research field
        hIndex = zjchacha2[i]['hIndex']   # H-index
        if hIndex < 0:                    # negative H-index marks an invalid record: skip
            continue
        paperCounts = zjchacha2[i]['paperCounts']      # number of papers
        cited = zjchacha2[i]['cited']                  # citation count
        patentCounts = zjchacha2[i]['patentCounts']    # number of patents
        projectCounts = zjchacha2[i]['projectCounts']  # number of projects
        try:
            titles = zjchacha2[i]['titles'][0]  # special honors
        except Exception:                       # honors field may be missing or empty
            titles = '暂无'
        # The two requests below fetch co-author and partner-company details.
        # Their results are currently NOT part of the returned tuple ("fields
        # temporarily not needed" per the original author), but the calls are
        # kept so the request pattern and pacing stay unchanged.
        UserAuthors = json.loads(requests.get(
            'https://api.zjchacha.cn/api/co-authors?uri=' + zjchacha2[i]['uri']
            + '&level=1&callback=resultcallbackUserAuthors&token=' + token + '=1623644916243',
            headers=header).text.split('resultcallbackUserAuthors(')[1][:-1])['data']
        try:
            # First co-author from a *different* institution:
            # paper / patent / project co-operation counts.
            UserAuthors_ = [[o['name'], o['org'], o['paperCoTimes'],
                             o['patentCoTimes'], o['projectCoTimes']]
                            for o in UserAuthors if university not in o['org']][0]
        except Exception:  # no external co-author found
            UserAuthors_ = [None, None, None, None, None]
        UserCompany = json.loads(requests.get(
            'https://api.zjchacha.cn/api/co-coms?uri=' + zjchacha2[i]['uri']
            + '&callback=resultcallbackUserCompany&token=' + token + '=1623645194599',
            headers=header).text.split('resultcallbackUserCompany(')[1][:-1])['data']
        try:
            UserCompany_ = [UserCompany[0]['org'], UserCompany[0]['city'],
                            UserCompany[0]['paperTimes'], UserCompany[0]['patentTimes'],
                            UserCompany[0]['projectTimes']]
        except Exception:  # no partner company on record
            UserCompany_ = [None, None, None, None, None]
        # (A dead 20-field tuple that was immediately overwritten in the
        # original has been removed; only the 10-field tuple is kept.)
        zjchacha_list = (name, tags, hIndex, paperCounts, cited, patentCounts,
                         projectCounts, titles, h_factor_dict[h], university)
        zjchacha_df.append(zjchacha_list)
        time.sleep(round(random.uniform(0.5, 1.5), 2))  # throttle to avoid rate limiting
        print(i)
    return zjchacha_df
每个学校六个H因子的爬虫
![image.png](attachment:image.png)
def zjcc_tuple_list(university):
    """Collect all experts of one university across the six H-index buckets.

    For each bucket (z = 1..6) the site serves at most 100 records as ten
    pages (k = 0..9); request_zjcc raises IndexError when a page holds
    fewer than 10 records, which ends paging for that bucket.

    Returns a flat list of the tuples produced by request_zjcc.
    """
    sheng = []
    for z in range(1, 7):       # six H-index buckets
        for k in range(0, 10):  # up to ten pages of ten records each
            try:
                # Rotate through the token pool, one token per page.
                # NOTE(review): the original line was corrupted by the
                # blog export; token1[k] matches the surrounding intent.
                sheng = sheng + request_zjcc(university, z, k, token1[k])
            except IndexError:
                break           # short page -> bucket exhausted
            except Exception:
                # Swallow KeyError/TypeError/anything else so one bad
                # page doesn't abort the whole school (original behavior).
                pass
            print(str(z) + str(k))
    return sheng
request_zjcc('北京交通大学',1,1,'dGKy53dx3o933X4irHdC4FUygYOJsH9n83-6rhGhsZKG4YdR4l0grXFvImF-eh2v3Yt0710f8ZKC4FU3thCP3DCmtn7S8Ft77Ft3d4iEq6K-3EF-810g3h4p31i-I3-ihG045FvldDi9t6NO&_')
434个高校的爬虫
# Load the list of 434 universities to crawl; iterate the names in
# reverse order from a resume point, accumulating rows in university_1.
List_of_Universities = pd.read_excel('爬取的高校名单.xlsx')
len(List_of_Universities)            # notebook inspection cell (no effect in a script)
reverse1 = list(List_of_Universities['学校名称'])[::-1]
reverse1[:20]                        # notebook inspection cell
reverse1.index('北京师范大学')        # notebook inspection cell
university_1 = []
for u in reverse1[416:]:  # slice = resume point after an earlier crash (see error screenshot)
    university_1 = university_1 + zjcc_tuple_list(u)
报错截图
![image.png](attachment:image.png)
len(university_1)
整理与合并接口文件数据
# Column labels for the 10-field tuples produced by request_zjcc.
# (The original also declared a 20-column layout with co-author /
# partner-company fields, immediately overwritten by this one; the dead
# declaration has been removed — see request_zjcc.)
col_list = ['姓名','研究领域','H因子','论文数','被引用数','专利数','项目','特殊荣誉','H因子范围','高校名称']
# DataFrame columns come out as integer positions; map position -> label.
col_dict = {i: name for i, name in enumerate(col_list)}
# NOTE(review): the original line here was corrupted by the blog export
# ('pd.DataFrame(university_1)zjcc_list_)'); reconstructed as below.
zjcc_df_8 = pd.DataFrame(university_1)
zjcc_df_8 = zjcc_df_8.rename(columns=col_dict)
zjcc_df_8  # notebook inspection cell
# Merge this batch with the DataFrames from earlier crawl sessions
# (zjcc_df_, zjcc_df_1 ... zjcc_df_7 are defined in previous notebook runs).
zjcc_df_3 = pd.concat([zjcc_df_1, zjcc_df_, zjcc_df_2, zjcc_df_4, zjcc_df_5, zjcc_df_6, zjcc_df_7, zjcc_df_8])
zjcc_df_3.index = list(range(len(zjcc_df_3)))
# BUG FIX: drop_duplicates() returns a new frame; the original discarded
# the result, so duplicates were never removed.
zjcc_df_3 = zjcc_df_3.drop_duplicates()
zjcc_df_3.index = list(range(len(zjcc_df_3)))
zjcc_df_3  # notebook inspection cell
len(set(zjcc_df_3['高校名称']))  # 388 universities ultimately captured
zjcc_df_3.to_excel('全部388个学校数据.xlsx')
专家查查-专家人才查询网
最新推荐文章于 2024-09-30 15:42:36 发布
该博客介绍了如何使用Python爬虫从'专家查查'网站抓取特定高校不同H因子范围的学者信息,包括姓名、研究领域、H因子、论文数等,并通过多个接口获取合作者和合作企业的数据。然后对爬取的数据进行整理,合并成一个DataFrame,最后筛选出388所高校的完整数据并保存为Excel文件。
摘要由CSDN通过智能技术生成