US News的大学排名数据获取有一点点复杂。US News 2022 Best Global Universities Rankings网页是动态刷新的,数据量也比较大。数据获取总共分为三步:
1、获取大学基本信息;2、获取每所大学的详细信息;3、获取各学科各大学的排名指标
获取基本信息
在不断下拉的过程中打开F12调试工具,可以看到有一个“search?format=json&page=”开头的包,这个包的响应里面就有一些大学的基本信息。每个响应包里有10所大学的信息。
再看这个包的标头,就可以找到它的“请求URL”,我们也只要发这个请求URL,就可以获取相应的响应包。总共有2005所大学,所以只要循环201次即可。
#encoding=utf-8
import requests
import time
# Paged JSON endpoint behind the infinite-scroll ranking page; the page number
# is appended to this prefix.
SEARCH_URL = 'https://www.usnews.com/education/best-global-universities/search?format=json&page='
# The loop requests pages 1..LAST_PAGE inclusive (the original range(1, 202)).
LAST_PAGE = 201

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}


def format_record(item):
    """Return one university entry as a single ';'-separated line (no newline).

    Fields, in order: city, country_name, id, name, first rank value,
    first two stat values, url. A field whose value is None is written as an
    empty string instead of raising TypeError during concatenation.
    """
    fields = (
        item['city'],
        item['country_name'],
        item['id'],
        item['name'],
        item['ranks'][0]['value'],
        item['stats'][0]['value'],
        item['stats'][1]['value'],
        item['url'],
    )
    return ';'.join('' if field is None else str(field) for field in fields)


def main():
    """Download every result page and write one record per line to collegeInfo.txt."""
    # 'with' guarantees the file is flushed and closed even if a request fails.
    with open('collegeInfo.txt', 'w', encoding='utf-8') as fp:
        for page in range(1, LAST_PAGE + 1):
            response = requests.get(url=SEARCH_URL + str(page), headers=headers, timeout=30)
            if response.status_code != 200:
                print('response error')
                break
            # A payload without "items" is treated as an empty page.
            for item in response.json().get("items") or []:
                record = format_record(item)
                print(record)
                # Terminate each record with a newline so that the file can be
                # imported into a spreadsheet with ';' as the delimiter.
                fp.write(record + '\n')
            # Pause between pages to keep the request rate low.
            time.sleep(3)


if __name__ == '__main__':
    main()
获取详细信息
上面的代码获取了几所大学的基本信息,我保存到了一个txt文件“collegeInfo.txt”中,然后把它导入到excel里,分隔符选择分号,就可以得到几所大学的学校名字和网页链接。把校名和网页链接分别保存到两个文本文件“collegeNames.txt”和“collegeLinks.txt”中,用于后续获取大学的详细信息。
详细信息包括哪些内容呢?以哈佛大学为例:
包括上面的“University Data”和“Rankings”两部分,基本上每个大学都有这两部分数据,只是有些数据项可能有欠缺。所以针对每个大学我都从一个空的字典开始往里填充数据,一所大学对应一个Dataframe,最后把所有大学的Dataframe Concat起来就好了。
# -*- coding:utf-8 -*-
import lxml
from lxml import etree
import requests
import time
import pandas as pd
import random
# Columns that follow 'University' in the output sheet, in order. Every
# university starts with all of them blank; the scraper fills in what it finds.
DETAIL_COLUMNS = (
    'Total number of students',
    'Number of international students',
    'Total number of academic staff',
    'Number of international staff',
    'Number of undergraduate degrees awarded',
    "Number of master's degrees awarded",
    'Number of doctoral degrees awarded',
    'Number of research only staff',
    'Number of new undergraduate students',
    "Number of new master's students",
    'Number of new doctoral students',
    'Best Global Universities',
    'Best Global Universities in Region',
    'Best Global Universities in Country',
    'Agricultural Sciences',
    'Arts and Humanities',
    'Biology and Biochemistry',
    'Biotechnology and Applied Microbiology',
    'Cardiac and Cardiovascular Systems',
    'Cell Biology',
    'Chemical Engineering',
    'Chemistry',
    'Civil Engineering',
    'Clinical Medicine',
    'Computer Science',
    'Condensed Matter Physics',
    'Economics and Business',
    'Electrical and Electronic Engineering',
    'Endocrinology and Metabolism',
    'Energy and Fuels',
    'Engineering',
    'Environment/Ecology',
    'Food Science and Technology',
    'Gastroenterology and Hepatology',
    'Geosciences',
    'Immunology',
    'Infectious Diseases',
    'Materials Science',
    'Mathematics',
    'Mechanical Engineering',
    'Microbiology',
    'Molecular Biology and Genetics',
    'Nanoscience and Nanotechnology',
    'Neuroscience and Behavior',
    'Oncology',
    'Optics',
    'Pharmacology and Toxicology',
    'Physical Chemistry',
    'Physics',
    'Plant and Animal Science',
    'Polymer Science',
    'Psychiatry/Psychology',
    'Public, Environmental and Occupational Health',
    'Radiology, Nuclear Medicine and Medical Imaging',
    'Social Sciences and Public Health',
    'Space Science',
    'Surgery',
)

# Ranking labels that are stored under the single "... in Region" column.
REGION_RANK_LABELS = frozenset((
    'Best Global Universities in Asia',
    'Best Global Universities in Africa',
    'Best Global Universities in Australia/New Zealand',
    'Best Global Universities in Europe',
    'Best Global Universities in Latin America',
))

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}


def new_record(school_name):
    """Return a fresh row dict: 'University' set, every other column blank."""
    record = {'University': school_name}
    record.update(dict.fromkeys(DETAIL_COLUMNS, ''))
    return record


def strip_thousands(value):
    """Return value with ',' separators removed; None becomes ''."""
    return '' if value is None else value.replace(',', '')


def pick_rank(parts):
    """Return the rank text from the <strong> text nodes of a ranking entry.

    When the first node is a bare '#', the rank is the second node; otherwise
    it is the first node. An empty list (or a lone '#') yields ''.
    """
    if not parts:
        return ''
    if parts[0] == '#':
        return parts[1] if len(parts) > 1 else ''
    return parts[0]


def rank_column(label, record):
    """Return the column of record under which a ranking labelled label is stored.

    A label that is already a column maps to itself, the known regional labels
    map to the region column, and anything else is stored in the country
    column (same catch-all as the original script).
    """
    if label in record:
        return label
    if label in REGION_RANK_LABELS:
        return 'Best Global Universities in Region'
    return 'Best Global Universities in Country'


def fill_university_data(tree, record, prefix):
    """Copy each label/value pair under #uniData into record."""
    for row in tree.xpath('//*[@id="uniData"]/div/div'):
        label = row.xpath('./p[1]')[0].text
        value = strip_thousands(row.xpath('./p[2]')[0].text)
        print('{}{}: {}'.format(prefix, label, value))
        # A label that is not a predefined column simply adds a new column.
        record[label] = value


def fill_rankings(tree, record, prefix):
    """Copy every ranking listed under #rankings into record."""
    for section in tree.xpath('//*[@id="rankings"]/div'):
        # .get() avoids a KeyError on a div that carries no class attribute.
        css_class = section.get('class')
        if css_class == 'mb5':
            ranking_lists = section.xpath('./div/ul')
        elif css_class == 'subject-rankings':
            ranking_lists = section.xpath('./ul')
        else:
            # Stop at the first div with any other class (original behaviour).
            break
        for ranking_list in ranking_lists:
            for entry in ranking_list.xpath('./li'):
                rank = pick_rank(entry.xpath('./a/div/strong/text()'))
                label = entry.xpath('./a/strong')[0].text
                record[rank_column(label, record)] = rank
                print('{}{}: {}'.format(prefix, label, rank))


def main():
    """Scrape the detail page of every listed university into UniversityDetail.xlsx.

    collegeNames.txt and collegeLinks.txt are read in lockstep, one university
    per line. Reading stops at the first blank name line or at the first
    non-200 response; whatever was collected up to that point is still saved.
    """
    frames = []
    with open('collegeNames.txt', 'r', encoding='utf-8') as fp_name, \
            open('collegeLinks.txt', 'r', encoding='utf-8') as fp_links:
        for cnt, (name_line, link_line) in enumerate(zip(fp_name, fp_links), start=1):
            schoolName = name_line.rstrip('\r\n')
            if not schoolName:
                break
            # Drop the line terminator so it is not sent as part of the URL.
            schoolUrl = link_line.strip()
            response = requests.get(url=schoolUrl, headers=headers, timeout=30)
            if response.status_code != 200:
                print('{} {} response error: {}'.format(cnt, schoolName, response.status_code))
                break
            tree = etree.HTML(response.text.encode('utf-8'))
            record = new_record(schoolName)
            prefix = '{} {} '.format(cnt, schoolName)
            fill_university_data(tree, record, prefix)
            fill_rankings(tree, record, prefix)
            frames.append(pd.DataFrame(record, index=[0]))
            # Random 1-3 s pause between requests.
            time.sleep(random.random() * 2 + 1)
    if not frames:
        # pd.concat raises ValueError on an empty list.
        print('no university details collected; UniversityDetail.xlsx not written')
        return
    pd.concat(frames).to_excel('UniversityDetail.xlsx', index=False)


if __name__ == '__main__':
    main()
各学科各大学排名指标
不同的学科也有各自的排名,大致数据如下:
遍历每个大学的网页的时候,可以在每个网页中找到多个学科的排名指标,为每个学科设置一个Dataframe的列表,然后在最后Concat起来,再写入excel中就好了。(代码中除了“collegeNames.txt”和“collegeLinks.txt”,还用到了“collegeCountries.txt”,即第一步得到的国家名一列,同样按行保存。)
from lxml import etree
import requests
import time
import re
import pandas
import random
from requests.api import head
# Every ranking category that gets its own workbook, in output order.
CATEGORIES = (
    'Global Universities',
    'Agricultural Sciences',
    'Arts and Humanities',
    'Biology and Biochemistry',
    'Biotechnology and Applied Microbiology',
    'Cardiac and Cardiovascular Systems',
    'Cell Biology',
    'Chemical Engineering',
    'Chemistry',
    'Civil Engineering',
    'Clinical Medicine',
    'Computer Science',
    'Condensed Matter Physics',
    'Economics and Business',
    'Electrical and Electronic Engineering',
    'Endocrinology and Metabolism',
    'Energy and Fuels',
    'Engineering',
    'Environment/Ecology',
    'Food Science and Technology',
    'Gastroenterology and Hepatology',
    'Geosciences',
    'Immunology',
    'Infectious Diseases',
    'Materials Science',
    'Mathematics',
    'Mechanical Engineering',
    'Microbiology',
    'Molecular Biology and Genetics',
    'Nanoscience and Nanotechnology',
    'Neuroscience and Behavior',
    'Oncology',
    'Optics',
    'Pharmacology and Toxicology',
    'Physical Chemistry',
    'Physics',
    'Plant and Animal Science',
    'Polymer Science',
    'Psychiatry/Psychology',
    'Public, Environmental and Occupational Health',
    'Radiology, Nuclear Medicine and Medical Imaging',
    'Social Sciences and Public Health',
    'Space Science',
    'Surgery',
)

# Indicator columns that follow University / Country / Rank in every sheet.
INDICATOR_COLUMNS = (
    'overall score',
    'global research reputation',
    'regional research reputation',
    'publications',
    'books',
    'conferences',
    'normalized citation impact',
    'total citations',
    'number of publications that are among the 10% most cited',
    'percentage of total publications that are among the 10% most cited',
    'international collaboration - relative to country',
    'international collaboration',
    'number of highly cited papers that are among the top 1% most cited',
    'percentage of highly cited papers that are among the top 1% most cited',
)

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'
}


def output_filename(category):
    """Return the workbook name for category ('/' is not valid in a file name)."""
    return category.replace('/', '_') + '.xlsx'


def pick_rank(parts):
    """Return the rank text from the <strong> text nodes of a ranking entry.

    When the first node is a bare '#', the rank is the second node; otherwise
    it is the first node. An empty list (or a lone '#') yields ''.
    """
    if not parts:
        return ''
    if parts[0] == '#':
        return parts[1] if len(parts) > 1 else ''
    return parts[0]


def clean_indicator_label(category, label):
    """Return the column name for an indicator label shown under category.

    The '<category> ' prefix is removed (as plain text, not as a regex), the
    rest is lower-cased, and 'global score' is folded into 'overall score'.
    """
    cleaned = (label or '').replace(category + ' ', '').lower()
    if cleaned in ('global score', 'overall score'):
        return 'overall score'
    return cleaned


def new_indicator_row(school_name, school_country, rank):
    """Return a fresh row dict with identity columns set and indicators blank."""
    row = {'University': school_name, 'Country': school_country, 'Rank': rank}
    row.update(dict.fromkeys(INDICATOR_COLUMNS, ''))
    return row


def collect_indicators(tree, school_name, school_country, cnt, frames):
    """Append one single-row DataFrame per category box found under #indicators."""
    for box in tree.xpath('//*[@id="indicators"]/div[2]/div'):
        category = box.xpath('./button/h3')[0].text
        if category not in frames:
            # The original resolved the target list with eval() and crashed
            # with NameError on a heading that is not a known category.
            print('{} | {} | unknown category skipped: {}'.format(cnt, school_name, category))
            continue
        rank = pick_rank(box.xpath('./div/ul/li/a/div/strong/text()'))
        row = new_indicator_row(school_name, school_country, rank)
        for indicator in box.xpath('./div/div'):
            label = clean_indicator_label(category, indicator.xpath('./p[1]')[0].text)
            value = (indicator.xpath('./p[2]')[0].text or '').replace('#', '')
            row[label] = value
            print('{} | {} | {} | {}: {}'.format(cnt, school_name, category, label, value))
        frames[category].append(pandas.DataFrame(row, index=[0]))


def main():
    """Scrape the indicator boxes of every university into one workbook per category.

    collegeNames.txt, collegeCountries.txt and collegeLinks.txt are read in
    lockstep, one university per line; reading stops at the first blank name
    line. A university whose page does not return 200 is reported and skipped.
    """
    frames = {category: [] for category in CATEGORIES}
    with open('collegeNames.txt', 'r', encoding='utf-8') as fp_name, \
            open('collegeCountries.txt', 'r', encoding='utf-8') as fp_country, \
            open('collegeLinks.txt', 'r', encoding='utf-8') as fp_links:
        lines = zip(fp_name, fp_country, fp_links)
        for cnt, (name_line, country_line, link_line) in enumerate(lines, start=1):
            schoolName = name_line.rstrip('\r\n')
            if not schoolName:
                break
            schoolCountry = country_line.rstrip('\r\n')
            # Drop the line terminator so it is not sent as part of the URL.
            schoolUrl = link_line.strip()
            response = requests.get(url=schoolUrl, headers=headers, timeout=30)
            if response.status_code == 200:
                tree = etree.HTML(response.text.encode('utf-8'))
                collect_indicators(tree, schoolName, schoolCountry, cnt, frames)
            else:
                print('{} | {} | response error: {}'.format(cnt, schoolName, response.status_code))
            # Random 1-3 s pause between requests.
            time.sleep(random.random() * 2 + 1)
    for category in CATEGORIES:
        rows = frames[category]
        if not rows:
            # pandas.concat raises ValueError on an empty list, which would
            # abort the run before any workbook is written.
            print('no data for ' + category + '; workbook not written')
            continue
        # File names are derived from the category, which also corrects the
        # truncated names the original used (e.g. 'Cell Biolog.xlsx').
        pandas.concat(rows).to_excel(output_filename(category), index=False)


if __name__ == '__main__':
    main()