一、概述:
QS学科排名总共有56个,其中5个是"BROAD SUBJECT AREA",另外51个是"SPECIFIC SUBJECT",下面是代码的实现。每次启动时,都需要在浏览器打开网页之后手动重新输入网页地址,否则会跳转到qsChina的网页。打开网页后先要选择Subject,同样是利用selenium执行js脚本的方式模拟浏览器点击:先打开下拉框,再选择相应的Subject(下拉框中共有58项,其中第1项和第7项分别是"BROAD SUBJECT AREA"和"SPECIFIC SUBJECT"两个分组的标题,需要跳过),然后根据所选的Subject创建Excel表格。接着在网页下方可以获取这个Subject的条目数量。然后切换到具体的指标页面,并把每页显示的条目数量调到尽可能大(以减少翻页次数)。之后就可以跟综合排名一样获取具体的数据了,每遍历完一页之后需要翻页。
二、实现代码:
# encoding=utf-8
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
import time
import xlsxwriter
import math
# Scrape every QS subject ranking (2022) into one .xlsx file per subject.
#
# Flow per subject: pick the subject in the filter dropdown, read the total
# result count, switch to the "Ranking Indicators" tab, show 100 rows per
# page, then walk every page and copy each non-ad row into the worksheet.

# Location of the chromedriver binary; adjust for the local machine.
s = Service(r'C:\Users\myComputer\Desktop\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(service=s)
curl = 'https://www.topuniversities.com/university-rankings/university-subject-rankings/2022/arts-humanities'

# Rows shown per page after the page-size dropdown has been changed.
_ITEMS_PER_PAGE = 100

# Column titles, in worksheet column order.
_HEADERS = (
    'Rank',
    'University',
    'Location',
    'Overall Score',
    'H-index Citations',
    'Citations per Paper',
    'Academic Reputation',
    'Employer Reputation',
)

# Page-level XPaths.
_SUBJECT_FILTER = '//*[@id="ranking-fillters"]/div[7]/div/div'
_TOTAL_COUNT = '//*[@id="_totalcountresults"]'
_INDICATORS_TAB = ('//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]'
                   '/div/div[1]/div/div/ul/li[2]/a')
_PAGE_SIZE_DROPDOWN = ('//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]'
                       '/div/div[3]/div[4]/div[1]/div[2]/i')
_PAGE_SIZE_OPTION = ('//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]'
                     '/div/div[3]/div[4]/div[1]/div[2]/div[2]/div[4]')
# Pagination anchors; the search starts at the third <li>, as the original
# index-based scan did.
_PAGINATION_LINKS = '//*[@id="alt-style-pagination"]/li[position() >= 3]/a'

# Row-level XPaths: _ROW selects the k-th entry of the result list, the
# remaining fragments are appended to it to reach the individual cells.
_ROW = '//*[@id="ranking-data-load_ind"]/div[{}]'
_INFO = '/div/div/div/div[1]/div/div/div/div'
_SCORE = '/div/div/div/div[2]/div/div/div/div[{}]'
# Cell suffixes in worksheet column order: rank, university, location and
# the five indicator scores.
_CELLS = (
    _INFO + '/div[1]/div',
    _INFO + '/div[2]/div/div[1]/div',
    _INFO + '/div[2]/div/div[2]',
) + tuple(_SCORE.format(n) for n in range(1, 6))


def _click(element):
    """Click *element* by running a JavaScript click on it."""
    driver.execute_script('arguments[0].click();', element)


def _first_or_none(xpath):
    """Return the first element matching *xpath*, or None if nothing matches."""
    found = driver.find_elements(By.XPATH, xpath)
    return found[0] if found else None


def _scrape_page(sheet, next_row, total):
    """Copy every non-ad row of the current page into *sheet*.

    Writing starts at worksheet row *next_row*; *total* is only used for the
    progress message. Returns the first worksheet row that is still free.
    """
    k = 1
    while True:
        row_xpath = _ROW.format(k)
        row = _first_or_none(row_xpath)
        if row is None:
            # Ran past the last entry of this page.
            break
        is_ad = row.get_attribute('customblock')  # set on ad blocks
        # Per-row pause kept from the original script.
        # TODO(review): confirm whether it is still needed.
        time.sleep(0.5)
        if not is_ad:
            for col, suffix in enumerate(_CELLS):
                cell = driver.find_element(By.XPATH, row_xpath + suffix)
                sheet.write(next_row, col, cell.text)
            print(str(next_row) + '/' + str(total) + ' finished!')
            next_row += 1
        k += 1
    return next_row


def _find_next_link():
    """Return the "next page" anchor of the pagination bar, or None."""
    for link in driver.find_elements(By.XPATH, _PAGINATION_LINKS):
        if link.get_attribute('class') == 'page-link next':
            return link
    return None


try:
    driver.get(curl)
    time.sleep(1)
    for i in range(1, 59):
        # Entries 1 and 7 of the dropdown are the group headings
        # ("BROAD SUBJECT AREA" / "SPECIFIC SUBJECT"), not subjects.
        if i in (1, 7):
            continue

        # Open the subject dropdown and pick the i-th entry.
        _click(driver.find_element(By.XPATH, _SUBJECT_FILTER))
        time.sleep(1)
        subject = driver.find_element(
            By.XPATH, _SUBJECT_FILTER + '/div[2]/div[' + str(i) + ']')
        SubjectName = subject.text
        _click(subject)
        print('Select Subject: ' + SubjectName)
        time.sleep(1)

        # The context manager closes (and thereby saves) the workbook even
        # when scraping fails part-way, so rows already collected are kept.
        with xlsxwriter.Workbook(SubjectName + '.xlsx') as workbook:
            sheet = workbook.add_worksheet()

            # Total number of results for this subject.
            count_text = driver.find_element(By.XPATH, _TOTAL_COUNT).text
            itemNum = int(count_text)
            print('Total Item count in ' + SubjectName + ': ' + count_text)

            # Switch to the "Ranking Indicators" tab.
            _click(driver.find_element(By.XPATH, _INDICATORS_TAB))
            time.sleep(1)
            print('Change tab to Ranking Indicators')

            # Show as many rows per page as possible to reduce paging.
            _click(driver.find_element(By.XPATH, _PAGE_SIZE_DROPDOWN))
            time.sleep(1)
            _click(driver.find_element(By.XPATH, _PAGE_SIZE_OPTION))
            time.sleep(1)
            print('Now there are 100 items in every page')

            # Table head.
            for col, title in enumerate(_HEADERS):
                sheet.write(0, col, title)

            page_count = int(math.ceil(itemNum / _ITEMS_PER_PAGE))
            next_row = 1
            for page in range(page_count):
                next_row = _scrape_page(sheet, next_row, itemNum)
                if page == page_count - 1:
                    break
                next_link = _find_next_link()
                if next_link is None:
                    # Previously this case spun forever; stop paging instead
                    # and keep what has been collected so far.
                    print('No next-page link found after page ' + str(page + 1)
                          + '; stopping ' + SubjectName + ' early')
                    break
                _click(next_link)
                time.sleep(1)
                print('chaneg to page ' + str(page + 2))
            print('finish ' + SubjectName)
finally:
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()
三、Tips:
1. 首先需要根据浏览器与浏览器版本下载对应的driver
Chrome:http://chromedriver.storage.googleapis.com/index.html
Edge:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/
2. 在Selenium 4中,直接传入driver可执行路径的写法已被弃用,需要替换为Service对象,否则会出现DeprecationWarning警告(较新的Selenium版本已不再支持这种写法)
之前的写法:(警告信息:DeprecationWarning: executable_path has been deprecated, please pass in a Service object)
driver = webdriver.Chrome(r'C:\Users\myComputer\Desktop\chromedriver_win32\chromedriver.exe') #这里添加的是driver的绝对路径
替换写法为:
s = Service(r'C:\Users\myComputer\Desktop\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(service=s)
持续更新中…
注:本文基于QS世界大学综合排名与学科排名数据获取扩展补充