最近在学习嵩天老师在慕课上的爬虫课程。学到“中国大学排名”的爬虫时,发现当前网页的布局已经和老师上课时的布局完全不同了。基于此,对爬虫进行了调整。
具体代码如下:
import csv

import requests
from bs4 import BeautifulSoup
def getHtmlText(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, invalid URL, or an HTTP error status
        reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the
        # one declared in the headers.  Status check + encoding fix-up is
        # the standard fetch routine worth reusing in later crawlers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are treated as "no page"; things
        # like KeyboardInterrupt are allowed to propagate.
        return ''
def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append one record per row.

    Every ``<tr>`` inside the first ``<tbody>`` yields a five-item list:
    text of the first cell, text of ``span.name-cn``, text of
    ``span.name-en``, text of ``p.tags`` and text of the third cell.
    Pieces that are absent or empty become ``''`` instead of raising.

    Args:
        ulist: List that receives the records; it is extended in place.
        html: Page markup to parse.  An empty string or markup without a
            ``<tbody>`` adds nothing.
    """
    def cell_text(tag):
        # ``.string`` is None for a missing tag, an empty tag, or a tag
        # with several children; get_text() always returns a str.
        return tag.get_text(strip=True) if tag is not None else ''

    soup = BeautifulSoup(html, 'lxml')
    tbody = soup.find('tbody')
    rows = tbody.find_all('tr') if tbody is not None else []
    for tr in rows:
        tds = tr('td')  # shorthand for tr.find_all('td')
        if len(tds) < 3:
            # Not a data row (e.g. a spacer row); nothing to extract.
            continue
        ulist.append([
            cell_text(tds[0]),
            cell_text(tr.find('span', class_='name-cn')),
            cell_text(tr.find('span', class_='name-en')),
            cell_text(tr.find('p', class_='tags')),
            tds[2].text.strip(),
        ])
    print(ulist)
def printUnivList(ulist):
    """Write the records to ``中国大学排名.csv`` and print how many were saved.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8, with a header row followed by one line per record.
    Only the first five items of each record are written.  Fields that
    contain commas, quotes or line breaks are quoted by the ``csv`` module
    so the columns stay aligned.

    Args:
        ulist: Sequence of records (each a sequence of field values).
            It is not modified.
    """
    header = ['序号', '中文名称', '英文名称', '标签', '坐落']
    # newline='' lets the csv module control line endings itself; '\n' is
    # kept as the terminator to match the previous output format.
    with open('中国大学排名.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(header)
        writer.writerows(row[:5] for row in ulist)
    print('suc' + str(len(ulist)))
def main():
    """Fetch the ranking page, extract the table rows and save them as CSV."""
    uinfo = []
    url = 'https://www.shanghairanking.cn/rankings/bcur/2024'
    html = getHtmlText(url)
    if not html:
        # getHtmlText signals a failed download with an empty string;
        # there is nothing to parse or save in that case.
        print('Failed to download ' + url)
        return
    fillUnivList(uinfo, html)
    printUnivList(uinfo)
# Run the crawler only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()