1、网址
http://zuihaodaxue.cn/ARWU2015.html
需要用到 bs4、正则表达式、requests 的知识
正则表达式:
http://blog.csdn.net/qq_21046135/article/details/71075612
bs4:
http://blog.csdn.net/qq_21046135/article/details/71039587
requests:
http://blog.csdn.net/qq_21046135/article/details/70764480
2、分析网页
如图可知,数据块位于 tbody 中,HTML 中每一行对应一个 tr。先取 tds = tr('td'),世界排名、国家排名、总分可以直接通过 tds[0].string、tds[3].string、tds[4].string 获得,而学校名称需要通过 tds[1].a.string 获得,国家/地区需要通过 tds[2].a['title'] 获得。
import requests
from bs4 import BeautifulSoup
import bs4
# Fetch the page content
def getHTMLText(url):
    """Download *url* and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, invalid URL, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no data". Only request
        # failures are swallowed, so Ctrl-C and programming errors still
        # propagate instead of being hidden by a bare except.
        return ""
    # Use the encoding detected from the content rather than the one taken
    # from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text
# Parse the page content
def fillUnivList(ulist, html):
    """Append one record per table row found in *html* to *ulist*.

    Each record is a 5-item list:
    [world rank, school name, country/region, national rank, total score].
    The school name is read from the link text in the second cell and the
    country/region from the ``title`` attribute of the link in the third.

    Args:
        ulist: List that receives the parsed records (mutated in place).
        html: Page markup; an empty string or a page without a ``<tbody>``
            leaves *ulist* unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Nothing to parse (e.g. the download failed and html is "").
        return
    for tr in tbody.children:
        # Children also include whitespace text nodes between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 5:
            # Not a data row (too few cells); skip it instead of raising.
            continue
        name_link = tds[1].a
        region_link = tds[2].a
        # Fall back to the cell's own text when the expected link is absent.
        name = name_link.string if name_link is not None else tds[1].string
        if region_link is not None:
            region = region_link.get('title')
        else:
            region = tds[2].string
        ulist.append([tds[0].string, name, region, tds[3].string, tds[4].string])
# Display the data
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* records from *ulist*.

    Args:
        ulist: Sequence of 5-item records
            (world rank, school name, country/region, national rank, score).
        num: Maximum number of records to print. Values larger than
            ``len(ulist)`` print every record; values <= 0 print only the
            header.
    """
    # Pad the text columns with the full-width (CJK) space U+3000 so that
    # padding has the same display width as the Chinese characters it sits
    # next to. Padding those columns with ASCII spaces is what made the
    # table ragged.
    # NOTE(review): columns that mix ASCII and CJK text can still drift
    # slightly, since the two have different display widths.
    fill = chr(12288)
    tplt = "{0:^10}\t{1:{fill}^44}\t{2:{fill}^16}\t{3:^16}\t{4:^16}"
    print(tplt.format("世界排名", "学校名称", "国家/地区", "国家排名", "总分", fill=fill))
    # Slicing clamps to the available rows, so asking for more rows than
    # were parsed no longer raises IndexError.
    for u in ulist[:max(num, 0)]:
        # Missing values (None) cannot take a format spec; show them blank.
        cells = ["" if v is None else str(v) for v in u[:5]]
        print(tplt.format(*cells, fill=fill))
def main():
    """Fetch the ARWU 2015 ranking page, parse it, and print the top rows."""
    uinfo = []
    url = 'http://zuihaodaxue.cn/ARWU2015.html'
    html = getHTMLText(url)
    if not html:
        # getHTMLText returns "" on any request failure; stop here rather
        # than handing an empty document to the parser.
        print("获取网页失败: " + url)
        return
    fillUnivList(uinfo, html)
    # Show up to 80 universities, but never more than were actually parsed.
    printUnivList(uinfo, min(80, len(uinfo)))


if __name__ == '__main__':
    main()