爬取大学排名
爬虫功能要求:
输入:大学排名URL链接
输出:大学排名信息的屏幕输出(排名,学校,城市)
使用的工具库:Requests、BeautifulSoup等
#getListRank2.py
import requests
from bs4 import BeautifulSoup
import bs4
#从网络上获取排名的网页内容
def getHTMLText(url):
    """Download the page at ``url`` and return its decoded text.

    The response encoding is replaced by the one requests detects from the
    body (``apparent_encoding``) before the text is read.

    Args:
        url: Address of the ranking page.

    Returns:
        The page source as a string, or "" when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        # Return the source of the fetched page.
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures mean "no page"; a bare ``except`` would
        # also hide KeyboardInterrupt and genuine programming errors.
        return ""
#提取网页内容中信息到合适的数据结构
def fillUnivList(ulist, html):
    """Extract the first three cells of every table row into ``ulist``.

    Args:
        ulist: List that receives one ``[cell0, cell1, cell2]`` entry per
            ``<tr>`` found directly under the first ``<tbody>``. Each cell is
            the ``.string`` of the ``<td>``, which may be None.
        html: Page source to parse.

    Returns:
        None. ``ulist`` is extended in place and left untouched when the page
        has no ``<tbody>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Happens for an empty page, e.g. when the download failed.
        return
    for tr in tbody.children:
        # ``children`` also yields whitespace text nodes; only tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 3:
            # Not a full data row; indexing below would raise IndexError.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])
#利用数据结构展示并输出结果
def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; the first three items of each row are shown.
        num: Maximum number of rows to print. Fewer are printed when
            ``ulist`` is shorter, and none when ``num`` is not positive.

    Returns:
        None. The table is written to standard output.
    """
    # The middle column is padded with the full-width space (U+3000) so that
    # CJK school names stay aligned; it is passed as format argument 3.
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名", "学校", "城市", fill))
    for u in ulist[:max(num, 0)]:
        # A cell may be None, which cannot take a ``^10`` format spec.
        rank, name, city = ("" if v is None else str(v) for v in u[:3])
        print(tplt.format(rank, name, city, fill))
#main函数
if __name__ == '__main__':
    # Download the 2019 ranking page, parse its table and show the top 20 rows.
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html"
    uinfo = []
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)
优秀代码(小的自愧不如):
Python爬虫工具:python3、requests、beautifulsoup
程序设计思路:
(1)研究大学排名网站网页URL;
(2)设计fetchUrl函数,尝试获取页面;
(3)设计parserHtml函数,解析内容;
(4)设计output函数,组织列表形式输出;
(5)使用main函数调用程序。
import re
import requests
import bs4
def fetchUrl(url, timeout=60):
    """Send an HTTP GET request to ``url`` and return the page text.

    Browser-like ``Accept`` and ``User-Agent`` headers are sent with the
    request. The response encoding is replaced by the one requests detects
    from the body before the text is read.

    Args:
        url: Address of the web page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page source as a string. On a connection problem,
        timeout or HTTP error status the error is printed and "" is returned.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of the errors requests raises;
        # ``requests.RequestError`` does not exist and itself raised
        # AttributeError whenever a request failed.
        print(e)
        return ""
    r.encoding = r.apparent_encoding
    print('success!')
    return r.text
def parserHtml(html, urating):
    """Parse the ranking table in ``html`` and append its rows to ``urating``.

    Args:
        html: Page source as a string.
        urating: Two-dimensional list that receives the table. The header row
            (if a ``<thead>`` row exists) is appended first, then one list of
            cell strings per ``<tr>`` under ``<tbody>``.

    Returns:
        The same ``urating`` list. It is returned unchanged when the page
        contains neither ``<thead>`` nor ``<tbody>``.
    """
    bsobj = bs4.BeautifulSoup(html, 'html.parser')

    # Header: ``find`` returns None when a tag is missing, so each step is
    # checked instead of chaining calls on a possibly absent element.
    thead = bsobj.find('thead')
    header_row = thead.find('tr') if thead is not None else None
    if isinstance(header_row, bs4.element.Tag):
        hlist = [th.string for th in header_row('th')]
        if hlist:
            # Drop the last <th>; presumably it wraps the <option> drop-down
            # whose entries are appended below -- verify against the page.
            hlist.pop()
        hlist.extend(option.string for option in header_row('option'))
        urating.append(hlist)

    # Body: one list of cell strings per row.
    tbody = bsobj.find('tbody')
    if tbody is not None:
        for tr in tbody.children:
            # Skip whitespace text nodes between rows.
            if isinstance(tr, bs4.element.Tag):
                urating.append([td.string for td in tr('td')])
    return urating
def output(urating, filename):
    """Save the ranking table as a comma-separated file.

    Args:
        urating: Two-dimensional list holding the ranking rows.
        filename: Path of the CSV file to write.

    Returns:
        None. The rows are written without an index column or header line,
        and "Success!" is printed afterwards.
    """
    from pandas import DataFrame

    table = DataFrame(urating)
    table.to_csv(filename, sep=',', header=False, index=False)
    print("Success!")
def main():
    """Fetch the 2018 ranking page, parse its table and save it as a CSV file.

    Progress messages are printed before each of the three steps.
    """
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    print("Begin to crawl the http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html and get the rating of universities in china ...")
    print('---'*20)

    # Step 1: download the page.
    print("Try to fetch url ...")
    page = fetchUrl(url)

    # Step 2: turn the HTML table into a list of rows.
    print("Try to parser html ...")
    table = parserHtml(page, [])

    # Step 3: write the rows to disk.
    print("Try to save the results in file ...")
    output(table, '大学排名2018.csv')

    print("The work of crawling is done.")


if __name__ == '__main__':
    main()