需要准备的库
import requests
from bs4 import BeautifulSoup
import bs4
总体框架
def main(num,years):
    """Download the ranking page for *years* and print its first *num* rows.

    Args:
        num: Number of ranking rows to print.
        years: Year text that is spliced into the page file name.
    """
    unifo = []
    # The page name follows the host directly. The previous literal had a
    # stray backslash here ('/\z...'), which is an invalid escape sequence
    # and also put a literal backslash into the request path.
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming' + years + '.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    fillUnivList(unifo, soup)
    printUnivList(unifo, num)
首先我们得获取要爬取网页的源代码
def getHTMLText(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when the request fails (network
        error, timeout, or an HTTP error status).
    """
    try:
        # timeout bounds the wait; requests raises once it is exceeded.
        r = requests.get(url, timeout=30)
        # Treat 4xx/5xx answers as failures instead of handing an error page
        # to the parser as if it were the ranking table.
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request-related failures are swallowed; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return ''
若要获得源代码的数据,需要首先找到tr标签,并遍历其中每个td标签,获取数值写入程序数据结构中
def fillUnivList(unifo,soup):
    """Append the first four cell values of each table row in *soup* to *unifo*.

    Args:
        unifo: List that is extended in place; one ``[c0, c1, c2, c3]`` entry
            is appended per qualifying row.
        soup: Parsed document; every ``<tr>`` found in it is inspected.

    Rows with fewer than four ``<td>`` cells (such as header rows) are
    skipped. A cell's value is its ``.string`` attribute, unchanged.
    """
    for tr in soup.find_all('tr'):  # every <tr> in the document
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        # Explicit length check instead of catching IndexError with a bare
        # except, so unrelated errors are no longer hidden.
        if len(tds) < 4:
            continue
        unifo.append([td.string for td in tds[:4]])
按照数据在html源代码的位置,从Python列表中提取相应位置的数据,并打印出来
def printUnivList(unifo,num):
    """Print a header line followed by at most *num* rows from *unifo*.

    Args:
        unifo: Sequence of rows; the first four items of each row are printed.
        num: Maximum number of rows to print. If fewer rows are available
            only those are printed; a negative value prints no rows.
    """
    row_format = '{:^4}{:^10}{:^5}{:^8}{:^10}'
    # chr(12288) is the full-width space; it is passed as the fifth field of
    # every line, exactly as before.
    pad = chr(12288)
    print(row_format.format('排名','学校名称','省市','总分',pad))
    # Slicing never runs past the end of the list, so a short or empty
    # result no longer raises IndexError. max() keeps a negative num from
    # being interpreted as "all but the last rows".
    for u in unifo[:max(num, 0)]:
        print(row_format.format(u[0],u[1],u[2],u[3],pad))
在此基础上,我们可以让代码变得更灵活
if __name__ == '__main__':
    # Ask for the row count first, then the year, and hand both to main().
    row_count = int(input('请输入你要查看的大学排名数量:'))
    year_text = input('请输入排名的年份')
    main(row_count, year_text)
完整代码如下
import requests
from bs4 import BeautifulSoup
import bs4
def main(num,years):
    """Download the ranking page for *years* and print its first *num* rows.

    Args:
        num: Number of ranking rows to print.
        years: Year text that is spliced into the page file name.
    """
    unifo = []
    # The page name follows the host directly. The previous literal had a
    # stray backslash here ('/\z...'), which is an invalid escape sequence
    # and also put a literal backslash into the request path.
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming' + years + '.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    fillUnivList(unifo, soup)
    printUnivList(unifo, num)
def getHTMLText(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when the request fails (network
        error, timeout, or an HTTP error status).
    """
    try:
        # timeout bounds the wait; requests raises once it is exceeded.
        r = requests.get(url, timeout=30)
        # Treat 4xx/5xx answers as failures instead of handing an error page
        # to the parser as if it were the ranking table.
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request-related failures are swallowed; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return ''
def fillUnivList(unifo,soup):
    """Append the first four cell values of each table row in *soup* to *unifo*.

    Args:
        unifo: List that is extended in place; one ``[c0, c1, c2, c3]`` entry
            is appended per qualifying row.
        soup: Parsed document; every ``<tr>`` found in it is inspected.

    Rows with fewer than four ``<td>`` cells (such as header rows) are
    skipped. A cell's value is its ``.string`` attribute, unchanged.
    """
    for tr in soup.find_all('tr'):  # every <tr> in the document
        tds = tr('td')  # calling a tag is shorthand for tr.find_all('td')
        # Explicit length check instead of catching IndexError with a bare
        # except, so unrelated errors are no longer hidden.
        if len(tds) < 4:
            continue
        unifo.append([td.string for td in tds[:4]])
def printUnivList(unifo,num):
    """Print a header line followed by at most *num* rows from *unifo*.

    Args:
        unifo: Sequence of rows; the first four items of each row are printed.
        num: Maximum number of rows to print. If fewer rows are available
            only those are printed; a negative value prints no rows.
    """
    row_format = '{:^4}{:^10}{:^5}{:^8}{:^10}'
    # chr(12288) is the full-width space; it is passed as the fifth field of
    # every line, exactly as before.
    pad = chr(12288)
    print(row_format.format('排名','学校名称','省市','总分',pad))
    # Slicing never runs past the end of the list, so a short or empty
    # result no longer raises IndexError. max() keeps a negative num from
    # being interpreted as "all but the last rows".
    for u in unifo[:max(num, 0)]:
        print(row_format.format(u[0],u[1],u[2],u[3],pad))
if __name__ == '__main__':
    # Ask for the row count first, then the year, and hand both to main().
    row_count = int(input('请输入你要查看的大学排名数量:'))
    year_text = input('请输入排名的年份')
    main(row_count, year_text)