功能描述:
1.输入:大学排名的 URL 链接
2.输出:大学排名信息的屏幕输出(排名,大学名称,总分)
3.定向爬虫:仅对输入 URL 进行爬取,不扩展爬取其他 URL
查看 robots 协议:访问 http://www.zuihaodaxue.com/robots.txt 返回 404,说明该网站没有设置爬取限制
# 实现步骤:
# Code:
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort download: only request/HTTP failures map to "".
        # Unrelated errors (KeyboardInterrupt, bugs) are no longer hidden.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    For every ``<tr>`` directly under the first ``<tbody>``, the ``.string``
    of its first three ``<td>`` cells is appended to ``ulist`` as a
    three-item list (labelled rank, name and score by the original author).

    Args:
        ulist: List that receives one ``[cell0, cell1, cell2]`` entry per row.
        html: Page markup; may be empty when the download failed.

    Returns:
        None. ``ulist`` is modified in place and is left untouched when the
        markup contains no ``<tbody>``.
    """
    # NOTE(review): the 4-column version of this function later in the file
    # treats cell 2 as the province and cell 3 as the score -- confirm which
    # column really holds the score here.
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page: nothing to parse, so do not crash.
        return
    for tr in tbody.children:
        # Skip children that are not tags (needs the bs4 module for the type).
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 3:
            # Malformed or spacer row without the expected three cells.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string])
def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of rows; the first three items of each row are
            printed as centred, tab-separated columns.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row instead of raising
            ``IndexError``; values below one print only the header.
    """
    tplt = "{:^10}\t{:^6}\t{:^10}"
    print(tplt.format("排名", "学校", "分数"))
    for u in ulist[:max(num, 0)]:
        # A cell can be None, which rejects a width format spec; render it
        # as an empty field instead of crashing.
        cells = ["" if v is None else str(v) for v in (u[0], u[1], u[2])]
        print(tplt.format(*cells))
def main():
    """Fetch the hard-coded ranking page, parse it and print up to 20 rows.

    If the download yields no text, a message is written to stderr and
    nothing is parsed or printed.
    """
    import sys  # local import: only needed for the failure message

    uinfo = []
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    html = getHTMLText(url)
    if not html:
        # getHTMLText signals a failed download with an empty string.
        print("Failed to download ranking page: " + url, file=sys.stderr)
        return
    fillUnivList(uinfo, html)
    # Never ask for more rows than were actually parsed (at most 20).
    printUnivList(uinfo, min(20, len(uinfo)))
# Run the crawler only when this file is executed as a script, so that
# importing it does not trigger a network request.
if __name__ == "__main__":
    main()
# 优化:实现文本对齐,采用中文字符的空格chr(12288) 填充
def printUnivList2(ulist, num):
    """Print a header and at most ``num`` rows, padding names with U+3000.

    The middle column is centred using ``chr(12288)`` (the full-width CJK
    space) as the fill character instead of an ASCII space.

    Args:
        ulist: Sequence of rows; the first three items of each row are
            printed as tab-separated columns.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row instead of raising
            ``IndexError``; values below one print only the header.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    fill = chr(12288)
    print(tplt.format("排名", "学校", "分数", fill))
    for u in ulist[:max(num, 0)]:
        # A cell can be None, which rejects a width format spec; render it
        # as an empty field instead of crashing.
        cells = ["" if v is None else str(v) for v in (u[0], u[1], u[2])]
        print(tplt.format(cells[0], cells[1], cells[2], fill))
# 改进:
import requests
import re
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort download: only request/HTTP failures map to "".
        # Unrelated errors (KeyboardInterrupt, bugs) are no longer hidden.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    For every ``<tr>`` directly under the first ``<tbody>``, the ``.string``
    of its first four ``<td>`` cells is appended to ``ulist`` as a four-item
    list (labelled rank, name, province and score by the original author).

    Args:
        ulist: List that receives one four-item entry per table row.
        html: Page markup; may be empty when the download failed.

    Returns:
        None. ``ulist`` is modified in place and is left untouched when the
        markup contains no ``<tbody>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Empty or unexpected page: nothing to parse, so do not crash.
        return
    for tr in tbody.children:
        # Skip children that are not tags (needs the bs4 module for the type).
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            # Malformed or spacer row without the expected four cells.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[2].string, tds[3].string])
def printUnivList(ulist, num):
    """Print a four-column header and at most ``num`` rows of ``ulist``.

    Original author's note: the columns never line up in the IDE console,
    but they do line up in the text file produced by ``writeTxt``.

    Args:
        ulist: Sequence of rows; the first four items of each row are
            printed as centred, tab-separated columns.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row instead of raising
            ``IndexError``; values below one print only the header.
    """
    # NOTE(review): the original passed chr(12288) as a fifth format argument
    # that no placeholder referenced; it had no effect and was dropped. The
    # header and row widths differ on purpose and are kept as they were.
    header_tplt = "{:^8}\t{:^16}\t{:^10}\t{:^9}"
    row_tplt = "{:^8}\t{:^14}\t{:^10}\t{:^10}"
    print(header_tplt.format("排名", "学校", "省份", "分数"))
    for u in ulist[:max(num, 0)]:
        # A cell can be None, which rejects a width format spec; render it
        # as an empty field instead of crashing.
        cells = ["" if v is None else str(v) for v in (u[0], u[1], u[2], u[3])]
        print(row_tplt.format(*cells))
def writeTxt(ulist, num, path='中国大学排名.txt'):
    """Write a four-column header and at most ``num`` rows to a UTF-8 file.

    Args:
        ulist: Sequence of rows; the first four items of each row are
            written as centred, tab-separated columns.
        num: Maximum number of rows to write. Values larger than
            ``len(ulist)`` write every available row instead of raising
            ``IndexError``; values below one write only the header.
        path: Destination file, overwritten if it exists. Defaults to the
            file name the original implementation always used.
    """
    # Column widths were settled by the original author through trial and
    # error, so both format strings are kept exactly as they were.
    # NOTE(review): the original passed chr(12288) as a fifth format argument
    # that no placeholder referenced; it had no effect and was dropped.
    header_tplt = "{:^8}\t{:^16}\t{:^10}\t{:^9}"
    row_tplt = "{:^8}\t{:^14}\t{:^10}\t{:^10}"
    # The context manager closes the file even if formatting a row raises.
    with open(path, 'w', encoding='utf-8') as fb:
        fb.write(header_tplt.format("排名", "学校", "省份", "分数") + '\n')
        for u in ulist[:max(num, 0)]:
            # A cell can be None, which rejects a width format spec; write
            # it as an empty field instead of crashing.
            cells = ["" if v is None else str(v) for v in (u[0], u[1], u[2], u[3])]
            fb.write(row_tplt.format(*cells) + '\n')
def main():
    """Fetch the hard-coded ranking page, then print and save up to 30 rows.

    If the download yields no text, a message is written to stderr and
    nothing is parsed, printed or written to disk.
    """
    import sys  # local import: only needed for the failure message

    uinfo = []
    url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"
    html = getHTMLText(url)
    if not html:
        # getHTMLText signals a failed download with an empty string.
        print("Failed to download ranking page: " + url, file=sys.stderr)
        return
    fillUnivList(uinfo, html)
    # Never ask for more rows than were actually parsed (at most 30).
    shown = min(30, len(uinfo))
    printUnivList(uinfo, shown)  # print to the console
    writeTxt(uinfo, shown)  # write to the text file
# Run the crawler only when this file is executed as a script, so that
# importing it does not trigger a network request or write a file.
if __name__ == "__main__":
    main()