先看一个爬虫入门的小程序——获取中国最好大学排名
import requests
from bs4 import BeautifulSoup
# Module-level accumulator: fillUnivList() appends one list of cell values
# per table row, and printUnivList() reads the rows back by index.
allUniv = []
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no data". Only request
        # failures are swallowed, so Ctrl-C and programming errors surface.
        return ""
    # Force UTF-8 instead of the encoding guessed from the response headers.
    r.encoding = 'utf-8'
    return r.text
def fillUnivList(soup):
    """Append one entry to the module-level ``allUniv`` per data row in *soup*.

    Every ``<tr>`` containing at least one ``<td>`` contributes a list with
    the ``.string`` of each of its cells, in document order. Rows without
    any ``<td>`` are skipped.

    Args:
        soup: Parsed document exposing ``find_all``.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue
        allUniv.append([cell.string for cell in cells])
def printUnivList(num):
    """Print a header line followed by up to *num* rows taken from ``allUniv``.

    Columns printed per row are cells 0, 1, 2, 3 and 6 (labelled by the
    header as rank, school name, province/city, total score and training
    scale).

    Args:
        num: Maximum number of leading ``allUniv`` rows to print. Values
            larger than the number of stored rows are clamped, so an empty
            or short list no longer raises ``IndexError``.
    """
    row_fmt = "{:^4}{:^10}{:^5}{:^8}{:^10}"
    print(row_fmt.format("排名","学校名称","省市","总分","培养规模"))
    # max(num, 0) keeps a negative num printing nothing, as range(num) did.
    for u in allUniv[:max(num, 0)]:
        if len(u) < 7:
            # Column 6 is required below; skip malformed rows instead of crashing.
            continue
        # A missing cell value (None) cannot take an alignment spec; show it blank.
        cells = ["" if c is None else c for c in u]
        print(row_fmt.format(cells[0], cells[1], cells[2], cells[3], cells[6]))
def main():
    """Fetch the hard-coded ranking page, parse it, and print the first 10 rows."""
    page = getHTMLText('http://www.zuihaodaxue.cn/zuihaodaxuepaiming2020.html')
    parsed = BeautifulSoup(page, "html.parser")
    fillUnivList(parsed)
    printUnivList(10)


main()
稍微修改一下源程序——获取某省最好大学排名
import requests
from bs4 import BeautifulSoup
# Module-level accumulator: fillUnivList() appends one list of cell values
# per table row, and printUnivList() reads the rows back by index.
allUniv = []
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no data". Only request
        # failures are swallowed, so Ctrl-C and programming errors surface.
        return ""
    # Force UTF-8 instead of the encoding guessed from the response headers.
    r.encoding = 'utf-8'
    return r.text
def fillUnivList(soup):
    """Append one entry to the module-level ``allUniv`` per data row in *soup*.

    Every ``<tr>`` containing at least one ``<td>`` contributes a list with
    the ``.string`` of each of its cells, in document order. Rows without
    any ``<td>`` are skipped.

    Args:
        soup: Parsed document exposing ``find_all``.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue
        allUniv.append([cell.string for cell in cells])
def printUnivList(num, province='江西'):
    """Print the universities of one province found in the first *num* rows.

    Scans up to *num* leading rows of the module-level ``allUniv`` and prints
    those whose cell 2 equals *province*, numbering them 1, 2, 3, ... in the
    first column. The remaining columns are cells 0, 1, 2, 3 and 6.

    Args:
        num: Maximum number of leading ``allUniv`` rows to scan. Values larger
            than the number of stored rows are clamped, so asking for more
            rows than were scraped no longer raises ``IndexError``.
        province: Value cell 2 must equal for a row to be printed. Defaults
            to the province the original script hard-coded.
    """
    # chr(12288) is the full-width (ideographic) space, used as the pad
    # character so padding matches the width of CJK glyphs.
    fill = chr(12288)
    print("{1:{0}^4}{2:{0}^6}{3:{0}^10}{4:{0}^6}{5:{0}^8}{6:{0}^10}".format(fill,"省内排名","全国排名","学校名称","省市","总分","培养规模"))
    rank_in_province = 0
    # max(num, 0) keeps a negative num printing nothing, as range(num) did.
    for u in allUniv[:max(num, 0)]:
        if len(u) < 7 or u[2] != province:
            # Too few cells to print, or a different province.
            continue
        rank_in_province += 1
        # A missing cell value (None) cannot take an alignment spec; show it blank.
        cells = ["" if c is None else c for c in u]
        print("{1:{0}^6}{2:{0}^6}{3:{0}^10}{4:{0}^6}{5:{0}^9}{6:{0}^11}".format(fill,rank_in_province,cells[0],cells[1],cells[2],cells[3],cells[6]))
def main():
    """Fetch the hard-coded ranking page, parse it, and scan the first 500 rows."""
    page = getHTMLText('http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html')
    parsed = BeautifulSoup(page, "html.parser")
    fillUnivList(parsed)
    printUnivList(500)


main()
正文:写一个获取按当前在线玩家人数排列的最热门游戏的小程序
学习完上面的爬虫入门程序后的某一天,我碰巧浏览到 Steam 网站,冒出一个灵感:写一个获取按当前在线玩家人数排列的最热门游戏的小程序。
按F12发现,这个网页的源码正好适合新手练习。
干脆动手写一写吧!
源码
import requests
from bs4 import BeautifulSoup
import csv
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        # Without a timeout requests may wait on an unresponsive server
        # indefinitely; 30 seconds matches the earlier scripts in this file.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no data". Only request
        # failures are swallowed, so Ctrl-C and programming errors surface.
        return ""
    # Force UTF-8 instead of the encoding guessed from the response headers.
    r.encoding = 'utf-8'
    return r.text
def getMostPopularGamesList(steamHTMLText):
    """Extract one data list per game row from the Steam stats page HTML.

    Follows the textbook university-ranking example: each row's values are
    gathered into a ``singleGameData`` list, and those lists are gathered
    into ``gameList``.

    Args:
        steamHTMLText: HTML source of the stats page (may be empty).

    Returns:
        A list with one entry per ``<tr class="player_count_row">``. Each
        entry holds the ``.string`` of every ``<span class="currentServers">``
        in the row followed by the ``.string`` of every
        ``<a class="gameLink">``. Per the original author's notes these are
        the current player count, today's peak and the game name; confirm
        against the live page.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(steamHTMLText, "html.parser")
    gameList = []
    for tr in soup.find_all("tr", {"class": "player_count_row"}):
        singleGameData = [
            span.string
            for span in tr.find_all("span", {"class": "currentServers"})
        ]
        singleGameData.extend(
            a.string for a in tr.find_all("a", {"class": "gameLink"})
        )
        gameList.append(singleGameData)
    return gameList
def printList(gameList):
    """Print a title, a header line and one ranked line per game.

    Args:
        gameList: Sequence of per-game lists whose first three items are
            printed under the header's current-players, today's-peak and
            game-name columns. The rank shown is the 1-based position in
            *gameList*.
    """
    # chr(12288) is the full-width (ideographic) space, used as the pad
    # character so padding matches the width of CJK glyphs.
    fill = chr(12288)
    print("依据当前玩家人数排列的最热门游戏")
    print("{1:{0}<4}{2:{0}<8}{3:{0}<10}{4:{0}<10}".format(fill,"排名","当前玩家人数","今日峰值","游戏"))
    # Iterate the argument itself rather than a module-level counter, so the
    # function works for any caller and any list length.
    for rank, g in enumerate(gameList, start=1):
        if len(g) < 3:
            # Not enough fields to fill the three data columns; skip the row.
            continue
        # A missing value (None) cannot take an alignment spec; show it blank.
        current, peak, name = ("" if v is None else v for v in g[:3])
        print("{1:{0}<4}{2:{0}<8}{3:{0}<10}{4:{0}^10}".format(fill,rank,current,peak,name))
if __name__ == '__main__':
    # Script entry point: download the Steam stats page, extract the game
    # rows, and print them as a table.
    url = "https://store.steampowered.com/stats/"
    steamHTMLText = getHTMLText(url)
    gameList = getMostPopularGamesList(steamHTMLText)
    # Row count, kept as a module-level name. Author's note: the site lists
    # only the top 100 games, so this is expected to be 100.
    num = len(gameList)
    printList(gameList)
当然,Steam 上还有其他数据可以作为爬虫的练习,例如 Steam 的硬件和软件调查等: