# 今天小编分享一下,用原生方式爬取熊猫平台的数据,希望能帮助大家。
from urllib import request
import re
class Reptile(object):
    """Scrape panda.tv category pages and print the rooms ranked by popularity.

    The pipeline, driven by :meth:`go`, is: download a category page
    (:meth:`read_html`), pull the raw fields out of every ``video-info``
    block with regular expressions (:meth:`fetch_html`), strip surrounding
    whitespace (:meth:`wipe_html`), sort by popularity (:meth:`rank`) and
    print the result (:meth:`show_rank`).
    """

    # Raw strings keep the regex escapes (\s, \S) out of Python's own
    # string-escape processing; the pattern text itself is unchanged.
    # One "video-info" <div> per room; the capture is everything up to the
    # first closing </div>.
    url_pattern = r'<div class="video-info">([\S\s]*?)</div>'
    # Room name: text between a closing </i> and the following </span>.
    url_name = r'</i>([\s\S]*?)</span>'
    # Popularity value as displayed on the page.
    url_number = r'<span class="video-number">([\s\S]*?)</span>'
    # Online-viewer count as displayed on the page.
    url_station_num = r'<i class="video-station-num">([\s\S]*?)</i>'

    # Prefix every category slug is appended to.
    base_url = "https://www.panda.tv/cate/"
    # Category slugs scraped by default when go() is called without arguments.
    categories = (
        'lol', 'fortnite', 'hearthstone', 'overwatch', 'dota1', 'dota2',
        'cod15', 'war3', 'cf', 'csgo', 'heroes', 'starcraft', 'boardgames',
        'artifact',
    )

    def read_html(self, urls):
        """Download ``urls`` and return the response body decoded as UTF-8.

        Errors raised by ``urlopen`` or by decoding propagate to the caller.
        """
        # The context manager closes the HTTP response even if read() or
        # decode() raises.
        with request.urlopen(urls) as response:
            return response.read().decode("utf-8")

    def fetch_html(self, new_html):
        """Extract the raw room fields from a page.

        Returns one dict per ``video-info`` block with the keys ``name``,
        ``number`` and ``station_num``; each value is the *list* of regex
        matches found inside that block (possibly empty).
        """
        return [
            {
                'name': re.findall(self.url_name, block),
                'number': re.findall(self.url_number, block),
                'station_num': re.findall(self.url_station_num, block),
            }
            for block in re.findall(self.url_pattern, new_html)
        ]

    @staticmethod
    def _first_stripped(matches):
        """Return the first match with whitespace stripped, or '' if none."""
        return matches[0].strip() if matches else ''

    def wipe_html(self, wipes):
        """Collapse each field's match list to its first match, stripped.

        A field whose regex matched nothing becomes an empty string instead
        of raising ``IndexError``, so one malformed room does not abort the
        whole page.
        """
        return [
            {
                'name': self._first_stripped(wipe['name']),
                'number': self._first_stripped(wipe['number']),
                'station_num': self._first_stripped(wipe['station_num']),
            }
            for wipe in wipes
        ]

    def rank(self, wipe):
        """Return the rooms sorted by popularity, highest first."""
        return sorted(wipe, key=self.sort_rank, reverse=True)

    def sort_rank(self, wipe1):
        """Convert a room's displayed popularity text into a sortable float.

        The first number in ``wipe1['number']`` is parsed including any
        decimal part; when the text contains '万' the value is multiplied by
        10000. Text containing no digits sorts as ``0.0``.
        """
        text = wipe1['number']
        # Match an optional fractional part so "1.5万" reads as 1.5, not 1.
        found = re.search(r'\d+(?:\.\d+)?', text)
        if found is None:
            return 0.0
        value = float(found.group())
        if '万' in text:
            value *= 10000
        return value

    def show_rank(self, rank_result):
        """Print one line per room: position, name, popularity, online count."""
        for position, room in enumerate(rank_result, start=1):
            print("排名: ", position, " 房间名:", room['name'], " 人气值:", room['number'], " 在线人数:", room['station_num'])

    def go(self, categories=None):
        """Scrape, rank and print every category.

        ``categories`` is an optional iterable of category slugs; when
        omitted the class-level ``categories`` are used. Download errors are
        not caught here, so a failing category stops the run.
        """
        if categories is None:
            categories = self.categories
        for category in categories:
            print('*'*30+str(category)+'*'*30)
            page = self.read_html(self.base_url + str(category))
            rooms = self.wipe_html(self.fetch_html(page))
            self.show_rank(self.rank(rooms))
            print("\n")
# Instantiate the scraper and run it. These statements are not guarded by
# `if __name__ == "__main__"`, so they also execute when the file is imported.
reptile = Reptile()
reptile.go()