Python 爬取斗鱼当前英雄联盟主播排名
代码
from urllib import request
from io import BytesIO
import gzip
import re
class Spider():
    """Scrape the Douyu League of Legends listing and print streamers by heat.

    The pipeline run by :meth:`go` is: download the listing page, save the
    raw HTML to ``htmls.html``, extract ``{'name': [...], 'number': [...]}``
    entries with regular expressions, sort them by heat (descending) and
    print the ranking.

    The class attributes below are the scraper's configuration; the regex
    patterns are tied to the page markup and will need updating whenever
    the site changes its HTML.
    """

    url = 'https://www.douyu.com/g_LOL'
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
    }
    # Seconds to wait for the listing page before giving up.
    timeout = 10
    root_pattern = '<div class="DyListCover-info">(.*?)</div>'
    name_pattern = '<div class="DyListCover-userName is-template">(.*)'
    number_pattern = '<use xlink:href="#icon-hot_8a57f0b"></use></svg>(.*?)</span>'
    # First decimal number inside a heat label such as "842.8万".
    _heat_value_re = re.compile(r'\d+(?:\.\d+)?')
    # Leading two bytes of every gzip stream.
    _GZIP_MAGIC = b'\x1f\x8b'

    @staticmethod
    def _decode_body(raw):
        """Return the response body *raw* (bytes) as UTF-8 text.

        The body is gunzipped only when it actually starts with the gzip
        magic number, so both compressed and plain responses are accepted.
        """
        if raw[:2] == Spider._GZIP_MAGIC:
            raw = gzip.decompress(raw)
        return raw.decode('utf-8')

    def __fetch_content(self):
        """Download the listing page and return its HTML as a string."""
        req = request.Request(self.url, headers=self.header)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req, timeout=self.timeout) as response:
            raw = response.read()
        return self._decode_body(raw)

    def __analysis(self, htmls):
        """Extract one ``{'name': [...], 'number': [...]}`` dict per room.

        Both values are the raw ``re.findall`` result lists, so either may
        be empty when the corresponding pattern does not match.
        """
        # Only every second info block is kept, as in the original scraper;
        # presumably each room card renders two such blocks and the second
        # one carries the name and heat -- verify against the live markup.
        fragments = re.findall(self.root_pattern, htmls)[1::2]
        return [
            {
                'name': re.findall(self.name_pattern, fragment),
                'number': re.findall(self.number_pattern, fragment),
            }
            for fragment in fragments
        ]

    def __save(self, htmls):
        """Write the downloaded HTML to ``htmls.html`` for inspection."""
        with open('htmls.html', 'w', encoding='utf-8') as fh:
            fh.write(htmls)

    def __transNumber(self, numberStr):
        """Convert a heat match list such as ``['842.8万']`` to a float.

        Only the first element is used. A label containing ``万`` is
        multiplied by 10000. An empty list, a non-string first element or
        a label without any digits yields ``0.0`` so that the result is
        always comparable.
        """
        if not numberStr or not isinstance(numberStr[0], str):
            return 0.0
        label = numberStr[0]
        match = self._heat_value_re.search(label)
        if match is None:
            return 0.0
        value = float(match.group())
        if '万' in label:
            value *= 10000
        return value

    def __vedioNumCmp(self, numberStr1, numberStr2):
        """Return True when the first heat list is strictly hotter than the second."""
        return self.__transNumber(numberStr1) > self.__transNumber(numberStr2)

    def __sort(self, vedio_list):
        """Return the named entries of *vedio_list* ordered by heat, hottest first.

        Entries whose ``'name'`` list is empty are skipped (not treated as
        the end of the data). Entries with equal heat keep their original
        relative order. A non-list argument yields an empty list.
        """
        if not isinstance(vedio_list, list):
            return []
        named = [entry for entry in vedio_list if entry['name']]
        # sorted() is stable, also with reverse=True, so ties stay in page order.
        return sorted(
            named,
            key=lambda entry: self.__transNumber(entry['number']),
            reverse=True,
        )

    def __show(self, anchors):
        """Print one ranking line per entry, starting at rank 1."""
        if not isinstance(anchors, list):
            return
        for rank, entry in enumerate(anchors, start=1):
            # An entry without a heat match is shown with heat "0" instead
            # of raising IndexError.
            heat = entry['number'][0] if entry['number'] else '0'
            print('排名第{}: 名称:{} 热度:{}'.format(rank, entry['name'][0], heat))

    def go(self):
        """Run the whole pipeline: fetch, save, parse, sort and print."""
        htmls = self.__fetch_content()
        self.__save(htmls)
        anchors = self.__analysis(htmls)
        anchors = self.__sort(anchors)
        self.__show(anchors)
if __name__ == '__main__':
    # Script entry point: run one fetch -> parse -> rank -> print pass.
    Spider().go()
结果展示:
排名第1: 名称:英雄联盟赛事 热度:842.8万
排名第2: 名称:东北大鹌鹑 热度:240.4万
排名第3: 名称:智勋勋勋勋 热度:234.5万
排名第4: 名称:南波儿大魔王丶 热度:205.4万
排名第5: 名称:洞主丨歌神洞庭湖 热度:203.9万
排名第6: 名称:不2不叫周淑怡 热度:201万
排名第7: 名称:灵药LoveLing 热度:183.9万
排名第8: 名称:叶音符 热度:155.1万
排名第9: 名称:梨落秋溪 热度:142.5万
排名第10: 名称:zzc啊哦额 热度:137.7万
排名第11: 名称:Xinyi新一丶 热度:126.4万
排名第12: 名称:杰克螳螂 热度:118.2万
排名第13: 名称:王纪超666 热度:112.6万
排名第14: 名称:小苏菲 热度:96.9万
排名第15: 名称:格局OoO 热度:95.4万
排名第16: 名称:魔獸后裔 热度:94.3万
排名第17: 名称:萌面酥 热度:82.5万
排名第18: 名称:Chogod虫王 热度:63.6万
排名第19: 名称:我也会饿龙咆哮嗷 热度:4.5万