爬虫获取斗鱼主播人气

获取斗鱼 DOTA2 分区页面中各主播的人气值,并按人气从高到低排序输出。

 

代码:

import requests
import re
import random

class Spider():
    """Scrape a Douyu category page and print its anchors ranked by popularity.

    ``go()`` downloads the page at ``url``, extracts each anchor's name and
    popularity text with the regular expressions below, and prints the
    anchors from most to least popular.
    """

    # Category page to scrape; e.g. 'https://www.douyu.com/g_LOL' for another game.
    url = 'https://www.douyu.com/g_DOTA2'
    # Captures the inner HTML of one "DyListCover-info" div.
    root_pattern = r'<div class="DyListCover-info">([\d\D]*?)</div>'
    # Captures up to 20 characters between an SVG icon and the closing </h2>.
    name_pattern = r'</use></svg>([\d\D]{0,20}?)</h2>'
    # Captures the text between an SVG icon and the closing </span>.
    number_pattern = r'</use></svg>([\d\D]*?)</span>'

    def __fetch_content(self):
        """Download the category page and return its body as text."""
        # Without a timeout a stalled connection would block the script forever.
        r = requests.get(Spider.url, timeout=10)
        return r.text

    def __analysis(self, htmls):
        """Extract the raw name / popularity matches for every anchor block.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts whose
        values are the (possibly empty) lists produced by ``re.findall``.
        """
        # Only every second info block (indices 1, 3, 5, ...) is kept.
        # NOTE(review): presumably the even-indexed blocks do not carry the
        # name/popularity markup -- confirm against the live page.
        root_html = re.findall(Spider.root_pattern, htmls)[1::2]
        return [
            {
                'name': re.findall(Spider.name_pattern, html),
                'number': re.findall(Spider.number_pattern, html),
            }
            for html in root_html
        ]

    def __refine(self, anchors):
        """Reduce each anchor to its first name / number match, stripped.

        Anchors for which either pattern matched nothing are dropped, so a
        single unexpected block cannot abort the whole run with IndexError.
        """
        refined = []
        for anchor in anchors:
            if not anchor['name'] or not anchor['number']:
                continue
            refined.append({
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0].strip(),
            })
        return refined

    def __sort(self, anchors):
        """Return the anchors ordered from highest to lowest popularity."""
        # sorted() is stable, so shuffling a copy first makes the relative
        # order of anchors with equal popularity random on each run.
        shuffled = list(anchors)
        random.shuffle(shuffled)
        return sorted(shuffled, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an anchor's popularity text into a number for sorting.

        Reads the first integer or decimal number in ``anchor['number']`` and
        multiplies it by 10000 when the text contains the unit '万'. Returns
        0.0 when the text contains no digits at all.
        """
        text = anchor['number']
        # Match the fractional part too: with digits only, '1.5万' would be
        # read as 1 and ranked as 10000 instead of 15000.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, anchors):
        """Print one 'rank N name number' line per anchor, ranks starting at 1."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank', rank, anchor['name'], anchor['number'])

    def go(self):
        """Run the whole pipeline: fetch, parse, clean, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)


# Script entry: build a Spider and run it once. go() downloads the page at
# Spider.url and prints the ranked anchors to stdout.
spider = Spider()
spider.go()

 

结果:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值