采集某站热搜排行榜(阅读量,评论量和点赞量)

总的思想:

任务一 采集某站 综合热门

任务二 采集某站 排行榜下的全站

任务三 采集排行榜下的动画

无加密,无非就是编码的问题

任务一做法:

检查网页后发现,可以直接从 network 中拿到 api_url,所以直接采集 api 就可以拿到数据。api 返回的数据里只有播放量和评论量,没有点赞量;点赞量要点进视频页面才能获取到,所以还需要找到视频页面的 url。视频页面 url 所需的参数(bvid)已经包含在 api 返回的数据里,直接拿来用就好了。

任务二和任务三的做法雷同:

首先检查network中是否存在api,发现没有api,只能采集网页了。然后排行榜里面的页面也只是含有播放量,评论,没有点赞。所以还需要进入视频页面拿到点赞量,视频页面的url也可以从排行榜这个页面拿到。

这一次要分享的内容,是我帮他人采集的某站热搜排行榜。之前没有采集过某站,以为某站是有反爬功能的;还有人说要采集弹幕,弹幕无非就是底下的那些评论嘛。经过检查,发现好像并没有什么加密。没加密的都挺简单的。废话不多说了!看一下本次要采集的内容!

具体的详细过程我不能说得太细,否则文章发不出来,你们可以到知乎里面看看,地址为:

具体代码:

#-*- coding:utf-8 -*-
import json
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import requests
# HTML parsing libraries (BeautifulSoup and lxml)
from bs4 import BeautifulSoup
from lxml import etree
# Lowercase aliases for False/None. Presumably here so JSON-style literals can
# be pasted into the script unchanged -- TODO confirm; nothing visible in this
# file reads them.
false = False
null = None
# Plotting
def drawHistogram(x0, y1, y2, label1, label2, title):
    """Show a grouped bar chart comparing two equally long data series.

    Args:
        x0: Accepted for backward compatibility but not used; the x-axis tick
            labels are the zero-based positions of the items.
        y1: Bar heights of the first series.
        y2: Bar heights of the second series; must be as long as ``y1``.
        label1: Legend label for ``y1``.
        label2: Legend label for ``y2``.
        title: Chart title.

    Raises:
        ValueError: If ``y1`` and ``y2`` have different lengths.
    """
    if len(y1) != len(y2):
        raise ValueError(
            "y1 and y2 must have the same length, got %d and %d"
            % (len(y1), len(y2))
        )

    # Presumably selected so the Chinese title and axis label render
    # correctly -- TODO confirm the font is installed on the target machine.
    matplotlib.rc("font", family='MicroSoft YaHei')

    length = len(y1)
    x = np.arange(length)  # one tick position per item
    # Derive the tick labels from the data. They used to be a hard-coded
    # 0..19 list, which only matched the tick positions for exactly 20 items.
    tick_labels = list(range(length))

    plt.figure()
    total_width, n = 1, 2  # total width of one group, number of series
    width = total_width / n  # width of a single bar
    x1 = x - width / 2  # centre of the first series' bars
    x2 = x1 + width  # centre of the second series' bars

    plt.title(title)
    plt.ylabel("数量")
    plt.bar(x1, y1, width=width, label=label1)
    plt.bar(x2, y2, width=width, label=label2)
    plt.xticks(x, tick_labels, rotation=30)  # label each group with its index
    plt.legend()
    plt.show()

# Ranking pages: site-wide ranking and animation ranking
_RANK_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")
# Unit suffixes used in the displayed counters, largest first.
_RANK_UNITS = (("亿", 100000000), ("万", 10000))


def _rank_stat_to_int(text):
    """Convert a displayed counter ("123", "12.3万", "100万", "1.2亿") to an int.

    Text without a unit suffix is handled as before: all digits in it are
    joined and converted. Text with a unit is parsed as a decimal number and
    multiplied by the unit, so the number of decimal places no longer matters.

    Raises:
        ValueError: If the text contains no number.
    """
    for unit, factor in _RANK_UNITS:
        if unit in text:
            match = _RANK_NUMBER_RE.search(text)
            if match is None:
                raise ValueError("no numeric value in %r" % (text,))
            # round() absorbs float noise such as 12.3 * 10000.
            return int(round(float(match.group()) * factor))
    return int("".join(filter(str.isdigit, text)))


def get_data(url, name):
    """Scrape a ranking page and plot play counts against like counts.

    Downloads the ranking page at ``url``, prints the texts of the elements
    with class "title", collects up to 20 play counts from the page and up to
    20 like counts (one extra request per video through ``get_praise_1``),
    prints both lists and passes them to ``drawHistogram``.

    Args:
        url: Address of the ranking page.
        name: Chart title handed to ``drawHistogram``.

    Raises:
        requests.RequestException: If the ranking page cannot be fetched.
        ValueError: If a play count text contains no number.
    """
    # NOTE(review): this cookie carries hard-coded account session values
    # (SESSDATA, bili_jct). Credentials should not live in source code --
    # consider loading them from configuration instead.
    headers = {
        'cookie': "buvid3=51EECB8F-B2B4-F648-B16B-3485D6761C0437292infoc; b_nut=1663560437; i-wanna-go-back=-1; _uuid=42CBCB23-A539-DE8E-61B3-106A47517E4CC36605infoc; buvid4=1D9BE17F-5D9E-9B18-FD37-0F39ECBB843038064-022091912-JDMlVHKZXid/ggr/2NfjCA==; fingerprint=22087719bd29558ec54024d294426d83; buvid_fp_plain=undefined; nostalgia_conf=-1; rpdid=|(J|)l)R)R|k0J'uYYJlY|RlJ; DedeUserID=1639478777; DedeUserID__ckMd5=d8f72f04917d4ed9; buvid_fp=cd9174fe898e94313f12161d55f5c6d4; b_ut=5; SESSDATA=99311495,1679456980,62d1c*91; bili_jct=cda1af7ea2aebcb2d84117106aeb7b9c; b_lsid=B7F1AFEA_183724A44BE; CURRENT_FNVAL=4048; sid=75325who; bp_video_offset_1639478777=709429817518325800; innersign=0; PVID=12",
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    # A timeout keeps a stalled connection from hanging the script forever;
    # raise_for_status() stops an error page from being parsed as a ranking.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    tree = etree.HTML(response.text)

    title = tree.xpath('//*[@class="title"]/text()')
    print(title)

    # Per the original notes, the "data-box" texts alternate between play
    # count and comment count, so the even positions among the first 40
    # nodes are the play counts of the first 20 entries.
    stat_texts = tree.xpath('//*[@class="data-box"]/text()')
    y1 = [_rank_stat_to_int(text.strip()) for text in stat_texts[:40:2]]

    # The like count is read from each video's own page; the video id is the
    # last path segment of the entry's link.
    x = []
    y2 = []
    video_links = tree.xpath('//*[@class="img"]/a/@href')
    for href in video_links[:20]:
        # rstrip so a trailing slash does not produce an empty video id.
        video_id = href.rstrip('/').split('/')[-1]
        praise_url = 'https://www.bilibili.com/video/' + str(video_id)
        y2.append(get_praise_1(praise_url, '//span[@class="info-text"]/text()'))

    print(y1)
    print(y2)
    drawHistogram(x, y1, y2, '播放量', '点赞量', name)


def get_api_data():
    """Fetch the "popular" list from the API and plot plays against likes.

    Requests the first page (20 entries) of the popular-videos API, collects
    each entry's title and play count from the JSON reply, fetches the like
    count of every video from its own page through ``get_praise_0``, prints
    the three lists and passes them to ``drawHistogram``.

    Raises:
        requests.RequestException: If the API request fails.
    """
    # NOTE(review): this cookie carries hard-coded account session values
    # (SESSDATA, bili_jct). Credentials should not live in source code --
    # consider loading them from configuration instead.
    headers = {
        'cookie': "buvid3=51EECB8F-B2B4-F648-B16B-3485D6761C0437292infoc; b_nut=1663560437; i-wanna-go-back=-1; _uuid=42CBCB23-A539-DE8E-61B3-106A47517E4CC36605infoc; buvid4=1D9BE17F-5D9E-9B18-FD37-0F39ECBB843038064-022091912-JDMlVHKZXid/ggr/2NfjCA==; fingerprint=22087719bd29558ec54024d294426d83; buvid_fp_plain=undefined; nostalgia_conf=-1; rpdid=|(J|)l)R)R|k0J'uYYJlY|RlJ; DedeUserID=1639478777; DedeUserID__ckMd5=d8f72f04917d4ed9; buvid_fp=cd9174fe898e94313f12161d55f5c6d4; b_ut=5; SESSDATA=99311495,1679456980,62d1c*91; bili_jct=cda1af7ea2aebcb2d84117106aeb7b9c; b_lsid=B7F1AFEA_183724A44BE; CURRENT_FNVAL=4048; sid=75325who; bp_video_offset_1639478777=709429817518325800; innersign=0; PVID=12",
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    hot_url = "https://api.bilibili.com/x/web-interface/popular"
    param = {
        'ps': 20,
        'pn': 1
    }
    # The paging values belong in the query string (params=). They used to be
    # sent as a JSON request body on a GET, so the API never received them.
    res_0 = requests.get(url=hot_url, params=param, headers=headers, timeout=10)
    res_0.raise_for_status()
    data = res_0.json()

    x = []   # titles
    y1 = []  # play counts
    y2 = []  # like counts
    # NOTE(review): the API's "stat" object may already contain a like count,
    # which would make the per-video page request unnecessary -- TODO confirm.
    for video in data['data']['list']:
        x.append(video['title'])
        y1.append(video['stat']['view'])
        url_praise = 'https://www.bilibili.com/video/' + str(video['bvid']) + '/'
        y2.append(get_praise_0(url_praise, '//span[@class="info-text"]/text()'))

    print(x)
    print("播放量"+str(y1))
    print("点赞量" + str(y2))

    drawHistogram(x, y1, y2, '播放量', '点赞量', '综合热门')

# Like counts scraped from individual video pages
# NOTE(review): this cookie carries hard-coded account session values
# (SESSDATA, bili_jct). Credentials should not live in source code --
# consider loading them from configuration instead.
_PRAISE_HEADERS = {
    'cookie': "buvid3=51EECB8F-B2B4-F648-B16B-3485D6761C0437292infoc; b_nut=1663560437; i-wanna-go-back=-1; _uuid=42CBCB23-A539-DE8E-61B3-106A47517E4CC36605infoc; buvid4=1D9BE17F-5D9E-9B18-FD37-0F39ECBB843038064-022091912-JDMlVHKZXid/ggr/2NfjCA==; fingerprint=22087719bd29558ec54024d294426d83; buvid_fp_plain=undefined; nostalgia_conf=-1; rpdid=|(J|)l)R)R|k0J'uYYJlY|RlJ; DedeUserID=1639478777; DedeUserID__ckMd5=d8f72f04917d4ed9; buvid_fp=cd9174fe898e94313f12161d55f5c6d4; b_ut=5; SESSDATA=99311495,1679456980,62d1c*91; bili_jct=cda1af7ea2aebcb2d84117106aeb7b9c; b_lsid=B7F1AFEA_183724A44BE; CURRENT_FNVAL=4048; sid=75325who; bp_video_offset_1639478777=709429817518325800; innersign=0; PVID=12",
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
_PRAISE_TIMEOUT = 10  # seconds
_PRAISE_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")
# Unit suffixes used in the displayed counters, largest first.
_PRAISE_UNITS = (("亿", 100000000), ("万", 10000))
# Second XPath tried by get_praise_1 when the caller's expression finds nothing.
_PRAISE_TOOLBAR_XPATH = '//*[@class="video-toolbar-content_item video-toolbar-hover"]/text()'


def _praise_text_to_int(text):
    """Convert a displayed counter ("123", "12.3万", "100万", "1.2亿") to an int.

    Text without a unit suffix is handled as before: all digits in it are
    joined and converted. Text with a unit is parsed as a decimal number and
    multiplied by the unit, so the number of decimal places no longer matters.

    Raises:
        ValueError: If the text contains no number.
    """
    for unit, factor in _PRAISE_UNITS:
        if unit in text:
            match = _PRAISE_NUMBER_RE.search(text)
            if match is None:
                raise ValueError("no numeric value in %r" % (text,))
            # round() absorbs float noise such as 12.3 * 10000.
            return int(round(float(match.group()) * factor))
    return int("".join(filter(str.isdigit, text)))


def _fetch_video_tree(url):
    """Download ``url`` and return its parsed lxml HTML tree."""
    response = requests.get(url=url, headers=_PRAISE_HEADERS, timeout=_PRAISE_TIMEOUT)
    response.raise_for_status()
    return etree.HTML(response.text)


# Like count for videos of the "popular" list
def get_praise_0(url, data):
    """Return the like count shown on the video page at ``url``.

    Args:
        url: Address of the video page.
        data: XPath expression selecting the like-count text; the first
            match is used.

    Returns:
        The like count as an int.

    Raises:
        requests.RequestException: If the page cannot be fetched.
        IndexError: If ``data`` matches nothing on the page.
        ValueError: If the matched text contains no number.
    """
    praise = _fetch_video_tree(url).xpath(data)
    if not praise:
        raise IndexError("XPath %r matched nothing at %s" % (data, url))
    return _praise_text_to_int(praise[0])


# Like count for videos of the site-wide and animation rankings
def get_praise_1(url, data):
    """Return the like count shown on the video page at ``url``.

    Works like ``get_praise_0`` but, when ``data`` matches nothing, falls back
    to the text nodes of the page's toolbar items and uses the third one
    (after stripping whitespace).

    Args:
        url: Address of the video page.
        data: XPath expression selecting the like-count text; the first
            match is used.

    Returns:
        The like count as an int.

    Raises:
        requests.RequestException: If the page cannot be fetched.
        IndexError: If neither ``data`` nor the fallback yields a value.
        ValueError: If the selected text contains no number.
    """
    tree = _fetch_video_tree(url)
    praise = tree.xpath(data)
    if praise:
        return _praise_text_to_int(praise[0])

    toolbar_texts = [text.strip() for text in tree.xpath(_PRAISE_TOOLBAR_XPATH)]
    if len(toolbar_texts) < 3:
        raise IndexError("no like count found at %s" % (url,))
    return _praise_text_to_int(toolbar_texts[2])



# Script entry: exactly one of the three jobs below is enabled at a time.

# Job 1 -- "popular" list via the API (disabled)
# get_api_data()

# Job 2 -- site-wide ranking, top 20 (disabled)
# quanzhan_url = 'https://www.bilibili.com/v/popular/rank/all'
# get_data(quanzhan_url, '排名全站')

# Job 3 -- animation ranking, top 20 (the job that runs as the file stands)
donghua_url = 'https://www.bilibili.com/v/popular/rank/douga'
get_data(donghua_url, '排名动画')



运行结果:

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值