利用斗鱼 API 学习爬取斗鱼直播间信息

使用 Python 爬取斗鱼房间的弹幕

斗鱼弹幕服务 API:https://open.douyu.com/source/api/63

基本步骤

  1. 连接斗鱼api服务器
  2. 构造登录请求
  3. 进入房间并构造获取弹幕请求
  4. 保持心跳
  5. 断开连接
import multiprocessing
import socket
import time
import re
import signal
import jieba
import requests
from bs4 import BeautifulSoup as bs4
from openpyxl import Workbook
from wordcloud.wordcloud import WordCloud
import matplotlib.pyplot as plt
import json

# Open a TCP socket to the Douyu danmu proxy server.
# NOTE: this runs at import time, so merely importing this module performs a
# DNS lookup and a network connect. `client` is the shared connection used by
# the send/receive functions below.
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("danmuproxy.douyu.com")
port = 8601
client.connect((host, port))

# Compiled byte patterns for pulling fields out of raw server packets:
# the sender's level, the sender's nickname and the chat text.
level_re = re.compile(rb'/level@=(.+?)/sahf@')
username_re = re.compile(rb'nn@=(.+?)/txt@')
danmu_re = re.compile(rb'txt@=(.+?)/cid@')

# Look up a room's display title from its web page.
def get_room_name(roomid):
    """Return the title shown on a Douyu room page, or '' if unavailable.

    Args:
        roomid: Room number (str or int); appended to the Douyu base URL.

    Returns:
        The joined ``title`` attribute value(s) of the ``h2`` element with
        class ``Title-anchorNameH2`` inside the ``div`` with class
        ``layout-Player-title``. An empty string is returned when the request
        fails or when either element (or the attribute) is missing.
    """
    try:
        # A timeout keeps a stalled web server from hanging the whole script.
        res = requests.get('http://www.douyu.com/' + str(roomid), timeout=10)
    except requests.RequestException:
        # The name is only used for display, so a failed lookup is not fatal.
        return ''

    soup = bs4(res.text, 'lxml')
    container = soup.find(name='div', attrs={"class": 'layout-Player-title'})
    if container is None:
        # find() returns None when nothing matches (page layout changed,
        # room does not exist, ...).
        return ''
    heading = container.find(name='h2', attrs={"class": 'Title-anchorNameH2'})
    if heading is None:
        return ''

    # get_attribute_list may yield None when the attribute is absent; skip
    # such entries instead of letting str.join raise.
    values = heading.get_attribute_list('title')
    return ''.join(v for v in values if isinstance(v, str))

def send_req_msg(msgstr):
    """Frame ``msgstr`` as a Douyu API packet and send it over ``client``.

    The packet consists of a 12-byte little-endian header (the data length
    written twice, followed by the message code 689) and the UTF-8 encoded
    payload.

    Args:
        msgstr: Serialized request text, e.g. ``'type@=loginreq/roomid@=1/'``.
            A trailing NUL terminator is appended when the caller omitted it,
            so every payload ends the same way as the login/join requests
            built elsewhere in this file.

    Raises:
        OSError: If the socket send fails.
    """
    if not msgstr.endswith('\0'):
        msgstr += '\0'

    msg = msgstr.encode('utf-8')
    # The length field counts the payload plus 8 header bytes (presumably the
    # second length field and the 4-byte code field -- confirm against the
    # protocol document).
    data_length = len(msg) + 8
    code = 689
    # Build the protocol header.
    msgHead = (
        int.to_bytes(data_length, 4, 'little')
        + int.to_bytes(data_length, 4, 'little')
        + int.to_bytes(code, 4, 'little')
    )
    # sendall() retries until every byte is written; a bare send() may write
    # only part of the buffer, which previously could truncate the header.
    client.sendall(msgHead + msg)


# Save the collected barrages to an Excel workbook.
def save_to_excel(room_name, barrage_list):
    """Write barrage rows to ``danmu.xlsx`` in the current working directory.

    Args:
        room_name: Room title. It is not used for the output file name; the
            parameter is kept so existing callers keep working.
        barrage_list: Iterable of rows. The first three items of each row are
            written as one worksheet row, in order.

    A row that cannot be written is reported on stdout (with its zero-based
    position in ``barrage_list``) and skipped; the remaining rows are still
    saved.
    """
    wb = Workbook()
    ws = wb.active
    for count, bl in enumerate(barrage_list):
        try:
            ws.append([bl[0], bl[1], bl[2]])
        except Exception:
            # Deliberately best-effort per row, but no longer a bare except,
            # so Ctrl+C / SystemExit are not swallowed.
            print('第%d条弹幕信息保存失败' % count)
    wb.save(r'danmu' + '.xlsx')

# Collect barrages from a room, then build the word cloud and the Excel export.
def _extract_barrages(data):
    """Return decoded ``[level, username, text]`` rows found in one chunk.

    The three module-level patterns are applied to ``data`` independently and
    their matches are paired up by position.
    NOTE(review): this assumes every chat message in the chunk carries all
    three fields; a message missing one would shift the pairing -- confirm
    against real traffic.
    """
    rows = []
    matches = zip(
        level_re.findall(data),
        username_re.findall(data),
        danmu_re.findall(data),
    )
    for level, username, content in matches:
        try:
            rows.append([
                level.decode('utf8'),
                username.decode('utf8'),
                content.decode('utf8'),
            ])
        except UnicodeDecodeError:
            # Skip messages whose bytes are not valid UTF-8.
            continue
    return rows


def _show_word_cloud(words):
    """Render, save (``barrages_cloud.jpg``) and display a word cloud."""
    # The font path is configured here.
    Words_Cloud = WordCloud(background_color="black", width=900, height=600,
                            max_words=100, font_path="simkai.ttf")
    process_word = Words_Cloud.process_text(words)
    if not process_word:
        # Nothing to draw; generate_from_frequencies rejects an empty mapping.
        print('没有可用于生成词云的弹幕')
        return
    Words_Cloud.generate_from_frequencies(process_word)
    print('成功生成词云...')
    image = Words_Cloud.to_image()
    plt.imshow(image)  # show the picture
    plt.axis('off')  # hide the axes
    plt.savefig('barrages_cloud.jpg')
    plt.show()


def DM_start(roomid, barrage_num):
    """Log in to a room, collect barrages, then export and visualise them.

    Steps: send the login and join-group requests, read from ``client`` until
    ``barrage_num`` chat messages have been collected (or the server closes
    the connection), draw a word cloud from the chat text, save all rows to
    Excel via ``save_to_excel`` and finally call ``logout``.

    Args:
        roomid: Room number to join.
        barrage_num: Number of chat messages to collect before stopping.
    """
    # Login/authorisation request.
    send_req_msg('type@=loginreq/roomid@={}/\0'.format(roomid))
    # Request to start receiving the room's barrage messages.
    send_req_msg('type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid))
    room_name = get_room_name(roomid)
    print('已连接至\"{}\"的直播间'.format(room_name))
    # First row is the column header for the Excel sheet.
    barrage_list = [['等级', '昵称', '弹幕']]
    print("弹幕正在获取中...")

    collected = 0
    while collected < barrage_num:
        # Raw data returned by the server.
        data = client.recv(1024)
        if not data:
            # recv() returning b'' means the peer closed the connection;
            # retrying would spin forever, so stop with what we have.
            print('连接已断开,停止获取弹幕')
            break
        for row in _extract_barrages(data):
            barrage_list.append(row)
            collected += 1
            if collected >= barrage_num:
                break
    print('已成功获得%d条弹幕' % collected)

    # Build the word cloud from the chat text only (skip the header row so
    # the column title does not end up in the cloud).
    all_barrages = ''.join(str(bl[2]) for bl in barrage_list[1:])
    all_barrages = filterword(all_barrages)
    words = ' '.join(jieba.cut(all_barrages))
    _show_word_cloud(words)

    print('数据开始导入Excel中')
    save_to_excel(room_name, barrage_list)
    # save_to_excel writes a relative path, i.e. into the working directory.
    print('导入成功,保存在当前目录')
    print(words)
    logout()


# Filter helper: clean the text by deleting unwanted punctuation.
def filterword(filterdata):
    """Return ``filterdata`` with punctuation removed and edge spaces trimmed.

    Every character listed in the symbol set below (full-width and ASCII
    punctuation, backslash, hyphen, newline) is deleted, then leading and
    trailing space characters are stripped.
    """
    unwanted = ',。“”~!@#¥%……&*()——+=【】{}、|;:‘’《》?!#$^&()[]{};:",.<>/?\\-\n'
    drop_table = str.maketrans('', '', unwanted)
    cleaned = filterdata.translate(drop_table)
    return cleaned.strip(' ')

def keeplive():
    """Keep the connection alive by sending a heartbeat every 45 seconds.

    Runs forever; intended to be executed in its own process or thread.
    """
    while True:
        # Terminate the payload with NUL, like the login/join requests built
        # elsewhere in this file.
        msg = 'type@=mrkl/\0'
        send_req_msg(msg)
        print('发送心跳包')
        time.sleep(45)


def logout():
    """Send the logout request to the Douyu server and report it on stdout."""
    # Terminate the payload with NUL, like the login/join requests built
    # elsewhere in this file.
    msg = 'type@=logout/\0'
    send_req_msg(msg)
    print('已经退出服务器')


def signal_handler(signal, frame):
    """Handle Ctrl+C (``signal.SIGINT``): stop the heartbeat, log out, exit.

    Args:
        signal: Signal number passed in by the interpreter (unused).
        frame: Current stack frame passed in by the interpreter (unused).

    Raises:
        SystemExit: Always, after cleanup, so the program actually stops
            instead of resuming whatever was interrupted.
    """
    # The heartbeat process lives in the module-level name ``p``; it may not
    # exist yet if the signal arrives early, so look it up defensively.
    heartbeat = globals().get('p')
    if heartbeat is not None and heartbeat.is_alive():
        heartbeat.terminate()
    try:
        logout()
    except OSError as err:
        # The connection may already be gone; still finish shutting down.
        print('登出失败: %s' % err)
    print('Bye')
    raise SystemExit(0)

# Save the live-room listing to an Excel workbook.
def save_to_excel2(zhibo):
    """Write live-room rows to ``直播间信息.xlsx`` in the working directory.

    Args:
        zhibo: Iterable of rows, each appended as-is beneath the header row
            (anchor, room number, room name, heat).

    A row that cannot be written is reported on stdout with its 1-based
    position and skipped.
    """
    wb = Workbook()
    ws = wb.active
    ws.append(['主播','房间号','房间名','热度'])
    for index, row in enumerate(zhibo, start=1):
        try:
            ws.append(row)
        except Exception:
            # The previous message used '%条', an invalid format specifier
            # that itself raised ValueError; report the row number instead.
            print('第%d条信息保存失败!' % index)
    wb.save(r'直播间信息' + '.xlsx')
    print('写入成功!')

# Scrape the League of Legends live-room listing.
def catch():
    """Fetch two listing pages, rank rooms by heat, save and print them.

    Each room becomes ``[anchor, room number, room name, heat]``. The full
    list, ordered from highest to lowest heat, is written out through
    ``save_to_excel2``; the first ten entries are printed as a table.

    Returns:
        The ranked list of room rows.
    """
    zhibo = []
    for page in range(1, 3):
        url = 'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/{}'.format(page)
        response = requests.get(url)
        # Decode the JSON body; the room entries sit under data -> rl.
        payload = json.loads(response.text)
        rooms = payload['data']['rl']
        for room in rooms:
            anchor = room['nn']        # anchor name
            room_number = room['rid']  # room number
            heat = room['ol']          # heat
            room_title = room['rn']    # room name
            zhibo.append([anchor, room_number, room_title, int(heat)])

    # Ascending sort followed by a reversal puts the hottest rooms first.
    zhibo.sort(key=lambda row: row[3])
    zhibo.reverse()
    save_to_excel2(zhibo)

    # Print the ten hottest rooms, padded with the full-width space.
    tplt = '{0:{4}<10}\t{1:{4}<7}\t{2:{4}^15}\t{3:{4}<8}'
    pad = chr(12288)
    print(tplt.format('主播', '房间号', '房间名', '热度', pad))
    for row in zhibo[:10]:
        print(tplt.format(row[0], str(row[1]), row[2], str(row[3]), pad))
    return zhibo

if __name__ == '__main__':
    # Show the current room ranking so the user can pick a room.
    zhibo = catch()
    room_id = input('请输入房间ID:')
    barrage_num = int(input('请输入需要的弹幕数量:'))

    # Start the heartbeat BEFORE the blocking collection so the connection is
    # kept alive while barrages are being read. It is started before the
    # SIGINT handler is installed so the child does not inherit the handler.
    # NOTE(review): with the "spawn" start method (default on Windows/macOS)
    # the child re-imports this module and opens its own connection, so the
    # heartbeat would not reach this session -- confirm on the target OS.
    p = multiprocessing.Process(target=keeplive, daemon=True)
    p.start()

    # Install the Ctrl+C handler.
    signal.signal(signal.SIGINT, signal_handler)
    try:
        DM_start(room_id, barrage_num)
    finally:
        # Collection is done (or failed): stop the heartbeat so the program
        # can terminate instead of pinging forever.
        if p.is_alive():
            p.terminate()
        p.join()

运行结果如图所示:
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值