爬虫练习-荔枝直播(分享页)

荔枝直播(分享页):

分享页主页直播列表:(抓包分析)

  • 接口:https://appweb.lizhi.fm/smallApp/getLiveList?pageNum=1
  • 支持获取n页数据,注意请求间隔;
  • get请求,获取分享页必须的 liveId 字段

直播分享页:

  • html:https://appweb.lizhi.fm/live/share?liveId=5190925580233002038&njId=2552360964061657132&duserId=138542e7ea551a918c42396e0488695b&from=iosBrowser
  • 中间两个参数非必须
  • get请求,获取 userId 及 liveUrl 字段
  • 该页面请求时需设置移动端User-Agent
  • 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'

编码实现:


import requests
import time
import json
import re

class LiveList(object):
    """Fetch the live-room list from the Lizhi share-page API, page by page.

    Instance fields:
        liveList:  records accumulated by getLiveListPages().
        pageLimit: highest page number that will be requested.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, pageLimit=10):
        self.liveList = []
        self.pageLimit = pageLimit

    @staticmethod
    def _parseDataList(text):
        """Return the ``ret.dataList`` list found in a JSON body, else ``[]``.

        Invalid JSON is printed and treated as an empty page. A body that
        parses but lacks the expected ``ret``/``dataList`` structure is also
        treated as an empty page rather than raising KeyError/TypeError.
        """
        try:
            infos = json.loads(text)
        except (ValueError, TypeError) as e:
            print(e)
            return []
        ret = infos.get('ret') if isinstance(infos, dict) else None
        dataList = ret.get('dataList') if isinstance(ret, dict) else None
        return dataList if isinstance(dataList, list) else []

    def getLiveListPage(self, pageNum=1):
        """Request one page of the list.

        Returns:
            A ``(count, records)`` tuple. A failed request or an unusable
            response body yields ``(0, [])`` after printing the error.
        """
        url = 'https://appweb.lizhi.fm/smallApp/getLiveList?pageNum=' + str(pageNum)
        print(url)
        # Pause before every request so successive pages are spaced out.
        time.sleep(1)
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return (0, [])
        response.encoding = 'utf-8'

        dataList = self._parseDataList(response.text)
        return (len(dataList), dataList)

    def getLiveListPages(self):
        """Fetch pages 1..pageLimit, stopping early at the first empty page.

        Records are appended to ``self.liveList``, which is also returned.
        """
        cur_page = 1
        while cur_page <= self.pageLimit:
            page_len, page_liveList = self.getLiveListPage(cur_page)
            self.liveList.extend(page_liveList)
            cur_page = cur_page + 1
            if page_len == 0:
                # An empty page means there is nothing further to fetch.
                break

        return self.liveList
    
def parseRegular(param="liveUrl"):
    """Compile a regex that captures the quoted value assigned to *param*.

    The pattern matches text of the form ``<param> = "<value>";`` and its
    single group captures ``<value>``, which cannot contain a semicolon.
    *param* is inserted into the pattern as-is (it is not escaped).
    """
    pattern_text = param + r' = \"([^;]*)\";'
    return re.compile(pattern_text, re.DOTALL)
    

def parseShareURL(liveId):
    """Fetch a live room's share page and extract its stream URL and user id.

    The page is requested with an iPhone User-Agent header.

    Args:
        liveId: id of the live room; converted with str() before it is
            placed in the URL.

    Returns:
        ``{"liveUrl": ..., "userId": ...}`` when the page contains a
        ``liveUrl = "...";`` assignment (``userId`` is None if no matching
        ``userId`` assignment is found), or None when no liveUrl is found.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    url1 = "https://appweb.lizhi.fm/live/share?liveId=" + str(liveId) + "&from=iosBrowser"
    headers = {
        'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 14_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
    }
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    response_1 = requests.get(url1, headers=headers, timeout=10)
    # Pause after each request so successive calls are spaced out.
    time.sleep(1)
    response_1.encoding = 'utf-8'

    liveUrl = parseRegular('liveUrl').findall(response_1.text)
    userId = parseRegular('userId').findall(response_1.text)

    if not liveUrl:
        return None
    # Guard the userId lookup: a page with a liveUrl but no userId used to
    # raise IndexError here.
    return {"liveUrl": liveUrl[0], "userId": userId[0] if userId else None}

if __name__ == '__main__':
    # Suggestion: re-run about once an hour to refresh the data.

    # Fetch one page of the live list and report how many records came back.
    liveList = LiveList(1).getLiveListPages()
    print('liveList count = ', len(liveList))

    # Merged records for the rooms whose share page yielded a result.
    final_res = []
    for liveItem in liveList:
        print(liveItem['liveId'])
        # Parse the share page of the current room.
        userInfo = parseShareURL(liveItem['liveId'])
        if userInfo is None:
            print("直播已结束!")
            continue
        print('直播中...')
        final_res.append({**liveItem, **userInfo})

    # Print the final result, then store it in a local JSON file.
    print(len(final_res), final_res)
    with open('liveList.json', 'w') as file_obj:
        json.dump(final_res, file_obj)

    print('over!!!')
                

Print:


https://appweb.lizhi.fm/smallApp/getLiveList?pageNum=1
liveList count =  10
5190925580233002038
直播中...
5386445300058660864
直播中...
5190898481141075510
直播中...
5190925580232704054
直播中...
5190925580233034294
直播中...
5190898481140796470
直播中...
5342496960865640448
直播中...
5386670725150109696
直播中...
5190898481141102646
直播中...
5190899559177836086
直播中...
10 [{'radioConver': 'http://cdnimg103.lizhi.fm/studio/2020/09/21/2829150432296549942.jpg', 'liveStatus': 1, 'totalCount': '25309', 'liveName': '大凯故事会', 'userName': '大凯说', 'liveId': '5190925580233002038', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2017/05/22/2603223407410991618_80x80.jpg', 'liveStartTime': 1626847200000, 'liveEndTime': 1626861600000, 'tag': '', 'liveUrl': 'http://pull102.gzlz307.com/home/6d944f6ab72b3d069517146587a23c39/playlist.m3u8?only-audio=1', 'userId': '2552360964061657132'}, {'radioConver': 'http://cdnimg103.lizhi.fm//studio/2021/07/14/2884134180267993654.jpg', 'liveStatus': 1, 'totalCount': '3154', 'liveName': '日常的午后尬聊', 'userName': '搞事儿ღ 养声糖', 'liveId': '5386445300058660864', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2020/07/26/2818635229221560322_80x80.jpg', 'liveStartTime': 1626856200000, 'liveEndTime': 1626861600000, 'tag': '脱口秀', 'liveUrl': 'http://pull102.gzlz307.com/home/94aa9ab2951090660da928e2418e5a76/playlist.m3u8?only-audio=1', 'userId': '14298657'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2020/10/19/2834415944099236918.jpg', 'liveStatus': 1, 'totalCount': '10128', 'liveName': '性感男神在线直播', 'userName': '王帅帅☀幸好有你', 'liveId': '5190898481141075510', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2021/04/25/2869209908380339202_80x80.jpg', 'liveStartTime': 1626854400000, 'liveEndTime': 1626856200000, 'tag': '', 'liveUrl': 'http://pull102.gzlz307.com/home/aef22384abe68f32729d855ee12b99bc/playlist.m3u8?only-audio=1', 'userId': '2545439804331933740'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/07/05/2882414548083745334.jpg', 'liveStatus': 1, 'totalCount': '11968', 'liveName': '百变老舅正在直播', 'userName': '老舅⁹ 招主播', 'liveId': '5190925580232704054', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2021/07/05/2882441263372247554_80x80.jpg', 'liveStartTime': 1626854419000, 'liveEndTime': 1626861619000, 'tag': '脱口秀', 'liveUrl': 
'http://pull102.gzlz307.com/home/2b5aa92a75c9a6b76fafebd6e38d218d/playlist.m3u8?only-audio=1', 'userId': '5086955461592188972'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/01/04/2848695138400300598.jpg', 'liveStatus': 1, 'totalCount': '2794', 'liveName': '夺宝、塔罗牌好运直播间❤️', 'userName': '墨子轩🍄招主播', 'liveId': '5190925580233034294', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2020/12/02/2842569104138020866_80x80.jpg', 'liveStartTime': 1626856200000, 'liveEndTime': 1626859800000, 'tag': '古风', 'liveUrl': 'http://pull102.gzlz307.com/home/02e57ae2959d3a3104caf23561438501/playlist.m3u8?only-audio=1', 'userId': '2555204161544508972'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/03/16/2861859265127815734.jpg', 'liveStatus': 1, 'totalCount': '1279', 'liveName': '今天也是圆气满满的一天鸭', 'userName': '٩🥳۶圆气满满鸭~🥀', 'liveId': '5190898481140796470', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2021/05/25/2874894204829581314_80x80.jpg', 'liveStartTime': 1626948000000, 'liveEndTime': 1626951600000, 'tag': '', 'liveUrl': 'http://pull102.gzlz307.com/home/6c9eec0f07b9d9c31ba9a63a090e42ad/playlist.m3u8?only-audio=1', 'userId': '5037691755063110188'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/06/18/2879319034757152310.jpg', 'liveStatus': 1, 'totalCount': '10569602', 'liveName': '恋行男友', 'userName': '恋行-高福利招人💝', 'liveId': '5342496960865640448', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2021/05/21/2874104725109944834_80x80.jpg', 'liveStartTime': 1621590400000, 'liveEndTime': 1624182400000, 'tag': '男友', 'liveUrl': 'http://pull102.gzlz307.com/home/3ceb87599a87bc1e7152ad4419faf21b/playlist.m3u8?only-audio=1', 'userId': '5023457267080509996'}, {'radioConver': 'http://cdnimg103.lizhi.fm//studio/2021/07/07/2882818898805649974.jpg', 'liveStatus': 1, 'totalCount': '1414', 'liveName': '甜妹求带飞上星星✨', 'userName': 'dy.奶糖_Jenny', 'liveId': '5386670725150109696', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2021/07/07/2882783128169811970_80x80.jpg', 'liveStartTime': 
1626856323000, 'liveEndTime': 1626859923000, 'tag': '情感', 'liveUrl': 'http://pull102.gzlz307.com/home/1c158cec050b10c5545fef19be825eed/playlist.m3u8?only-audio=1', 'userId': '5022849964157813292'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/07/20/2885259126691449398.jpg', 'liveStatus': 1, 'totalCount': '24076', 'liveName': '♬.星辰音乐电台 --温暖治愈', 'userName': '星辰✨冠名LuLu👑', 'liveId': '5190898481141102646', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2020/09/14/2827924329089217026_80x80.jpg', 'liveStartTime': 1626856200000, 'liveEndTime': 1626867000000, 'tag': '连线', 'liveUrl': 'http://pull102.gzlz307.com/home/160aaf1435dd8720a197c69e98632035/playlist.m3u8?only-audio=1', 'userId': '2679352497711647276'}, {'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/05/24/2874608674803885622.jpg', 'liveStatus': 1, 'totalCount': '237', 'liveName': '🍷别拿豆包 不当干粮🍷', 'userName': 'DJ安哥🍷天籁', 'liveId': '5190899559177836086', 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2020/08/31/2825278312823928834_80x80.jpg', 'liveStartTime': 1626858000000, 'liveEndTime': 1626861600000, 'tag': '情感', 'liveUrl': 'http://pull102.gzlz307.com/home/c45cbdb813f2a0521b3a4d077033b373/playlist.m3u8?only-audio=1', 'userId': '5130700645405291820'}]
over!!!

得到记录信息:

{'radioConver': 'http://cdnimg103.lizhi.fm/studio/2021/05/24/2874608674803885622.jpg',
 'liveStatus': 1,
 'totalCount': '237',
 'liveName': '🍷别拿豆包 不当干粮🍷',
 'userName': 'DJ安哥🍷天籁',
 'liveId': '5190899559177836086',
 'userPortrait': 'http://cdnimg103.lizhi.fm/user/2020/08/31/2825278312823928834_80x80.jpg',
 'liveStartTime': 1626858000000,
 'liveEndTime': 1626861600000,
 'tag': '情感',
 'liveUrl': 'http://pull102.gzlz307.com/home/c45cbdb813f2a0521b3a4d077033b373/playlist.m3u8?only-audio=1',
 'userId': '5130700645405291820'}

多进程的优化

import concurrent.futures

# Number of worker processes used by the demo pool.
group_max_workers = 5


def evaluate_item(tests):
    """Return *tests* unchanged; a stand-in for real per-item work."""
    return tests


# Flattened results collected from the workers, in input order.
item_x_list = []

# Worker processes may need to import this module; without the guard the
# pool would be created again on import under the spawn/forkserver start
# methods.
if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=group_max_workers
    ) as executor:
        # Ten 5-element lists: [1,2,3,4,5], [2,2,3,4,5], ..., [10,2,3,4,5].
        results = executor.map(
            evaluate_item,
            [[first, 2, 3, 4, 5] for first in range(1, 11)],
        )
        for result in results:
            item_x_list.extend(result)

    print(item_x_list)

# [1, 2, 3, 4, 5, 2, 2, 3, 4, 5, 3, 2, 3, 4, 5, 4, 2, 3, 4, 5, 5, 2, 3, 4, 5, 6, 2, 3, 4, 5, 7, 2, 3, 4, 5, 8, 2, 3, 4, 5, 9, 2, 3, 4, 5, 10, 2, 3, 4, 5]
import requests
import time
import json
import re
import concurrent.futures

class LiveList(object):
    """Fetch the live-room list from the Lizhi share-page API, page by page.

    Instance fields:
        liveList:  records accumulated by getLiveListPages().
        pageLimit: highest page number that will be requested.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, pageLimit=10):
        self.liveList = []
        self.pageLimit = pageLimit

    @staticmethod
    def _parseDataList(text):
        """Return the ``ret.dataList`` list found in a JSON body, else ``[]``.

        Invalid JSON is printed and treated as an empty page. A body that
        parses but lacks the expected ``ret``/``dataList`` structure is also
        treated as an empty page rather than raising KeyError/TypeError.
        """
        try:
            infos = json.loads(text)
        except (ValueError, TypeError) as e:
            print(e)
            return []
        ret = infos.get('ret') if isinstance(infos, dict) else None
        dataList = ret.get('dataList') if isinstance(ret, dict) else None
        return dataList if isinstance(dataList, list) else []

    def getLiveListPage(self, pageNum=1):
        """Request one page of the list.

        Returns:
            A ``(count, records)`` tuple. A failed request or an unusable
            response body yields ``(0, [])`` after printing the error.
        """
        url = 'https://appweb.lizhi.fm/smallApp/getLiveList?pageNum=' + str(pageNum)
        print(url)
        # Pause before every request so successive pages are spaced out.
        time.sleep(1)
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return (0, [])
        response.encoding = 'utf-8'

        dataList = self._parseDataList(response.text)
        return (len(dataList), dataList)

    def getLiveListPages(self):
        """Fetch pages 1..pageLimit, stopping early at the first empty page.

        Records are appended to ``self.liveList``, which is also returned.
        """
        cur_page = 1
        while cur_page <= self.pageLimit:
            page_len, page_liveList = self.getLiveListPage(cur_page)
            self.liveList.extend(page_liveList)
            cur_page = cur_page + 1
            if page_len == 0:
                # An empty page means there is nothing further to fetch.
                break

        return self.liveList
    
def parseRegular(param="liveUrl"):
    """Build the compiled pattern used to pull ``<param> = "<value>";`` out of text.

    The one capture group holds ``<value>``; it stops short of any
    semicolon. *param* goes into the pattern verbatim, without escaping.
    """
    return re.compile(param + r' = \"([^;]*)\";', flags=re.DOTALL)
    
def parseShareURL(liveId):
    """Fetch a live room's share page and extract its stream URL and user id.

    The page is requested with an iPhone User-Agent header. No delay is
    applied after the request in this version (the original sleep call was
    commented out).

    Args:
        liveId: id of the live room; converted with str() before it is
            placed in the URL.

    Returns:
        ``{"liveUrl": ..., "userId": ...}`` when the page contains a
        ``liveUrl = "...";`` assignment (``userId`` is None if no matching
        ``userId`` assignment is found), or None when no liveUrl is found.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    url1 = "https://appweb.lizhi.fm/live/share?liveId=" + str(liveId) + "&from=iosBrowser"
    headers = {
        'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 14_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
    }
    # A timeout keeps one unresponsive page from blocking its worker forever.
    response_1 = requests.get(url1, headers=headers, timeout=10)
    response_1.encoding = 'utf-8'

    liveUrl = parseRegular('liveUrl').findall(response_1.text)
    userId = parseRegular('userId').findall(response_1.text)

    if not liveUrl:
        return None
    # Guard the userId lookup: a page with a liveUrl but no userId used to
    # raise IndexError here.
    return {"liveUrl": liveUrl[0], "userId": userId[0] if userId else None}

def dealLiveItemForUserInfo(liveItem):
    """Combine one live-list record with the fields parsed from its share page.

    Looks up the share page via ``liveItem['liveId']``. Prints a status line
    and returns a new dict holding the record's keys plus the parsed ones
    (parsed values win on key clashes), or None when parseShareURL returns
    None.
    """
    shareInfo = parseShareURL(liveItem['liveId'])
    if shareInfo is not None:
        print('直播中...')
        return {**liveItem, **shareInfo}
    print("直播已结束!")
    return None
    
def dealLiveListAsFinalResWithWokers(liveList, max_works=5):
    """Run dealLiveItemForUserInfo over *liveList* in a process pool.

    Args:
        liveList: records to process; each is passed to
            dealLiveItemForUserInfo.
        max_works: number of worker processes (5 by default).

    Returns:
        The non-None results, in the same order as the input records.
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_works) as pool:
        # Consume the results while the pool is still open; drop the None
        # entries that the per-item function returns for skipped records.
        return [
            merged
            for merged in pool.map(dealLiveItemForUserInfo, liveList)
            if merged is not None
        ]

def saveFinaleResJsonToLocalPath(final_res, l_path='liveList.json'):
    """Serialize *final_res* as JSON into the file at *l_path*.

    Args:
        final_res: JSON-serializable data to store.
        l_path: destination file path; the file is created or overwritten.
    """
    # Honor the caller-supplied path; previously l_path was ignored and the
    # output always went to 'liveList.json'.
    with open(l_path, 'w', encoding='utf-8') as file_obj:
        json.dump(final_res, file_obj)

if __name__ == '__main__':
    # Fetch the list, process every record through the worker pool, save the
    # result, then print a three-line summary.
    pageLimit = 1
    liveList = LiveList(pageLimit).getLiveListPages()
    final_res = dealLiveListAsFinalResWithWokers(liveList, max_works=5)
    saveFinaleResJsonToLocalPath(final_res, 'liveList.json')

    # print() joins these parts with a single space, as in the original call.
    summary_parts = (
        '共请求 ' + str(pageLimit) + '页数据\n',
        '共获取 ' + str(len(liveList)) + '条主播记录\n',
        '共保存 ' + str(len(final_res)) + '条有效记录\n',
        'over!!!',
    )
    print(*summary_parts)

Log:

https://appweb.lizhi.fm/smallApp/getLiveList?pageNum=1
直播中...
直播中...
直播中...
直播中...
直播中...
直播中...
直播中...
直播中...
直播中...
直播中...
共请求 1页数据
 共获取 10条主播记录
 共保存 10条有效记录
 over!!!
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值