Python 虎牙学习之舞蹈（一）_虎牙学习资料-CSDN博客

本文链接：https://blog.csdn.net/lubiancongzi/article/details/120407025
# -*- coding: utf-8 -*-
# @Time: 2021/9/21 23:31
# @Author: Administrator
# @File: huya_video.py
# @Software: PyCharm

import os
import random
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Pool of User-Agent strings; one is picked at random per header dict.
# The more entries, the better.
default_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
]

# Module-level constants.
# NOTE: no ``global`` statements here -- at module scope they are no-ops,
# since every top-level assignment already binds a module global.

# Request headers with a User-Agent chosen once, when the module is loaded.
headers = {'User-Agent': random.choice(default_headers)}

# Directory that downloaded videos are written to.
save_path = 'D:/huya/video'

# Directory helper.


def createFile(file_path):
    """Ensure the directory ``file_path`` exists, then make it the working directory.

    Despite its name, this creates a directory (including any missing
    parents), not a file.

    Args:
        file_path: Path of the directory to create and switch into.

    Raises:
        OSError: If the directory cannot be created or entered, for example
            when ``file_path`` already exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, so a directory
    # created by someone else between "check" and "create" is not an error.
    os.makedirs(file_path, exist_ok=True)
    # Side effect kept from the original helper: the process working
    # directory is changed to the target directory on every call.
    os.chdir(file_path)


class HuyaVideo(object):
    """Crawler for the Huya "Dance" video listing (routine scrape: dance learning).

    ``get_urls`` walks the listing pages; for every video linked there,
    ``get_videos`` resolves the stream URL and the author, and ``save_video``
    writes the file under the module-level ``save_path`` directory.
    """

    # Seconds to wait on a single HTTP request. Without a timeout, requests
    # can block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Number of bytes written per chunk while streaming a download to disk.
    CHUNK_SIZE = 1024 * 1024
    # Translation table replacing characters Windows rejects in file names
    # (reserved punctuation and control characters 0-31) with "_".
    _UNSAFE_CHARS = str.maketrans(
        {ch: '_' for ch in '\\/:*?"<>|' + ''.join(chr(i) for i in range(32))}
    )

    def __init__(self):
        # Detail-page URLs collected so far, in listing order.
        self.video_urls = []
        self.headers = {
            # 'cookie': 'paste the cookie from your own browser session here',
            'User-Agent': random.choice(default_headers)
        }

    def get_urls(self, max_pages=499):
        """Crawl the listing pages and download the videos each page links to.

        Args:
            max_pages: Highest listing page number to request; pages are
                numbered from 1.

        Crawling stops at the first page that contains no video entries.
        Network errors while fetching a listing page propagate to the caller.
        """
        for page in range(1, max_pages + 1):
            base_url = 'https://v.huya.com/g/Dance?set_id=31&order=hot&page={}'.format(page)
            print('get_urls base_url==> ', base_url)
            response = requests.get(base_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, 'html.parser')
            # find() returns None when the list element is absent, e.g. on an
            # error page or past the last listing page.
            video_list = soup.find('ul', class_='vhy-video-list')
            list_list = video_list.find_all('li') if video_list is not None else []
            print('get_urls ==> list_list size [{}]'.format(len(list_list)))
            if not list_list:
                # Presumably the end of the listing; later pages are not tried.
                break
            page_urls = []
            for item in list_list:
                link = item.find('a')
                href = link.get('href') if link is not None else None
                if href:
                    page_urls.append('https://v.huya.com' + href)
            self.video_urls.extend(page_urls)
            # Only this page's URLs are processed; re-walking the whole
            # accumulated list would re-request every earlier video per page.
            self.get_videos(page_urls)

    def get_videos(self, urls=None):
        """Resolve and save the video behind each detail-page URL.

        Args:
            urls: Detail-page URLs to process. Defaults to every URL in
                ``self.video_urls``.

        A video whose metadata cannot be fetched or parsed, or whose download
        fails, is reported and skipped so the remaining videos still run.
        """
        targets = self.video_urls if urls is None else urls
        for url in targets:
            # e.g. ".../468682371.html" -> "468682371"
            videoid = self._video_id(url)
            try:
                video_url = self._fetch_stream_url(videoid)
                print('get_videos 抓取作者==> ', url)
                author = self._fetch_author(url)
                self.save_video(video_url, author, videoid)
            except (requests.RequestException, OSError, LookupError,
                    TypeError, ValueError, etree.LxmlError) as exc:
                print('get_videos skip==> ', url, repr(exc))

    def save_video(self, url, author, name):
        """Download ``url`` to ``save_path`` unless the target file already exists.

        The file is named ``<author>【<first URL path segment>】<name>.mp4``.

        Args:
            url: Direct URL of the video stream.
            author: Author name used in the file name; characters that are
                invalid in file names are replaced with "_".
            name: Video id used as the base file name.

        Raises:
            IndexError: If ``url`` has no path segment after the host.
            requests.RequestException: If the download request fails or
                returns an HTTP error status.
            OSError: If the file cannot be written.
        """
        print('文件来源：' + url)
        array = url.split('/')
        rootPath = '{}'.format(save_path)
        if not os.path.exists(rootPath):
            createFile(rootPath)
        file_name = '{}.mp4'.format(name)
        pathFile = '{}/{}【{}】{}'.format(rootPath, self._safe_name(author), array[3], file_name)
        if os.path.exists(pathFile):
            print('已存在文件 ' + pathFile)
        else:
            # Stream into a temporary ".part" file and rename once complete,
            # so an interrupted download is never mistaken for a finished
            # video by the exists() check on the next run.
            partFile = pathFile + '.part'
            with requests.get(url, headers=self.headers, stream=True,
                              timeout=self.REQUEST_TIMEOUT) as response:
                # Do not store an HTTP error page as an .mp4 file.
                response.raise_for_status()
                with open(partFile, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        if chunk:
                            file.write(chunk)
            os.replace(partFile, pathFile)
            print('新增文件 ' + pathFile)
        print('{}视频已经抓取完毕\n'.format(name), '*' * 50, '\n\n')

    @staticmethod
    def _video_id(url):
        """Return the last path segment of ``url`` up to its first dot."""
        return url.split('/')[-1].split('.')[0]

    @classmethod
    def _safe_name(cls, text):
        """Return ``text`` with characters invalid in file names replaced by "_"."""
        return text.translate(cls._UNSAFE_CHARS)

    def _fetch_stream_url(self, videoid):
        """Return the URL of the first listed definition for ``videoid``."""
        video_url = 'https://liveapi.huya.com/moment/getMomentContent?&videoId={}'.format(videoid)
        print('get_urls video_url==> ', video_url)
        response = requests.get(video_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        json_data = response.json()
        return json_data['data']['moment']['videoInfo']['definitions'][0]['url']

    def _fetch_author(self, url):
        """Return the second breadcrumb link text of the detail page at ``url``."""
        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.text)
        if html is None:
            raise ValueError('detail page could not be parsed')
        aList = html.xpath('//div[contains(@class,"crumb")]//a//text()')
        # The script treats the second breadcrumb link text as the author.
        return aList[1]


if __name__ == '__main__':
    # Script entry point: build the crawler and walk the listing pages.
    crawler = HuyaVideo()
    crawler.get_urls()
PyCharm开发环境下运行指令如下：