# -*- coding: utf-8 -*-
# @Time: 2021/9/21 23:31
# @Author: Administrator
# @File: huya_video.py
# @Software: PyCharm
import os
import random
import requests
from bs4 import BeautifulSoup
from lxml import etree
# Pool of User-Agent strings to choose from at random -- the more, the better.
default_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11",
    "Opera/9.25 (Windows NT 5.1; U; en)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
    "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9",
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
]

# Constant definitions.
# Request headers carrying one User-Agent picked at random when the module loads.
headers = {"User-Agent": random.choice(default_headers)}
# Root directory for saved videos.
save_path = "D:/huya/video"
def createFile(file_path):
    """Ensure the directory *file_path* exists, then make it the working directory.

    Missing parent directories are created as well. Calling this for a
    directory that already exists only changes the working directory.

    Args:
        file_path: Path of the directory to create and switch into.

    Raises:
        OSError: If the directory cannot be created or entered (for example
            when *file_path* names an existing regular file).
    """
    # exist_ok=True removes the gap between "check that it is missing" and
    # "create it", so a directory created concurrently by someone else is
    # not reported as an error.
    os.makedirs(file_path, exist_ok=True)
    os.chdir(file_path)
class HuyaVideo(object):
    """Crawl the Huya "Dance" video listing and download the videos it links to.

    Workflow:
        1. ``get_urls`` walks the listing pages and stores every video
           detail-page URL in ``self.video_urls``.
        2. ``get_videos`` resolves each detail page to a media URL (through
           the moment-content API) and an author name (from the page's
           breadcrumb links).
        3. ``save_video`` streams the media file into the directory named by
           the module-level ``save_path``.

    Network and parsing failures for a single page or video are reported on
    stdout and skipped so one bad entry does not abort the whole run.
    """

    # Seconds to wait for a server to answer. requests has no default
    # timeout, so without this a stalled connection blocks forever.
    REQUEST_TIMEOUT = 30
    # Size in bytes of the chunks used when streaming a video to disk.
    CHUNK_SIZE = 1024 * 1024
    # Author name used when the detail page exposes no usable breadcrumb.
    UNKNOWN_AUTHOR = 'unknown'

    def __init__(self):
        """Start with an empty URL list and one randomly chosen User-Agent."""
        self.video_urls = []
        self.headers = {
            # 'cookie': 'replace with a cookie copied from your own browser session',
            'User-Agent': random.choice(default_headers)
        }

    @staticmethod
    def _video_id_from_url(url):
        """Return the video id: the last path segment of *url* up to its first dot."""
        return url.split('/')[-1].split('.')[0]

    @staticmethod
    def _safe_filename(text):
        """Return *text* with characters that are illegal in file names replaced by '_'."""
        illegal = '\\/:*?"<>|'
        return ''.join(
            '_' if (ch in illegal or ord(ch) < 32) else ch
            for ch in str(text)
        )

    def get_urls(self, start_page=1, end_page=500):
        """Collect video detail-page URLs from the listing, then download them.

        Args:
            start_page: First listing page to request.
            end_page: Page number at which to stop (exclusive).

        Crawling stops early at the first page that cannot be fetched or that
        contains no video entries. ``get_videos`` is called once, after all
        pages have been collected, so every video is requested only once.
        """
        seen = set(self.video_urls)
        for page in range(start_page, end_page):
            base_url = 'https://v.huya.com/g/Dance?set_id=31&order=hot&page={}'.format(page)
            print('get_urls base_url==> ', base_url)
            try:
                response = requests.get(base_url, headers=self.headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as err:
                print('get_urls ==> request failed, stop paging: {}'.format(err))
                break
            soup = BeautifulSoup(response.text, 'html.parser')
            # find() returns None when the list element is absent.
            video_list = soup.find('ul', class_='vhy-video-list')
            list_list = video_list.find_all('li') if video_list is not None else []
            print('get_urls ==> list_list size [{}]'.format(len(list_list)))
            if not list_list:
                break
            for item in list_list:
                link = item.find('a')
                href = link.get('href') if link is not None else None
                if not href:
                    continue
                url = 'https://v.huya.com' + href
                # The same video can show up on several pages; keep it once.
                if url not in seen:
                    seen.add(url)
                    self.video_urls.append(url)
        self.get_videos()

    def get_videos(self):
        """Resolve every URL in ``self.video_urls`` and save the video behind it.

        A video whose API response, detail page or download fails is reported
        and skipped; the remaining videos are still processed.
        """
        # Example video id: 468682371
        base_url = 'https://liveapi.huya.com/moment/getMomentContent?&videoId={}'
        for url in self.video_urls:
            videoid = self._video_id_from_url(url)
            api_url = base_url.format(videoid)
            print('get_urls video_url==> ', api_url)
            try:
                video_url = self._fetch_media_url(api_url)
                if not video_url:
                    print('get_videos ==> no media url for video {}, skipped'.format(videoid))
                    continue
                print('get_videos 抓取作者==> ', url)
                author = self._fetch_author(url)
                self.save_video(video_url, author, videoid)
            except (requests.RequestException, OSError) as err:
                print('get_videos ==> video {} failed, skipped: {}'.format(videoid, err))

    def _fetch_media_url(self, api_url):
        """Return the URL of the first listed definition, or None if the payload lacks one."""
        response = requests.get(api_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        try:
            json_data = response.json()
            return json_data['data']['moment']['videoInfo']['definitions'][0]['url']
        except (ValueError, KeyError, IndexError, TypeError):
            # Body is not JSON, or it does not have the expected structure.
            return None

    def _fetch_author(self, page_url):
        """Return the second breadcrumb link text of the detail page as the author name.

        Falls back to ``UNKNOWN_AUTHOR`` when the page has fewer than two
        breadcrumb link texts.
        """
        response = requests.get(page_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        html = etree.HTML(response.text)
        if html is None:
            # Defensive: nothing parseable came back.
            return self.UNKNOWN_AUTHOR
        crumbs = html.xpath('//div[contains(@class,"crumb")]//a//text()')
        return crumbs[1] if len(crumbs) > 1 else self.UNKNOWN_AUTHOR

    def save_video(self, url, author, name):
        """Download *url* to ``<save_path>/<author>【<segment>】<name>.mp4``.

        Args:
            url: Direct URL of the media file.
            author: Author name used as the file-name prefix.
            name: Video id used as the base of the file name.

        ``<segment>`` is the first path segment of *url* (empty when the URL
        has none). A file that already exists is left untouched.

        Raises:
            requests.RequestException: If the download request fails.
            OSError: If the target directory or file cannot be written.
        """
        print('文件来源:' + url)
        array = url.split('/')
        rootPath = '{}'.format(save_path)
        # Create the directory without changing the working directory, so a
        # relative save_path keeps pointing at the same place afterwards.
        os.makedirs(rootPath, exist_ok=True)
        segment = array[3] if len(array) > 3 else ''
        file_name = self._safe_filename(
            '{}【{}】{}.mp4'.format(author, segment, name))
        pathFile = '{}/{}'.format(rootPath, file_name)
        if os.path.exists(pathFile):
            print('已存在文件 ' + pathFile)
        else:
            self._download(url, pathFile)
            print('新增文件 ' + pathFile)
        print('{}视频已经抓取完毕\n'.format(name), '*' * 50, '\n\n')

    def _download(self, url, target_path):
        """Stream *url* into *target_path* without holding the whole file in memory."""
        # Write to a temporary name first: an interrupted download must not
        # leave a truncated file that a later run would treat as complete.
        part_path = target_path + '.part'
        completed = False
        try:
            with requests.get(url, headers=self.headers, stream=True,
                              timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(part_path, 'wb') as out:
                    for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        if chunk:
                            out.write(chunk)
            os.replace(part_path, target_path)
            completed = True
        finally:
            if not completed and os.path.exists(part_path):
                os.remove(part_path)
if __name__ == '__main__':
    # Script entry point: build the crawler and start it from the listing pages.
    HuyaVideo().get_urls()
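# Command to run the script from the PyCharm development environment:
#     python huya_video.py
# To package it as an exe that can run anywhere, first install PyInstaller
# with `pip install pyinstaller`, then build with:
#     pyinstaller -F huya_video.py
# The resulting data is as follows: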