前言
用 Python 写了一个爬取短视频的代码。我是新手,只是想在这里记录一下平时的学习生活。
相关的注释和对应 print() 输出的样子都在源代码中。我没有获取视频的名字,因为保存的时候,有些视频的名字含有特殊符号,会导致文件命名出错;其实自己对名字处理一下就可以了!
import requests
from bs4 import BeautifulSoup
import os
# Short-video downloader: scrape the listing page, resolve each video's real
# file URL through the status endpoint, and save every video under ./movies/.
#
# NOTE(review): the published source had lost its indentation. The structure
# below assumes everything from the href extraction through the per-video
# download belonged to one loop body (the file name and the request both use
# the same id) -- confirm against the author's original if available.

BASE_URL = 'https://www.xxx.com/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'}
OUTPUT_DIR = './movies/'
VIDEO_PREFIX = 'video_'
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can block forever
CHUNK_SIZE = 64 * 1024  # bytes written per iteration while streaming a video


def extract_content_id(href):
    """Return the content id from a relative link such as ``video_1765999``.

    Raises:
        ValueError: if *href* is empty or does not start with ``video_``.
    """
    if not href or not href.startswith(VIDEO_PREFIX):
        raise ValueError('unexpected video link: {!r}'.format(href))
    return href[len(VIDEO_PREFIX):]


def build_download_url(src_url, content_id):
    """Swap the leading segment of the file name for ``cont-<content_id>``.

    Example::

        .../20220630/1656574762447-15902642-140152-hd.mp4
        .../20220630/cont-1765999-15902642-140152-hd.mp4

    Only the file name (the part after the last ``/``) is modified, so an
    identical substring elsewhere in the URL is left untouched.
    """
    directory, slash, filename = src_url.rpartition('/')
    _, dash, remainder = filename.partition('-')
    return directory + slash + 'cont-' + content_id + dash + remainder


def fetch_video_hrefs():
    """Download the listing page and return the href of every video card."""
    page = requests.get(url=BASE_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, 'html.parser')
    container = soup.find('div', class_='act-main cmmain')
    if container is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError('video list container not found on ' + BASE_URL)
    cards = container.find_all('div', class_='vervideo-bd')
    return [card.a.get('href') for card in cards if card.a is not None]


def resolve_download_url(content_id):
    """Ask the status endpoint for the video's source URL and rewrite it.

    Raises:
        ValueError: if the response carries an empty ``srcUrl``.
    """
    status_url = BASE_URL + 'videoStatus.jsp?contId={}'.format(content_id)
    # The request is sent with the video page as Referer, as in the original.
    referer = BASE_URL + 'video_{}'.format(content_id)
    response = requests.get(url=status_url,
                            headers=dict(HEADERS, Referer=referer),
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    src_url = response.json()["videoInfo"]["videos"]["srcUrl"]
    if not src_url:
        raise ValueError('empty srcUrl for content id ' + content_id)
    return build_download_url(src_url, content_id)


def download_video(download_url, target_path):
    """Stream *download_url* to *target_path* without buffering it in memory."""
    with requests.get(url=download_url, headers=HEADERS, stream=True,
                      timeout=REQUEST_TIMEOUT) as response:
        # Do not save an HTML error page under a .mp4 name.
        response.raise_for_status()
        with open(target_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                f.write(chunk)


def main():
    """Download every video listed on the landing page into OUTPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for href in fetch_video_hrefs():
        try:
            content_id = extract_content_id(href)
            download_url = resolve_download_url(content_id)
        except ValueError as err:
            # One malformed entry should not abort the remaining downloads.
            print('跳过: {}'.format(err))
            continue
        download_video(download_url, os.path.join(OUTPUT_DIR, content_id + '.mp4'))
        print(content_id + '已经下载')
    print('所有视频下载完毕!')


if __name__ == '__main__':
    main()
总结
新手上路,还请各位大神多多指教。欢迎大家一起交流学习呀!