用 Python 下载网页视频

这段代码演示了如何使用 Python 的 requests、BeautifulSoup 和 urllib 库从网页抓取 MP4 视频链接,并通过多线程进行下载。首先,通过 GET 请求获取列表页 HTML,解析出各视频详情页的链接;然后在每个详情页中用正则表达式提取 MP4 源地址,并把结果保存为 JSON 文件;最后读取该 JSON 文件,利用多线程进行下载,每个线程负责下载一个视频文件。
摘要由CSDN通过智能技术生成

不同网站的页面结构不同,使用前需要根据目标网站修改网址、编码和 CSS 选择器。

抓取 mp4 链接

from bs4 import BeautifulSoup

import requests

import urllib

import re

import json

# Text encoding assigned to the listing-page response before it is parsed
# (read by GetwVideoHtml()).
encodestyle = 'gbk'

# First listing page; passed to GetNPage_html() by the script driver.
# NOTE(review): the '**' looks like a redacted placeholder - replace it with
# the real site address before running.
homepage='http://www.**.html'

# Prefix prepended to every href collected from a listing page.
htmlhead='http://www.**' # used by GetwVideoHtml()

# Call signatures of the helpers defined below:
#GetNPage_html(homepage,n)

#HtmlList2Mp4List(sumhtml)

#Writelist2json(listname,lists)

def GetwVideoHtml(furl, timeout=30):
    """Collect the detail-page URLs linked from one listing page.

    Fetches ``furl``, decodes the response with the module-level
    ``encodestyle``, and for every ``<a>`` nested inside an element with CSS
    class ``video_box`` prepends the module-level ``htmlhead`` to its
    ``href``.

    Args:
        furl: URL of the listing page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block forever on an unresponsive host.

    Returns:
        A list of ``htmlhead + href`` strings in document order.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    res = requests.get(furl, timeout=timeout)
    res.encoding = encodestyle
    soup = BeautifulSoup(res.text, 'html.parser')

    retlist = []
    for box in soup.select('.video_box'):
        for tag_a in box.select('a'):
            # Anchors without an href (e.g. named anchors) used to raise
            # KeyError via tag_a['href'] and abort the whole crawl.
            href = tag_a.get('href')
            if href is None:
                continue
            retlist.append(htmlhead + href)
    return retlist

def GetNPage_html(homepage,n):
    """Gather detail-page links from the first ``n`` listing pages.

    Page 1 is ``homepage`` itself. Page ``k`` (k >= 2) is ``homepage`` with
    everything after its last dot replaced by ``_k.html``. Each page URL is
    printed before it is handed to ``GetwVideoHtml``.

    Args:
        homepage: URL of the first listing page.
        n: Number of listing pages to visit.

    Returns:
        The per-page link lists concatenated in page order.
    """
    collected = []
    for page_no in range(1, n + 1):
        if page_no == 1:
            page_url = homepage
        else:
            page_url = '{}_{}.html'.format(homepage.rsplit('.', 1)[0], page_no)
        print(page_url)
        collected.extend(GetwVideoHtml(page_url))
    return collected

# Absolute http(s) URL ending in ".mp4" (case-insensitive). The character
# class stops at whitespace, quotes and angle brackets so that two links on
# the same line are not merged into one match, and the dot before "mp4" is
# escaped so it only matches a literal dot.
_MP4_URL_RE = re.compile(r'https?://[^\s"\'<>]+?\.mp4', re.I)

# Browser-like User-Agent header sent with every page request.
_USER_AGENT = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")


def GetMp4SrcFromHtml(url, encoding='gbk', timeout=30):
    """Return the distinct MP4 URLs that appear in the page at ``url``.

    Args:
        url: Address of the page to scan.
        encoding: Codec used to decode the page body (default ``'gbk'``,
            the value that was previously hard-coded).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of MP4 URLs in order of first appearance, without duplicates.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule.
    import urllib.request

    opener = urllib.request.build_opener()
    opener.addheaders = [_USER_AGENT]
    # Kept for compatibility: this has always made the User-Agent the
    # process-wide default for later urllib.request calls.
    urllib.request.install_opener(opener)

    # Close the response deterministically instead of leaking the socket.
    with opener.open(url, timeout=timeout) as response:
        raw = response.read()

    # Only ASCII URLs are extracted, so one undecodable byte elsewhere in
    # the page should not abort the scan.
    text = raw.decode(encoding, errors='replace')

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(_MP4_URL_RE.findall(text)))

def HtmlList2Mp4List(sumhtml):
    """Flatten the MP4 links of several pages into one list.

    Args:
        sumhtml: Iterable of page URLs; each is passed to
            ``GetMp4SrcFromHtml``.

    Returns:
        All links returned for each page, concatenated in input order.
    """
    collected = []
    for page_url in sumhtml:
        collected.extend(GetMp4SrcFromHtml(page_url))
    return collected

def Writelist2json(listname, lists, outdir='D:/ipynb/commfile/'):
    """Write ``lists`` as JSON to ``<outdir>/<listname>_len_<N>.json``.

    ``N`` is ``len(lists)``, so the file name records how many entries were
    saved.

    Args:
        listname: Base name for the output file.
        lists: JSON-serialisable sequence to store.
        outdir: Target directory. Defaults to the previously hard-coded
            location; it is created if missing.

    Returns:
        The path of the file that was written.
    """
    # Local import: the script section this function lives in does not
    # import os at the top.
    import os

    filename = '{}_len_{}.json'.format(listname, len(lists))
    if outdir:
        # A missing directory used to fail here, after all crawling work
        # had already been done.
        os.makedirs(outdir, exist_ok=True)
    path = os.path.join(outdir, filename)
    with open(path, 'w', encoding='utf-8') as fw:
        json.dump(lists, fw)
    return path

# Script entry point for the link-collection step. Guarded so that importing
# this module does not trigger network requests or file writes.
if __name__ == "__main__":
    # Crawl the first 3 listing pages for detail-page links.
    sumhtml = GetNPage_html(homepage, 3)

    # Resolve every detail page to the MP4 sources it references.
    mp4list = HtmlList2Mp4List(sumhtml)

    # Persist the links for the separate download step.
    Writelist2json("mp4list", mp4list)

下载部分

from bs4 import BeautifulSoup

import requests

import urllib

import json

import threading

import datetime

import os

def mkdir(path):
    """Create directory ``path`` (with missing parents) unless it exists.

    Prints whether the directory was created or was already present.

    Args:
        path: Directory to create.
    """
    if os.path.exists(path):
        print("--- There is this folder! ---")
        return

    # exist_ok=True tolerates another thread or process creating the
    # directory between the check above and this call.
    os.makedirs(path, exist_ok=True)
    print("--- new folder... ---")
    print("--- OK ---")

def Schedule(a, b, c):
    """Progress callback in the shape ``urlretrieve`` expects for ``reporthook``.

    Prints the download progress as a percentage capped at 100. When the
    total size is unknown or zero, prints the number of bytes received so
    far instead.

    Args:
        a: Number of blocks transferred so far.
        b: Size of one block in bytes.
        c: Total size of the remote file in bytes; non-positive when the
            server did not report a size.
    """
    if c <= 0:
        # A percentage is undefined here: c == 0 used to raise
        # ZeroDivisionError and c == -1 produced negative percentages.
        print('%d bytes' % (a * b))
        return

    # The last block usually overshoots the file size, hence the cap.
    per = min(100.0, 100.0 * a * b / c)
    print('%.2f%%' % per)

def createdownloadlink(name, url):
    """Download ``url`` and store it at local path ``name``.

    Args:
        name: Destination file path.
        url: Source URL.

    Raises:
        urllib.error.URLError: if the resource cannot be retrieved.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule; it
    # only worked when some other import happened to load it first.
    import urllib.request

    urllib.request.urlretrieve(url, name)

class myThread(threading.Thread):
    """Thread that downloads one URL into one local file.

    Attributes:
        name: Destination file path. Assigning it also sets the thread's
            name, because ``name`` is a ``threading.Thread`` property.
        url: Source URL to download.
    """

    def __init__(self, name, url):
        """Store the destination path and the source URL.

        Args:
            name: Destination file path.
            url: Source URL.
        """
        super().__init__()
        self.name = name
        self.url = url

    def run(self):
        """Download ``self.url`` to ``self.name``, printing start and finish."""
        # A bare ``import urllib`` does not load the ``request`` submodule.
        import urllib.request

        print("开始下载:" + self.name)
        urllib.request.urlretrieve(self.url, self.name)
        print("完成下载:" + self.name)

def DownMp4file(lists, base_dir='D:/videos/'):
    """Prepare one download thread per URL without starting any of them.

    Creates a folder named after today's date (``MM-DD``) under
    ``base_dir`` and assigns each URL the target file ``<index>.mp4`` in
    that folder, numbered from 0 in input order.

    Args:
        lists: Iterable of video URLs.
        base_dir: Directory prefix for the dated folder. It is concatenated
            as-is, so it must end with a path separator. Defaults to the
            previously hard-coded location.

    Returns:
        A list of unstarted ``myThread`` objects, one per URL; the caller
        starts and joins them.
    """
    dateASfolder = datetime.datetime.now().strftime('%m-%d')
    foldername = base_dir + dateASfolder
    mkdir(foldername)

    return [
        myThread(foldername + '/' + str(filenum) + '.mp4', url)
        for filenum, url in enumerate(lists)
    ]

# Script entry point for the download step. Guarded so that importing this
# module does not start any downloads.
if __name__ == "__main__":
    with open('D:/ipynb/commfile/srcmp4s_len_66.json', 'r') as fr:
        srcmp4s = json.load(fr)
    print(len(srcmp4s))

    threads = DownMp4file(srcmp4s)

    # NOTE(review): only the first 10 prepared threads are started and
    # joined, although the original comments spoke of "all" threads. The
    # remaining URLs are not downloaded; confirm whether this manual
    # batching is intended.
    batch = threads[:10]

    for t in batch:
        t.start()

    # Block until every started download has finished.
    for t in batch:
        t.join()

来源:https://www.cnblogs.com/ims-/p/9736006.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值