python爬取在线视频思路,用python实现多线程爬取影视网站全部视频方法【笔记】...

我拿这个站点作为案例:https://91mjw.com/  其他站点方法都是差不多的。

第一步:获得整站所有的视频链接

# Fetch the site's front page, sending the shared default headers (gHeads).
html = requests.get("https://91mjw.com",headers=gHeads).text

# Parse the HTML into an lxml tree so it can be queried with XPath.
xmlcontent = etree.HTML(html)

# href of every movie entry listed inside the "m-movies clearfix" container.
UrlList = xmlcontent.xpath("//div[@class='m-movies clearfix']/article/a/@href")

# Title text of those entries, taken from each article's <h2><a> element.
NameList = xmlcontent.xpath("//div[@class='m-movies clearfix']/article/h2/a/text()")

第二步:进入所选电影的页面,获取视频的链接

# xmlContent is the lxml tree of one movie page; collect the href of every
# link inside the element with id "video_list_li".
UrlList = xmlContent.xpath("//div[@id='video_list_li']/a/@href")

第三步:构造下载视频用到的参数

第四步:下载视频,保存到本地

直接上实现代码

使用多线程加信号量实现:默认同时开启 5 条线程,每条线程负责下载一整套视频(注意是一整套,而不是单个视频)。

也可以通过修改 nMaxThread 自行调整同时开启的线程数。

实现代码

#!/usr/bin/env python

# -*- coding: utf-8 -*-

import re

import requests

from threading import *

from bs4 import BeautifulSoup

from lxml import etree

from contextlib import closing

# Upper bound on the number of download threads running at the same time.
nMaxThread = 5

# One slot per running MovieThread: main() acquires a slot before starting a
# thread and MovieThread.run() releases it when the thread finishes.
connectlock = BoundedSemaphore(nMaxThread)

# Default headers for the front-page request (a desktop Chrome User-Agent).
gHeads = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    ),
}

class MovieThread(Thread):
    """Worker thread that downloads every episode listed on one movie page.

    For each episode link found on ``url`` the thread resolves the player
    parameters, asks the player API for the real video address and streams
    the file to ``./video/<movieName>/<NN>.mp4``.

    ``run`` always releases the module-level ``connectlock`` semaphore when
    it finishes, so the code that starts the thread must have acquired it.
    """

    # Browser-like User-Agent sent with every request.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")

    # Seconds to wait on an unresponsive server. Without a timeout a hung
    # socket would keep this thread (and its semaphore slot) busy forever.
    REQUEST_TIMEOUT = 30

    # Maximum number of "refres" retries while the player API answers 404.
    MAX_REFRESH_ATTEMPTS = 20

    # Number of bytes read from the video stream per iteration.
    CHUNK_SIZE = 10240

    def __init__(self, url, movieName):
        """Remember the movie page ``url`` and the folder name ``movieName``."""
        Thread.__init__(self)
        self.url = url
        self.movieName = movieName

    def run(self):
        """Download all episodes of the movie, then free the semaphore slot."""
        try:
            # GetMovieUrl returns None when the page lists no episodes.
            episodes = self.GetMovieUrl(self.url) or []
            for number, episodeUrl in enumerate(episodes, 1):
                try:
                    self._download_episode(episodeUrl, number)
                except Exception as exc:
                    # Thread boundary: report the failure and carry on with
                    # the next episode instead of abandoning the whole series.
                    print("[Error]: episode %02d of %s failed: %r"
                          % (number, self.url, exc))
        finally:
            connectlock.release()

    def _download_episode(self, episodeUrl, number):
        """Resolve and download one episode; return quietly if a step yields nothing."""
        playType, vkey = self.GetVkeyParam(self.url, episodeUrl)
        if playType is None or vkey is None:
            return
        payload, apiUrl = self.GetOtherParam(self.url, episodeUrl, playType, vkey)
        if not apiUrl:
            return
        videoUrl = self.GetDownloadUrl(payload, apiUrl)
        if videoUrl:
            self.DownloadVideo(videoUrl, self.movieName, number)

    def GetMovieUrl(self, url):
        """Return the episode hrefs found on the movie page, or None if there are none."""
        heads = {
            "User-Agent": self.USER_AGENT,
            "Host": "91mjw.com",
            "Referer": "https://91mjw.com/",
        }
        html = requests.get(url, headers=heads, timeout=self.REQUEST_TIMEOUT).text
        links = etree.HTML(html).xpath("//div[@id='video_list_li']/a/@href")
        return links if links else None

    def GetVkeyParam(self, firstUrl, secUrl):
        """Fetch the episode page and return its first two double-quoted script values.

        The values are read from the first <script> inside <body> and are used
        by the caller as the player ``type`` and ``vkey``. Returns
        ``(None, None)`` if the request fails or the page lacks them.
        """
        heads = {
            "User-Agent": self.USER_AGENT,
            "Host": "91mjw.com",
            "Referer": firstUrl,
        }
        try:
            html = requests.get(firstUrl + secUrl, headers=heads,
                                timeout=self.REQUEST_TIMEOUT).text
            script = BeautifulSoup(html, "html.parser").find("body").find("script")
            quoted = re.findall('"(.*?)"', script.text)
            return quoted[0], quoted[1]
        except (requests.RequestException, AttributeError, IndexError):
            # Network failure, missing <body>/<script>, or fewer than two values.
            return None, None

    def GetOtherParam(self, firstUrl, SecUrl, type, vKey):
        """Load the player page and build the form payload for the player API.

        Returns ``(payload, playerUrl)`` where ``playerUrl`` is the page that
        was requested, or ``(None, None)`` if the request fails or the page
        does not contain the expected values.

        ``type`` shadows the builtin; the name is kept because it is part of
        the method's signature.
        """
        url = "https://api.1suplayer.me/player/?userID=&type=%s&vkey=%s" % (type, vKey)
        heads = {
            "User-Agent": self.USER_AGENT,
            "Host": "api.1suplayer.me",
            "Referer": firstUrl + SecUrl,
        }
        try:
            html = requests.get(url, headers=heads, timeout=self.REQUEST_TIMEOUT).text
            script = BeautifulSoup(html, "html.parser").find("body").find("script").text
            # Single-quoted values assigned in the script, in order of appearance.
            values = re.findall(" = '(.+?)'", script)
            payload = {
                "type": values[3],
                "vkey": values[4],
                "ckey": values[2],
                "userID": "",
                "userIP": values[0],
                "refres": 1,
                "my_url": values[1],
            }
            return payload, url
        except (requests.RequestException, AttributeError, IndexError):
            return None, None

    def GetDownloadUrl(self, payload, refereUrl):
        """Ask the player API for the real video URL.

        While the API answers with code 404 the request is repeated with an
        incremented ``refres`` counter, at most ``MAX_REFRESH_ATTEMPTS`` times.
        Returns the URL on code 200 and None otherwise. ``payload`` is not
        modified.
        """
        heads = {
            "User-Agent": self.USER_AGENT,
            "Host": "api.1suplayer.me",
            "Referer": refereUrl,
            "Origin": "https://api.1suplayer.me",
            "X-Requested-With": "XMLHttpRequest",
        }
        form = dict(payload)  # copy so the caller's dict stays untouched
        for _ in range(self.MAX_REFRESH_ATTEMPTS):
            reply = requests.post("https://api.1suplayer.me/player/api.php",
                                  data=form, headers=heads,
                                  timeout=self.REQUEST_TIMEOUT).json()
            code = reply.get("code")
            if code == 200:
                return reply.get("url")
            if code != 404:
                return None
            form["refres"] += 1
        return None

    def DownloadVideo(self, url, videoName, videoNum):
        """Stream the video at ``url`` to ``./video/<videoName>/<videoNum>.mp4``.

        Nothing is written unless the server answers the range request with
        status 206. The target directory is created on demand.
        """
        import os  # local import: the module header does not import os

        heads = {
            "chrome-proxy": "frfr",
            "User-Agent": self.USER_AGENT,
            # NOTE(review): Host is pinned to this server; presumably every
            # resolved video URL lives there -- confirm.
            "Host": "sh-yun-ftn.weiyun.com",
            "Range": "bytes=0-",
        }
        # stream=True keeps the body out of memory until iter_content reads it.
        with closing(requests.get(url, headers=heads, stream=True,
                                  timeout=self.REQUEST_TIMEOUT)) as response:
            if response.status_code != 206:
                return

            # The header may be absent; 0 means "size unknown".
            totalSize = int(response.headers.get('Content-Length', 0))
            if totalSize:
                print('[File Size]: %0.2f MB\n' % (totalSize / 1024.0 / 1024.0))

            path = "./video/%s/%02d.mp4" % (self._safe_dir_name(videoName), videoNum)
            folder = os.path.dirname(path)
            try:
                os.makedirs(folder)
            except OSError:
                # Already present (possibly created by another thread).
                if not os.path.isdir(folder):
                    raise

            received = 0
            shownPercent = -1
            with open(path, "wb") as f:
                for data in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    f.write(data)
                    received += len(data)
                    percent = self._progress_percent(received, totalSize)
                    # Report only when the whole-number percentage advances,
                    # not once per 10 KB chunk.
                    if percent is not None and int(percent) != shownPercent:
                        shownPercent = int(percent)
                        print('[Progress]: %0.2f%%' % percent + '\r')

    @staticmethod
    def _progress_percent(received, total):
        """Return the completed share as a float percentage, or None if total is unknown."""
        if not total:
            return None
        return received * 100.0 / total

    @staticmethod
    def _safe_dir_name(name):
        """Turn a scraped title into a single, safe path component."""
        # Callers may pass UTF-8 encoded bytes; on Python 3 those must be
        # decoded before they can be formatted into a path.
        if isinstance(name, bytes) and not isinstance(name, str):
            name = name.decode("utf-8")
        # Replace separators and characters that are illegal in file names so
        # a title can never point outside ./video.
        cleaned = re.sub(r'[\\/:*?"<>|]', "_", name).strip(" .")
        return cleaned or "_"

def main():
    """Scrape the 91mjw.com front page and start one MovieThread per movie.

    A ``connectlock`` slot is acquired before each thread is started, so the
    loop blocks while the maximum number of downloads is already running.
    Each thread releases its slot itself when it finishes.
    """
    # A timeout keeps the script from hanging on an unresponsive server.
    html = requests.get("https://91mjw.com", headers=gHeads, timeout=30).text
    page = etree.HTML(html)
    movieUrls = page.xpath("//div[@class='m-movies clearfix']/article/a/@href")
    movieNames = page.xpath("//div[@class='m-movies clearfix']/article/h2/a/text()")

    # The two queries are independent, so their lengths can differ; zip stops
    # at the shorter list instead of raising IndexError.
    # NOTE(review): pairing is by position -- confirm every article has both
    # a link and a title.
    for url, name in zip(movieUrls, movieNames):
        connectlock.acquire()
        try:
            # NOTE(review): encode() yields UTF-8 bytes; on Python 3 the
            # receiver has to cope with bytes rather than str.
            MovieThread(url, name.encode("utf-8")).start()
        except Exception:
            # The thread never ran, so nobody else will free this slot.
            connectlock.release()
            raise


if __name__ == '__main__':
    main()

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值