Python 爬虫爬视频——带进度条

最新推荐文章于 2023-01-31 17:30:00 发布

roc-ever

最新推荐文章于 2023-01-31 17:30:00 发布

阅读量788

点赞数 1

本文链接：https://blog.csdn.net/roc_ever/article/details/105467738

版权

代码:

# encoding= 'utf-8'
# 添加了进度条
# 未添加多线程
import requests
from bs4 import BeautifulSoup
import os
from lxml import etree
import time
import threading  # add later
import sys
class Actress():
    """Find the ``.mp4`` link on a numbered page and download it with a progress bar.

    Typical use is ``video_source()`` followed by ``video_download()``.
    """

    def __init__(self, string, headers):
        """Build the page URL from a numeric page id.

        Args:
            string: Page id; must be convertible with ``int()``.
            headers: HTTP headers sent when probing the video link.

        Raises:
            ValueError: If ``string`` is not a valid integer literal.
        """
        # The real site address is deliberately withheld by the author.
        self.url = "https://xxxx.com/html/%s" % int(string)
        self.headers = headers
        self.string = string
        # State filled in by video_source(); initialised here so that calling
        # the methods out of order cannot raise AttributeError.
        self.src = None    # last video link found on the page
        self.r = None      # response used to probe the video link
        self.Type = None   # file extension derived from Content-Type
        self.volume = 0    # Content-Length in bytes, 0 when unknown

    @staticmethod
    def _progress_bar(done, total, width=50):
        """Return the carriage-return-prefixed progress line for ``done`` of ``total`` bytes."""
        if total <= 0:
            # Size unknown: show the byte count instead of dividing by zero.
            return "\r[%s]%d bytes" % ('-' * width, done)
        filled = min(width, width * done // total)
        percent = min(100, 100 * done // total)
        return "\r[%s%s]%d%%" % ('>' * filled, '-' * (width - filled), percent)

    def _read_metadata(self, response):
        """Set ``Type`` and ``volume`` from the headers of ``response``."""
        content_type = response.headers.get('Content-Type', '')
        # Drop parameters such as "; charset=..." and keep the subtype only.
        # Fall back to "mp4" because only .mp4 links are ever requested.
        self.Type = content_type.split(';')[0].split('/')[-1].strip() or 'mp4'
        try:
            self.volume = int(response.headers.get('Content-Length', 0))
        except ValueError:
            self.volume = 0

    def video_source(self, headers, timeout):
        """Fetch the page, extract the video link and read its size and type.

        On success ``src``, ``r``, ``Type`` and ``volume`` are populated.
        On any failure a message is printed and the method returns early.

        Args:
            headers: HTTP headers for the page request.
            timeout: Timeout in seconds forwarded to ``requests.get``.
        """
        # Forget results of an earlier call so stale state is never reused.
        self.src = None
        self.r = None
        self.Type = None
        self.volume = 0

        r = requests.get(self.url, headers=headers, timeout=timeout)
        if r.status_code != 200:
            print("网络连接失败，请检查网站书写或者网络连接\n")
            return
        print("连接成功，开始下载:")

        html = etree.HTML(r.text)
        soup = BeautifulSoup(r.text, 'lxml')
        titles = soup.select(
            'div.container.xq_js > div >div.col-lg-12.col-md-12.col-xs-12 > div.j_s > h3')
        # The links could equally be selected with soup.select(); XPath is used here.
        links = html.xpath('//div[@class="rm_bq"]/ul/li/a/@href') if html is not None else []
        # Links are paired with title elements, so a link is accepted only
        # when a title was found as well. The last pair wins.
        for _title, link in zip(titles, links):
            self.src = link.strip()
            print(self.src)

        if not str(self.src).endswith('.mp4'):
            print('existing...')
            return

        # stream=True fetches only the headers; the body is left for
        # video_download() instead of being pulled into memory here.
        self.r = requests.get(str(self.src), headers=self.headers,
                              stream=True, timeout=timeout)
        try:
            if self.r.status_code != 200:
                print('bad link')
                return
            print('headers is ', self.r.headers)
            print('Type: ',
                  self.r.headers.get('Content-Type'))
            print('Size: ',
                  self.r.headers.get('Content-Length'), 'bytes')
            self._read_metadata(self.r)
        finally:
            # Only the headers were needed; release the connection.
            self.r.close()

    def video_download(self, headers, timeout):
        """Stream the video found by ``video_source()`` to disk with a progress bar.

        The file is written to ``D:\\file\\<timestamp>\\<time>.<Type>``.

        Args:
            headers: HTTP headers for the download request.
            timeout: Timeout in seconds forwarded to ``requests.get``.
        """
        if not self.src or not str(self.src).endswith('.mp4'):
            print('wrong link.')
            print('new link needed.')
            return

        response = requests.get(self.src, headers=headers,
                                stream=True, timeout=timeout)
        try:
            if response.status_code != 200:
                print('wrong link.')
                print('new link needed.')
                return
            print('link successfully.')
            # Size and type come from the response that is actually downloaded.
            self._read_metadata(response)

            string = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
            path = 'D:\\file\\%s\\' % string
            if not os.path.exists(path):
                # makedirs also creates the parent "D:\file" when it is missing.
                os.makedirs(path)
                print('directory has been made up.')
            name = time.strftime("%H_%M_%S", time.localtime())
            target = path + name + '.' + self.Type

            temp_size = 0
            with open(target, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024 * 10):
                    if not chunk:
                        continue
                    temp_size += len(chunk)
                    f.write(chunk)
                    # Progress bar
                    sys.stdout.write(self._progress_bar(temp_size, self.volume))
                    sys.stdout.flush()
                print()
            print('video downloaded. Stored in ', target)
        finally:
            response.close()
if __name__ == "__main__":
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # Page id: five digits, no leading zero, no greater than 40000.
    number = '30000'
    # The class defined in this file is Actress; the constructor already runs
    # __init__, so it is not called a second time.
    action = Actress(str(number), headers=headers)
    action.video_source(headers=headers, timeout=10)
    action.video_download(headers=headers, timeout=10)

仅用来记录爬虫的一些知识点。
转载请注明。

roc-ever

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬虫爬视频——带进度条

代码:# encoding= 'utf-8'# 添加了进度条# 未添加多线程import requestsfrom bs4 import BeautifulSoupimport osfrom lxml import etreeimport timeimport threading # add laterimport sysclass Actress(): def _...
复制链接

扫一扫