爬虫--keep App

药不能停i

于 2020-05-14 20:14:27 发布

阅读量1.9k

点赞数 1

分类专栏：爬虫　文章标签：python

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/qq_35338042/article/details/106127676

版权

爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import re

import os

import time

import requests

from contextlib import closing

class KeepReptile(object):
    """Crawl the Keep training pages and download the exercise videos.

    The crawl walks four levels of pages, following links found by regular
    expressions: training index -> ``/workouthashtags/...`` ->
    ``/plans/...`` -> ``/exercises/...``. Each exercise page is scanned
    for ``.mp4`` paths and ``<h2 class="name">`` titles.

    Args:
        url: Address of the training index page where the crawl starts.
        headers: HTTP headers sent with every request.
        keep_suburl: Prefix prepended to the relative page links.
        keep_video_suburl: Prefix prepended to the relative video paths.
    """

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 30

    # Raw strings keep ``\w`` from being treated as a (deprecated) string
    # escape; the dot before ``mp4`` is escaped so it only matches a literal
    # dot.
    _CLASSIFICATION_RE = re.compile(r'/workouthashtags/\w*')
    _PLAN_RE = re.compile(r'/plans/\w*')
    _EXERCISE_RE = re.compile(r'/exercises/\w*')
    _VIDEO_RE = re.compile(r'/chaos/\w*/\w*\.mp4')
    _NAME_RE = re.compile(r'<h2 class="name">(.*?)</h2>', re.S)

    # Characters that are path separators or invalid in Windows file names.
    _UNSAFE_FILENAME_RE = re.compile(r'[\\/:*?"<>|]')

    def __init__(self, url, headers, keep_suburl, keep_video_suburl):
        self.url = url
        self.headers = headers
        self.keep_suburl = keep_suburl
        self.keep_video_suburl = keep_video_suburl

    def _fetch_text(self, page_url):
        """Return the body of ``page_url`` as text."""
        response = requests.get(
            page_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.text

    def _page_links(self, page_url, pattern):
        """Return the distinct matches of ``pattern`` in ``page_url``.

        Matches keep their first-seen order; duplicates are dropped so the
        same link is not requested several times.
        """
        return list(dict.fromkeys(pattern.findall(self._fetch_text(page_url))))

    def get_download_video(self):
        """Collect the video address of every exercise reachable from ``url``.

        Returns:
            dict: Maps each exercise name to its full video URL. When the
            same name is found more than once, the last URL seen wins.
        """
        videos = {}
        # Category pages.
        for classification_url in self._page_links(
                self.url, self._CLASSIFICATION_RE):
            # Sub-category (plan) pages.
            for subclassification_url in self._page_links(
                    self.keep_suburl + classification_url, self._PLAN_RE):
                # Exercise pages.
                for action_url in self._page_links(
                        self.keep_suburl + subclassification_url,
                        self._EXERCISE_RE):
                    html_text = self._fetch_text(self.keep_suburl + action_url)
                    # Names and videos are paired by position, so these two
                    # lists must NOT be de-duplicated.
                    video_urls = self._VIDEO_RE.findall(html_text)
                    actions_names = self._NAME_RE.findall(html_text)
                    for action_name, video_url in zip(actions_names, video_urls):
                        final_video_url = self.keep_video_suburl + video_url
                        print(action_name, final_video_url)
                        videos[action_name] = final_video_url
        return videos

    def download_video(self, video_path, action_name, video):
        """Download one video into ``video_path`` as ``<action_name>.mp4``.

        The download is skipped when a file of at least the advertised size
        already exists. Nothing is written when the server does not answer
        with status 200.

        Args:
            video_path: Directory that receives the file.
            action_name: Exercise name, used as the file name.
            video: Full URL of the video.
        """
        # The name comes from scraped HTML: replace characters that would
        # change the target directory or be rejected by the file system.
        safe_name = self._UNSAFE_FILENAME_RE.sub('_', action_name).strip()
        video_name = os.path.join(video_path, safe_name + '.mp4')
        print('video_name:', video_name)
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # from the original, confirm whether it is still required.
        with closing(requests.get(video, headers=self.headers, stream=True,
                                  verify=False,
                                  timeout=self.REQUEST_TIMEOUT)) as res:
            # Check the status first so an error page is never mistaken for
            # the video.
            if res.status_code != 200:
                print('下载失败:', res.status_code)
                return
            chunk_size = 1024 * 10
            # The header may be absent (e.g. chunked transfer); 0 means the
            # size is unknown.
            content_size = int(res.headers.get('content-length', 0))
            if (content_size and os.path.exists(video_name)
                    and os.path.getsize(video_name) >= content_size):
                print('已下载')
                return
            print('开始下载')
            # Progress can only be reported when the total size is known.
            progress = None
            if content_size:
                progress = ProgressData(size=content_size, unit='Kb',
                                        block=chunk_size, file_name=video_name)
            with open(video_name, "wb") as f:
                for chunk in res.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        if progress is not None:
                            progress.output()

class ProgressData(object):
    """Print the progress of a download that arrives in fixed-size blocks.

    Args:
        block: Size of one block; divided by 1000 before being displayed.
        size: Total size of the download, in the same unit as ``block``;
            also divided by 1000 before being displayed.
        unit: Label printed after every displayed size.
        file_name: Name printed in front of every progress line.
    """

    def __init__(self, block, size, unit, file_name='', ):
        self.file_name = file_name
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more received block and print the current state.

        Every call counts as one full block, so the loaded amount is an
        estimate when a block is shorter than ``block``. Once the estimate
        reaches the total size a completion message is printed instead of
        the progress line.
        """
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Speed of the block that just arrived; 0 when the clock did not
        # advance between two calls.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block

        if loaded >= self.size:
            print('%s下载完成\r\n' % self.file_name)
            return

        # Only reached when loaded < size, hence size > 0: computing the
        # ratio here avoids a ZeroDivisionError for an empty download.
        progress = round(loaded / self.size, 4)
        print('{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(self.file_name, loaded, self.unit, self.size, self.unit, progress, speed, self.unit))
        # Bar of slashes whose length shrinks as the remaining share drops.
        print('%50s' % ('/' * int((1 - progress) * 50)))

def main():
    """Crawl the Keep training pages and download every video found.

    The start page, URL prefixes, request headers and output directory are
    fixed in this function.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    url = 'https://www.gotokeep.com/training'
    keep_suburl = 'https://www.gotokeep.com'
    keep_video_suburl = 'https://static1.keepcdn.com'
    video_path = 'G:/zh/zh/reptile_video/keep_video/'

    # Create the output directory up front; otherwise opening the first
    # video file for writing fails when the directory does not exist.
    os.makedirs(video_path, exist_ok=True)

    k = KeepReptile(url, headers, keep_suburl, keep_video_suburl)
    video_dict = k.get_download_video()
    for action_name, video in video_dict.items():
        k.download_video(video_path, action_name, video)


if __name__ == "__main__":
    main()

关注

1
点赞
踩
8

收藏

觉得还不错? 一键收藏
2
评论
爬虫--keep App

import re import os import time import requests from contextlib import closing class KeepReptile(object): def __init__(self, url, headers, keep_suburl, keep_video_suburl): self.url = url self.headers = headers self....
复制链接

扫一扫

专栏目录

评论 2

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。