import requests
from lxml import etree
import json
import os
from selenium import webdriver
import time
class LoadVideos(object):
    """Download the videos listed in the 365yg.com video feed.

    Typical use: call get_video_info() to fill ``video_list`` from the feed
    API, then load_video_data() to open each video page in PhantomJS, read
    the media URL out of the rendered HTML and store the file through the
    module-level save_video(). Call close() (or use the instance in a
    ``with`` statement) when finished so the PhantomJS process is shut down.
    """

    # Seconds to wait on a single HTTP connect/read before giving up, so a
    # stalled server cannot hang the run indefinitely.
    REQUEST_TIMEOUT = 30

    # PhantomJS binary used when the caller does not supply a path.
    DEFAULT_PHANTOMJS_PATH = '/Users/zhangninglei/Downloads/phantomjs-2.1.1-macosx/bin/phantomjs'

    def __init__(self, phantomjs_path=None):
        """Set up the endpoints, request headers and the headless browser.

        Args:
            phantomjs_path: Path to the PhantomJS executable. When omitted,
                DEFAULT_PHANTOMJS_PATH is used.
        """
        self.index_url = 'http://www.365yg.com/'
        self.json_url = 'http://www.365yg.com/api/pc/feed/?category=video&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1654A545ACFD9C&cp=5A4A0F0D29FC7E1&_signature='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
        self.webdriver = webdriver.PhantomJS(phantomjs_path or self.DEFAULT_PHANTOMJS_PATH)
        # Maps video id (as a string) -> URL of the page that plays it.
        self.video_list = {}

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down on leaving the ``with`` block.

        Returns None, so an exception raised inside the block propagates.
        """
        self.close()

    def close(self):
        """Quit the PhantomJS browser started by __init__."""
        self.webdriver.quit()

    def get_video_info(self):
        """Fetch the feed JSON and record every video's page URL.

        Fills ``self.video_list`` with ``{video_id: index_url + source_url}``.

        Raises:
            requests.RequestException: The request failed, timed out or
                returned an HTTP error status.
        """
        r = requests.get(url=self.json_url, headers=self.headers,
                         timeout=self.REQUEST_TIMEOUT)
        # Fail here on 4xx/5xx instead of trying to parse an error page.
        r.raise_for_status()
        obj = json.loads(r.text)
        # A payload without 'data' (or with a null one) yields no videos.
        for video in obj.get('data') or []:
            video_id = video.get('video_id')
            source_url = video.get('source_url')
            # An entry missing either field cannot be downloaded; skip it
            # rather than abort the whole listing with a KeyError.
            if not video_id or not source_url:
                continue
            # Keys are stored as strings because they are later concatenated
            # into messages and used as file names.
            self.video_list[str(video_id)] = self.index_url + source_url

    def load_video_data(self):
        """Download and save every video recorded in ``self.video_list``.

        Each page is opened in the browser, the first
        ``<video class="vjs-tech"><source src=...>`` URL is read from the
        rendered HTML, and the response body is handed to save_video().
        Pages without such an element are reported and skipped.

        Raises:
            requests.RequestException: A video download failed, timed out or
                returned an HTTP error status.
        """
        for name, page_url in self.video_list.items():
            # Send the request through the browser
            self.webdriver.get(page_url)
            # Pause to give the page time to load its data
            time.sleep(5)
            # Grab the rendered page source
            html = self.webdriver.page_source
            # Parse the page; etree.HTML returns None for an empty document
            html_tree = etree.HTML(html)
            if html_tree is None:
                sources = []
            else:
                sources = html_tree.xpath('//video[@class="vjs-tech"]/source/@src')
            if not sources:
                # No player on this page: move on instead of raising
                # IndexError and losing the rest of the batch.
                print('未找到'+name+'的视频地址,已跳过!')
                continue
            video_src = sources[0]
            print('开始加载'+name+'的数据!')
            r = requests.get(url=video_src, headers=self.headers,
                             timeout=self.REQUEST_TIMEOUT)
            # Do not write an HTTP error body to disk as if it were a video.
            r.raise_for_status()
            print(name + '的数据加载完毕!')
            # Save to local disk
            print('将'+name+'保存到本地!')
            save_video(filename=name, data=r.content)
            print(name+'已成功保存!')
def save_video(filename, data):
    """Write ``data`` to ``<current working directory>/video/<filename>.mp4``.

    The ``video`` directory is created when it does not exist yet, so the
    first save in a fresh working directory does not fail. An existing file
    with the same name is overwritten.

    Args:
        filename: Base name of the file, without the ``.mp4`` extension.
        data: Raw bytes to write.
    """
    video_dir = os.path.join(os.getcwd(), 'video')
    # exist_ok keeps repeated calls from failing once the directory exists.
    os.makedirs(video_dir, exist_ok=True)
    filepath = os.path.join(video_dir, '{}.mp4'.format(filename))
    with open(filepath, 'wb') as f1:
        f1.write(data)
def main():
    """Collect the video list, download every video, then quit the browser."""
    loadvideo = LoadVideos()
    try:
        loadvideo.get_video_info()
        loadvideo.load_video_data()
    finally:
        # LoadVideos starts a PhantomJS process in its constructor; quit it
        # even when a step above raises so the process is not left running.
        loadvideo.webdriver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
爬取阳光宽频网的视频
最新推荐文章于 2024-04-25 16:55:45 发布