当时追剧的时候写的。www.h6080.com还是有不少好东西。用到了wget和progressbar，需要的朋友可以借鉴下。
import requests
from lxml import etree
import re
from multiprocessing import Pool
import wget
import progressbar
def get_page_url():
    """Build the list of player-page URLs to scrape.

    Page numbers 1-31 and 37-60 are covered; 32-36 are skipped.
    NOTE(review): the reason for the gap is not visible in this file.

    Returns:
        A list of 55 URL strings, in ascending page-number order.
    """
    # Materialise both ranges as lists before joining them: on Python 3
    # range() returns an immutable sequence that has no extend().
    page_numbers = list(range(1, 32)) + list(range(37, 61))
    return ['http://www.h6080.com/play/24892/1/{}.html'.format(i) for i in page_numbers]
def get_video_url(page_url):
    """Fetch a page and extract the player source address embedded in it.

    The page is downloaded with a GET request, the text of the <script>
    element(s) under the node with id="player" is selected, and the first
    ``src="..."`` value in the first such script is returned, cut off at
    the first ``&``.

    Args:
        page_url: Address of the page to download.

    Returns:
        The captured ``src`` value (everything before the first ``&``).

    Raises:
        IndexError: If no player script is found, or it contains no match.
    """
    html = requests.get(page_url).text
    # The parser is handed UTF-8 bytes rather than the decoded text.
    tree = etree.HTML(html.encode('utf-8'))
    player_scripts = tree.xpath('//*[@id="player"]/script/text()')
    first_script = player_scripts[0]
    candidates = re.findall('src="(.*?)&', first_script)
    return candidates[0]
def get_mp4(video_url):
    """Query a player address and extract the video address from the reply.

    A POST request is sent to ``video_url`` with the module-level ``headers``
    dict, and the text captured by the ``video=.'(...)']`` pattern in the
    response body is returned (presumably the direct MP4 link, given how the
    result is used by the download loop - verify against a live response).

    NOTE: ``headers`` is only assigned inside the ``__main__`` section of this
    file, so calling this function after a plain ``import`` raises NameError.

    Args:
        video_url: Address to send the POST request to.

    Returns:
        The captured address as a native ``str`` on both Python 2 and 3.

    Raises:
        IndexError: If the response body contains no match for the pattern.
    """
    body = requests.post(video_url, headers=headers).text
    # Search the decoded text directly: applying a str pattern to encoded
    # bytes raises TypeError on Python 3.
    mp4_url = re.findall("video=.'(.*)']", body)[0]
    # On Python 2 the match is a ``unicode`` object; encode it so the
    # function keeps returning a UTF-8 byte string there. On Python 3 the
    # match is already ``str`` and is returned unchanged.
    if not isinstance(mp4_url, str):
        mp4_url = mp4_url.encode('utf-8')
    return mp4_url
def save_mp4_urls(video_urls, filename='mp4_urls.txt'):
    """Write the addresses to a text file, one numbered entry per line.

    Each line has the form ``<n> <url>`` where ``n`` counts from 1. Any
    existing file at ``filename`` is overwritten.

    Args:
        video_urls: Iterable of address strings to store.
        filename: Path of the file to write. Defaults to ``'mp4_urls.txt'``,
            the path this function always wrote to before the parameter
            existed.
    """
    with open(filename, 'w') as f:
        f.writelines(
            str(n) + ' ' + url + '\n'
            for n, url in enumerate(video_urls, 1)
        )
def read_mp4_urls(filename):
    """Read back the addresses stored as ``<n> <url>`` lines.

    Everything after the first space on each line is taken as the address.
    Empty lines are skipped, and the last line does not need a trailing
    newline.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of address strings in file order.

    Raises:
        ValueError: If a non-empty line contains no space separator.
    """
    urls = []
    with open(filename, 'r') as f:
        for line_no, raw_line in enumerate(f, 1):
            # Drop the line terminator (LF or CRLF) without requiring one,
            # so a final line that lacks a newline is still accepted.
            line = raw_line.rstrip('\r\n')
            if not line:
                continue
            _, separator, url = line.partition(' ')
            if not separator:
                raise ValueError(
                    '{}:{}: expected "<index> <url>", got {!r}'.format(
                        filename, line_no, line))
            urls.append(url)
    return urls
if __name__ == '__main__':
    # Kept as a module-level global on purpose: get_mp4() reads ``headers``
    # when it issues its request.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

    # Stage 1 (disabled): scrape every page, resolve the video addresses and
    # cache them in mp4_urls.txt. Re-enable to regenerate the cache file.
    # video_urls = list(map(get_video_url, get_page_url()))
    # mp4_urls = list(map(get_mp4, video_urls))
    # print(len(video_urls))
    # print(mp4_urls)
    # save_mp4_urls(mp4_urls)

    # Stage 2: download each cached address in turn as 1.mp4, 2.mp4, ...
    mp4_urls = read_mp4_urls('mp4_urls.txt')
    bar = progressbar.ProgressBar()
    # Wrap the list itself rather than the enumerate object: the list has a
    # length, so the bar can work out the total number of downloads.
    for i, url in enumerate(bar(mp4_urls)):
        filename = str(i + 1) + '.mp4'
        # Single-argument call form prints the same text on Python 2 and 3.
        print(filename)
        wget.download(url, filename)

    # Alternative (disabled): download with a pool of 4 worker processes.
    # p = Pool(4)
    # for index, url in enumerate(bar(mp4_urls)):
    #     filename = str(index + 1) + '.mp4'
    #     p.apply_async(wget.download, (url, filename, ))
    # p.close()
    # p.join()