这里展示如何使用 Python 下载网易公开课视频(这里以 MIT 编程导论为例):先用正则表达式从课程页面中提取出需要下载的 MP4 文件链接,然后使用进程池批量下载到指定目录。
需要注意的是
- 提取的链接需要去重(我在调试时发现有重复的链接)
- 指定目标路径(用于存储结果文件)
源码如下所示(使用 Python 2.7):
import os
import re
import urllib2
import functools
import logging
import multiprocessing
# Emit every record from DEBUG upwards, each prefixed with its timestamp.
logging.basicConfig(
    format='[%(asctime)s] - %(message)s',
    level=logging.DEBUG,
)
# Module-level logger used by the functions in this script.
log = logging.getLogger(__name__)
def fetch_file_links(url, pattern):
    """Fetch the page at ``url`` and return the unique matches of ``pattern``.

    Args:
        url: Address of the page to scan; passed to ``urllib2.urlopen``.
        pattern: Regular expression applied to the raw page body with
            ``re.findall``.

    Returns:
        A ``set`` built from the ``re.findall`` result, so a link that
        appears several times in the page is returned only once.

    Raises:
        Any error raised by ``urllib2.urlopen`` or by reading the response
        is propagated to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the HTTP connection even if reading the body fails.
        response.close()
    # A set removes duplicate matches.
    return set(re.findall(pattern, data))
def _download(link, destination):
    """Download ``link`` into the ``destination`` directory.

    The file is stored under the last path segment of ``link`` and is
    streamed to disk in 64 KiB chunks.

    Args:
        link: URL of the file to fetch; must contain at least one ``/``.
        destination: Directory that receives the file. It is created
            (including parents) when it does not exist yet.

    Raises:
        OSError: If ``destination`` cannot be created and is not already a
            directory.
        Errors raised by ``urllib2.urlopen`` or by file I/O are propagated.
    """
    try:
        os.makedirs(destination)
    except OSError:
        # Several pool workers may try to create the directory at the same
        # time; losing that race is fine as long as the directory exists.
        if not os.path.isdir(destination):
            raise

    file_name = link.rsplit('/', 1)[1]
    log.info('Resolving <%s>...', file_name)
    target_path = os.path.join(destination, file_name)

    # Open the URL before creating the file so that a failed request does
    # not leave an empty file and an unclosed handle behind.
    response = urllib2.urlopen(link)
    try:
        log.info('Saving to <%s>...', target_path)
        with open(target_path, 'wb') as target_file:
            while True:
                data = response.read(64 * 1024)
                if not data:
                    break
                target_file.write(data)
    finally:
        # Always release the connection, even when writing fails.
        response.close()
    log.info('Success to create <%s>!!!', file_name)
def main(blog_url='http://v.163.com/special/opencourse/bianchengdaolun.html',
         destination='D:/workspace/bianchengdaolun'):
    """Collect the MP4 links of a course page and download them in parallel.

    Args:
        blog_url: Page that is scanned for ``href='http://...mp4'`` links.
        destination: Directory in which the downloaded files are stored.

    Raises:
        Errors from fetching the page or from any worker's download are
        propagated after the pool has been shut down.
    """
    # ``[^']*`` keeps each match inside a single quoted attribute value; a
    # greedy ``.*`` could swallow several href attributes on the same line.
    links = fetch_file_links(blog_url, r"href='(http://[^']*\.mp4)'")
    log.info('Totally get %d files...', len(links))

    pool = multiprocessing.Pool(multiprocessing.cpu_count() * 2)
    download = functools.partial(_download, destination=destination)
    try:
        pool.map(download, links)
    finally:
        # join() is only valid after close() or terminate(); calling it on
        # a pool that is still open raises instead of waiting.
        pool.close()
        pool.join()
    log.info('Main exit!!!')
# Start the download only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()