# HTTP files downloader, using only the Python 2.7 standard library.
# -*- coding: utf-8 -*-
"""
Multithreaded download tool for large numbers of small files.
"""
#-------------------------
# Author: Kun Liu
# Start date: 2017-03-06
# Latest edit: 2017-03-16
# email = lancelotdev@163.com
# python_version = Python 2.7.11
#===================================
#-----Python 3 Compatible
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
#---------------------------------
import re
import os
import sys
import Queue
import threading
import urllib
import urllib2
import json
import logging
from time import ctime,sleep
# Request parameters that make the client look like a regular browser.
# NOTE: `headers` carries its own User-Agent value; `user_agent` is a
# separate, older-style string kept alongside it.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
# Header set handed to urllib2.Request by agent_request().
headers = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) '
        'like Gecko'
    ),
}
# Logging setup for the download record: messages of level DEBUG and above
# are written to 'pythonDownload.log', which is opened with mode 'w'.
logging.basicConfig(
    filename='pythonDownload.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
)
def agent_request(url, timeout=40):
    """Fetch *url* using the module-level browser-like ``headers``.

    Args:
        url: Address to request.
        timeout: Value passed to ``urllib2.urlopen`` as its timeout
            (seconds). Defaults to 40, the value this function always used.

    Returns:
        The complete response body as returned by ``resp.read()``.

    Errors raised by ``urllib2`` are not caught here and propagate to the
    caller.
    """
    req = urllib2.Request(url, None, headers)
    resp = urllib2.urlopen(req, None, timeout)
    try:
        return resp.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        resp.close()
class DownLoad(threading.Thread):
    """Worker thread that downloads ``(file_name, file_url)`` pairs.

    Pairs are taken from a queue until it is empty. Each file is fetched
    with ``agent_request`` and saved into ``folder_name``. URLs that could
    not be downloaded or saved are collected in ``fail_file_list``; each
    success increments the module-level ``success_case_num`` while holding
    the module-level ``mutex``.
    """

    # Characters removed from a file name before it is used on disk.
    _ILLEGAL_CHARS = re.compile(r"""[|\\?*<":>+\[\]/']""")
    # Extension every saved file is given.
    _SUFFIX = ".torrent"

    def __init__(self, file_que, folder_name = "PyDownload"):
        """Prepare the worker.

        Args:
            file_que: Either a list of ``(file_name, file_url)`` pairs, which
                is copied into a private queue, or an existing queue object
                that is used as-is (and may be shared with other workers).
            folder_name: Directory the files are saved into. It is created
                when it does not exist yet.
        """
        threading.Thread.__init__(self)
        # Create the target folder for any folder name, not only the default.
        if folder_name and not os.path.isdir(folder_name):
            try:
                os.makedirs(folder_name)
            except OSError:
                # Another worker may have created it between check and call.
                if not os.path.isdir(folder_name):
                    raise
        if isinstance(file_que, list):
            self.que = Queue.Queue()
            for item in file_que:
                self.que.put(item)
        else:
            self.que = file_que
        self.folder_name = folder_name
        self.fail_file_list = []

    @classmethod
    def _legal_name(cls, file_name):
        """Return *file_name* without illegal characters, ending in the suffix."""
        cleaned = cls._ILLEGAL_CHARS.sub("", file_name)
        if not cleaned.endswith(cls._SUFFIX):
            cleaned += cls._SUFFIX
        return cleaned

    def run(self):
        """Download queued files until the queue is empty, then return."""
        global success_case_num
        print("%d thread is working!" % threading.active_count())
        while True:
            # get_nowait() instead of empty()+get(): with a shared queue
            # another worker may take the last item between the two calls,
            # which would leave this thread blocked forever.
            try:
                file_tuple = self.que.get_nowait()
            except Queue.Empty:
                return
            file_name = file_tuple[0]
            file_url = file_tuple[1]
            try:
                target_path = os.path.join(self.folder_name,
                                           self._legal_name(file_name))
                # Fetch first so a failed download leaves no empty file.
                file_data = agent_request(file_url)
                with open(target_path, 'wb') as out_file:
                    out_file.write(file_data)
                with mutex:
                    success_case_num += 1
            except Exception as e:
                # Best effort: record the failure and go on with the queue.
                self.fail_file_list.append(file_url)
                logging.warning("DownLoad error: %s Fail file: %s",
                                e, file_url)
# Count of successfully downloaded files, updated by DownLoad.run().
success_case_num = 0
# Lock held by DownLoad.run() while it increments success_case_num.
mutex = threading.Lock()
class DownLoadDispatcher:
    """Put ``(file_name, file_url)`` pairs on a queue and start workers."""

    def __init__(self, name_url_tuple_list):
        """Remember the pairs to download.

        Args:
            name_url_tuple_list: Iterable of ``(file_name, file_url)`` pairs.
        """
        self.file_list = name_url_tuple_list

    def start_download(self, thread_num=1, interval=1):
        """Queue every pair and start the DownLoad worker threads.

        Args:
            thread_num: Number of worker threads to start. Defaults to 1,
                the count that used to be hard-coded.
            interval: Seconds to sleep after starting each worker
                (download frequency control). Defaults to 1.

        Returns:
            The list of workers that were started, so the caller can
            ``join()`` them or read their ``fail_file_list``. Any exception
            raised while queueing or starting is printed, and the workers
            started so far are still returned.
        """
        workers = []
        try:
            file_que = Queue.Queue()
            for f_tuple in self.file_list:
                file_que.put(f_tuple)
            # All workers share one queue.
            for _ in range(thread_num):
                worker = DownLoad(file_que)
                worker.start()
                workers.append(worker)
                # Download frequency control
                sleep(interval)
        except Exception as e:
            print("pic_downloader exception:" + str(e))
        return workers