Multithreaded download tool for large numbers of tiny files.

HTTP File Downloader

Written with the Python 2.7 standard library.

# -*- coding: utf-8 -*-
"""
    Multithread download tool for massive tiny files.
"""
#-------------------------
# Author: Kun Liu         
# Start date: 2017-03-06  
# Latest edit: 2017-03-16 
# email = lancelotdev@163.com
# python_version = Python 2.7.11
#===================================


#-----Python 3 Compatible
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
#---------------------------------

import re
import os
import sys
import Queue
import threading
import urllib
import urllib2
import json
import logging
from time import ctime,sleep

# Request parameters that imitate a regular browser.
# NOTE(review): `user_agent` is not referenced by any code visible in this
# file; `headers` carries its own, different User-Agent string.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# Logging setup for the download record: everything from DEBUG upwards goes to
# 'pythonDownload.log', which is opened in 'w' mode (overwritten on each run).
logging.basicConfig(
    filename='pythonDownload.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S'
)


def agent_request(url, req_timeout=40):
    """Fetch *url* using the module-level browser-like ``headers``.

    Args:
        url: Address to request.
        req_timeout: Timeout in seconds handed to ``urllib2.urlopen``.
            Defaults to 40, the value that was previously hard-coded.

    Returns:
        The response body exactly as returned by ``resp.read()``.

    Raises:
        Nothing is caught here; any error raised by ``urllib2.urlopen`` or
        ``read`` propagates to the caller.
    """
    req = urllib2.Request(url, None, headers)
    resp = urllib2.urlopen(req, None, req_timeout)
    try:
        return resp.read()
    finally:
        # Python 2 urllib2 responses are not context managers, so close the
        # connection explicitly instead of relying on garbage collection.
        resp.close()


class DownLoad(threading.Thread):
    """Worker thread that drains a queue of ``(file_name, file_url)`` tuples.

    Every item is fetched with ``agent_request`` and written into
    ``folder_name`` under a sanitised name that ends in ``.torrent``.
    URLs whose download or write fails are appended to ``fail_file_list``
    and reported through ``logging.warning``.
    """

    # Characters removed from a file name before it is written to disk.
    _ILLEGAL_NAME_CHARS = "|\\?*<\":>+[]/'"

    def __init__(self, file_que, folder_name="PyDownload"):
        """Prepare the worker.

        Args:
            file_que: Either a list of ``(file_name, file_url)`` tuples, which
                is copied into a fresh queue, or an existing queue object that
                is used as-is (so several workers can share it).
            folder_name: Directory the files are saved to. It is created when
                missing; an empty string means the current directory.

        Raises:
            OSError: If ``folder_name`` cannot be created.
        """
        threading.Thread.__init__(self)
        if folder_name:
            # Create whichever folder was requested, not only the default one.
            # Another worker may create it concurrently, so tolerate
            # "already exists" and re-raise only when it is really missing.
            try:
                os.makedirs(folder_name)
            except OSError:
                if not os.path.isdir(folder_name):
                    raise
        if isinstance(file_que, list):
            self.que = Queue.Queue()
            for item in file_que:
                self.que.put(item)
        else:
            self.que = file_que
        self.folder_name = folder_name
        self.fail_file_list = []

    @classmethod
    def _legalize_file_name(cls, file_name):
        """Return *file_name* as text with every illegal character removed.

        Byte strings are decoded as UTF-8 (undecodable bytes are replaced) so
        that text and byte names are handled the same way.
        """
        if isinstance(file_name, bytes):
            file_name = file_name.decode("utf-8", "replace")
        illegal = cls._ILLEGAL_NAME_CHARS
        return "".join(ch for ch in file_name if ch not in illegal)

    def run(self):
        """Download queued files until the queue is empty.

        Each success increments the module-level ``success_case_num`` while
        holding the module-level ``mutex``. Each failure is logged and its
        URL is recorded in ``fail_file_list``; the loop then continues with
        the next item.
        """
        global success_case_num
        print("%d thread is working!" % threading.active_count())
        while True:
            # get_nowait() avoids the empty()/get() race: with a shared queue
            # another worker could take the last item between the two calls
            # and leave this thread blocked in get() forever.
            try:
                file_tuple = self.que.get_nowait()
            except Queue.Empty:
                return
            file_url = file_tuple[1]
            try:
                file_name = self._legalize_file_name(file_tuple[0])
                if not file_name.endswith(".torrent"):
                    file_name += ".torrent"
                # Download before opening the target so that a failed request
                # does not leave an empty file behind.
                file_data = agent_request(file_url)
                target_path = os.path.join(self.folder_name, file_name)
                with open(target_path, 'wb') as out_file:
                    out_file.write(file_data)
            except Exception as e:
                self.fail_file_list.append(file_url)
                # Lazy %-formatting: the message is built inside logging, so a
                # non-ASCII error text cannot raise from this handler.
                logging.warning("DownLoad error: %s Fail file: %s", e, file_url)
            else:
                with mutex:
                    success_case_num += 1

# Number of files downloaded successfully; incremented by DownLoad.run.
success_case_num = 0
# Lock that DownLoad.run holds while it updates success_case_num.
mutex = threading.Lock()

class DownLoadDispatcher(object):
    """Hand a list of ``(file_name, file_url)`` tuples to ``DownLoad`` workers."""

    def __init__(self, name_url_tuple_list):
        """Store the work list.

        Args:
            name_url_tuple_list: Iterable of ``(file_name, file_url)`` tuples.
        """
        self.file_list = name_url_tuple_list
        # Threads started by start_download(), kept so callers can join()
        # them or inspect each worker's fail_file_list afterwards.
        self.workers = []

    def start_download(self, thread_num=1, start_interval=1):
        """Queue every file and start the worker threads.

        Args:
            thread_num: Number of ``DownLoad`` threads to start. Defaults to
                1, the value that was previously hard-coded.
            start_interval: Seconds to sleep after starting each thread
                (download frequency control). Defaults to 1.

        Any exception raised while queueing or starting threads is printed
        rather than propagated. Started threads are appended to
        ``self.workers``; the method does not wait for them to finish.
        """
        try:
            file_que = Queue.Queue()
            for f_tuple in self.file_list:
                file_que.put(f_tuple)
            # All workers share one queue, so each file is handled once.
            for _ in range(thread_num):
                worker = DownLoad(file_que)
                worker.start()
                self.workers.append(worker)
                # Download frequency control
                sleep(start_interval)
        except Exception as e:
            print("pic_downloader exception:" + str(e))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值