Functional code record:
# -*- coding:utf-8 -*-
import os
import sys
import time
import urllib
import Queue
import random
import requests
import threading
import threadpool
from bs4 import BeautifulSoup
# Force the interpreter's default encoding to UTF-8 (Python 2 only).
# setdefaultencoding() is deleted from sys at startup, so the module must
# be reloaded to expose it again.  NOTE(review): this is a well-known
# Python 2 hack that can mask latent unicode bugs — confirm it is needed.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Local directory that mirrors the remote file tree.
DOWNLOAD_DIR = '/home/xxxxxx/files'
# Root of the remote directory-index listing that is crawled.
ROOT_URL = 'https://www.xxxxxx.org/file/'
# Files larger than this (10 MiB) are skipped by download_url_link().
FILE_SIZE_LIMIT = 10 * 1024 * 1024
class Task(threading.Thread):
def __init__(self, t_index, input_queue, error_queue):
super(Task, self).__init__()
self.thread_name = "task-%s" % t_index
self.input_queue = input_queue
self.error_queue = error_queue
self.daemon = True
def run(self):
while True:
try:
print '%s run task!' % self.thread_name
func, args = self.input_queue.get(block=False)
except Queue.Empty:
break
try:
func(*args)
except Exception as e:
self.error_queue.put((func, args, str(e)))
print "%s finished!" % self.thread_name
class TaskPool(object):
def __init__(self, size):
self.input_queue = Queue.Queue()
self.error_queue = Queue.Queue()
self.tasks = [Task(i, self.input_queue, self.error_queue) for i in range(size)]
def add_task(self, func, args):
if not isinstance(args, tuple):
raise TypeError('args must be tuple type!')
self.input_queue.put((func, args))
def add_tasks(self, tasks):
if not isinstance(tasks, list):
raise TypeError('tasks must be list type!')
for (func, args) in tasks:
self.add_task(func, args)
def get_error_result(self):
while not self.error_queue.empty():
func, args, msg = self.error_queue.get()
print "error result func %s args %s msg %s" % (func, args, msg)
def run(self):
print len(self.tasks)
for task in self.tasks:
task.run()
for task in self.tasks:
if task.isAlive():
task.join()
def find_download_url(parent_url, download_url_list):
resp = requests.get(parent_url)
if resp.ok:
html = BeautifulSoup(resp.content, 'html.parser')
a_tags = html.select('pre a')
start_tag = 0
for a_tag in a_tags:
start_tag = start_tag + 1
path = parent_url + str(a_tag.text)
print '%s %s' % (start_tag, path)
if path.endswith('../'):
continue
elif path.endswith('/'):
print '%s continue finding' % path
find_download_url(path, download_url_list)
else:
download_url_list.append(parent_url + str(a_tag['href']))
def download_url_link(download_url):
download_url = str(download_url).strip()
print 'start download url link %s' % download_url
resp = requests.get(download_url)
if resp.ok:
file_size = long(resp.headers['content-length'])
if file_size > FILE_SIZE_LIMIT:
return
download_url_suffix = download_url.replace(ROOT_URL, '')
last_index = download_url_suffix.rindex('/')
download_dir = DOWNLOAD_DIR + download_url_suffix[:last_index + 1]
if not os.path.exists(download_dir):
os.makedirs(download_dir)
with open(DOWNLOAD_DIR + download_url_suffix, 'wb') as df:
df.write(resp.content)
#urllib.urlretrieve(download_url, filename=DOWNLOAD_DIR + download_url_suffix)
else:
print resp.status_code
print resp.content
def gen_download_url_txt():
download_url_list = []
find_download_url(ROOT_URL, download_url_list)
with open('/home/xxxx/project/xxxxxx/download_url.txt', 'wb') as df:
for download_url in download_url_list:
print download_url
df.write(download_url)
df.write('\n')
if __name__ == '__main__':
    # Fan the manifest of URLs out to 5 worker threads via the third-party
    # threadpool package. (The hand-rolled TaskPool class above is an
    # equivalent in-file alternative.)
    taskPool = threadpool.ThreadPool(5)
    args_list = []
    with open('/home/xxxx/project/xxxxxx/download_url.txt', 'r') as df:
        # Iterate the file object directly; readlines() needlessly loaded
        # the whole manifest into memory at once.
        for download_url in df:
            args_list.append(download_url)
    request_list = threadpool.makeRequests(download_url_link, args_list, None)
    # Plain loop: the original abused a list comprehension purely for its
    # side effects, building and discarding a throwaway list.
    for request in request_list:
        taskPool.putRequest(request)
    taskPool.wait()