Multithreaded Downloading
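The script below downloads a list of (url, save_path) tasks concurrently: every task is placed on a thread-safe Queue, a fixed number of worker threads drain the queue, and any task that raises during download is appended to failed.txt so it can be retried later.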
import os
import shutil
import traceback
from queue import Empty, Queue
from threading import Lock, Thread

import requests


class Spider:
    def __init__(self, task_list, thread_num):
        """
        Multithreaded downloader.
        :param task_list: [(url, save_path), (url, save_path), ...]
        :param thread_num: number of worker threads
        """
        self.tasks = Queue()
        self.task_list = task_list
        self.thread_num = thread_num
        self.lock = Lock()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

    def urls_in(self):
        """Enqueue all tasks."""
        for task in self.task_list:
            self.tasks.put(task)

    def download(self, url, save_path):
        if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
            print(f'{url} already downloaded, skipping\n')
            return
        print(f'Downloading: {url}\n')
        # `or "."` guards against a bare filename, whose dirname is ''
        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        with requests.get(url, headers=self.headers, stream=True, timeout=10) as resp:
            resp.raise_for_status()  # fail loudly instead of saving an HTTP error page
            resp.raw.decode_content = True  # transparently decode gzip/deflate bodies
            with open(save_path, 'wb') as f:
                shutil.copyfileobj(resp.raw, f)

    def write_failed_txt(self, task):
        # Serialize writes: several workers may fail at the same time.
        with self.lock:
            with open('failed.txt', 'a', encoding='utf-8') as f:
                f.write(str(task))
                f.write(',\n')

    def crawl(self):
        while True:
            # get_nowait() avoids the race where empty() was False but another
            # worker consumed the last task before a blocking get() ran.
            try:
                task = self.tasks.get_nowait()
            except Empty:
                break
            try:
                self.download(url=task[0], save_path=task[1])
            except Exception:
                self.write_failed_txt(task)
                print(traceback.format_exc())
            finally:
                self.tasks.task_done()

    def run(self):
        self.urls_in()
        for _ in range(self.thread_num):
            t = Thread(target=self.crawl)
            t.start()
        self.tasks.join()

if __name__ == '__main__':
    spider = Spider(
        task_list=[
            ('https://assets.mixkit.co/active_storage/sfx/833/833-preview.mp3',
             './mixkit/More/Construction/Metal hammer hit.mp3'),
        ],
        thread_num=4
    )
    spider.run()
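
For comparison, the standard library's concurrent.futures module implements the same worker-pool pattern with less plumbing. The sketch below is a minimal alternative, not part of the original script: it assumes the same (url, save_path) task format and reuses the download and write_failed_txt methods defined above; run_with_pool is a hypothetical helper name.

from concurrent.futures import ThreadPoolExecutor, as_completed


def run_with_pool(spider, thread_num=4):
    # Hypothetical helper (not in the original script): submit every task to a
    # thread pool and record any task whose download raised an exception.
    with ThreadPoolExecutor(max_workers=thread_num) as pool:
        future_to_task = {pool.submit(spider.download, url, path): (url, path)
                          for url, path in spider.task_list}
        for future in as_completed(future_to_task):
            if future.exception() is not None:
                spider.write_failed_txt(future_to_task[future])

The pool handles thread creation, the work queue, and shutdown itself, so no manual Queue, Lock, or task_done() bookkeeping is needed.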