函数式多线程
可以在创建线程对象时,通过 args 参数传入线程执行函数的参数(必须是一个元组)。例如:
import threading
def worker(h):
    """Print a start line and a finish line tagged with the current thread's name.

    Args:
        h: Any value; it is appended to both messages.
    """
    print(f'Worker thread started with name {threading.current_thread().name} {h}')
    # The thread's real work would go here.
    print(f'Worker thread finished with name {threading.current_thread().name} {h}')
# Positional arguments for the target are passed as a tuple; the trailing
# comma is what makes ('thread1',) a one-element tuple.
t = threading.Thread(target=worker, args=('thread1',))
# start() launches worker('thread1') in the new thread.
t.start()
# There is no join(), so this line may be printed before the worker's output.
print('Main thread finished')
如果需要为线程设置名称,可以通过 Thread 构造函数的 name 参数来指定;不传入的话,会使用默认的名字(形如 Thread-N)。
自定义线程名字方式如下:
# name= sets the thread's name. worker() takes one positional argument, so it
# still has to be supplied through args, otherwise start() fails in the thread
# with a TypeError.
t = threading.Thread(target=worker, args=('thread1',), name='my_thread')
在线程中获取当前线程名字的方式如下:
# threading.current_thread() returns the Thread object of the caller;
# its .name attribute is the thread's name.
print('当前线程名字:',threading.current_thread().name)
综合应用(类调用式多线程)
(代码有点老了,怕丢失,发在csdn上,仅做多线程代码了解。
改改结构应该还能用到很多网站上)
import requests  # 斗图网 多线程爬虫
from lxml import etree
from urllib import request
import os
from queue import Queue
import threading
class Procuder(threading.Thread):
    """Producer thread: fetches list pages and queues the images found on them.

    Each instance takes page URLs from ``page_queue``, downloads and parses the
    page, and puts one ``(img_url, filename)`` tuple per image on ``img_queue``.
    """

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' }
    # Seconds to wait for a page before giving up on it.
    request_timeout = 10

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two queues; extra arguments go to ``threading.Thread``."""
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse page URLs until ``page_queue`` is empty."""
        from queue import Empty  # the file only imports ``Queue`` at top level

        while True:
            try:
                # Non-blocking on purpose: checking empty() and then calling a
                # blocking get() lets another producer take the last URL in
                # between, which would leave this thread blocked forever.
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Download one list page and queue every non-gif image on it.

        Args:
            url: Address of the list page to fetch.
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.request_timeout)
        except requests.RequestException as exc:
            # One bad page should not take the whole producer thread down.
            print(f'Failed to fetch {url}: {exc}')
            return
        html = etree.HTML(response.text)
        if html is None:
            # etree.HTML returns None when there is nothing to parse.
            return
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                # Without a source URL there is nothing to download.
                continue
            filename = self._build_filename(img_url, img.get('alt'))
            self.img_queue.put((img_url, filename))

    @staticmethod
    def _build_filename(img_url, alt):
        """Return a file name made from the alt text plus the URL's extension.

        Characters that are not allowed in file names are removed from ``alt``;
        if nothing usable is left, the base name of the URL is used instead.
        """
        suffix = os.path.splitext(img_url)[1]  # split off the file extension
        forbidden = '\\/:*?"<>|\r\n\t'
        stem = ''.join(ch for ch in (alt or '') if ch not in forbidden).strip()
        if not stem:
            stem = os.path.splitext(os.path.basename(img_url))[0]
        return stem + suffix
class Consumer(threading.Thread):
    """Consumer thread: downloads queued images into the ``images100`` directory.

    Items on ``img_queue`` are ``(img_url, filename)`` tuples.
    """

    # Seconds to wait for a new image before checking whether the work is over.
    poll_timeout = 3

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the two queues; extra arguments go to ``threading.Thread``."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until no image arrives and no page is left."""
        from queue import Empty  # the file only imports ``Queue`` at top level

        # urlretrieve does not create the target directory itself.
        os.makedirs('images100', exist_ok=True)
        while True:
            try:
                # A timed get() cannot block forever the way a bare get() after
                # an empty() check can when another consumer takes the last item.
                img_url, filename = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                # Stop only after a full poll interval without images AND with
                # no pages left to parse.
                # NOTE: a producer that needs longer than poll_timeout to parse
                # its final page can still be missed.
                if self.page_queue.empty():
                    break
                continue
            try:
                request.urlretrieve(img_url, os.path.join('images100', filename))
            except (OSError, ValueError) as exc:
                # URLError is an OSError; a malformed URL raises ValueError.
                # Skip the image instead of letting the thread die.
                print(f'Failed to download {img_url}: {exc}')
                continue
            print(filename)
def main():
    """Queue list pages 1-99, then run 8 producer and 8 consumer threads."""
    page_queue = Queue(100)
    img_queue = Queue(10000)
    # The consumers save into this directory and urlretrieve will not create it.
    os.makedirs('images100', exist_ok=True)
    for x in range(1, 100):
        url = 'https://www.doutula.com/photo/list/?page=%s' % x
        page_queue.put(url)
    # Producers first, then consumers, matching the start order of the threads.
    workers = [Procuder(page_queue, img_queue) for _ in range(8)]
    workers.extend(Consumer(page_queue, img_queue) for _ in range(8))
    for t in workers:
        t.start()
    # Return only once every thread has finished.
    for t in workers:
        t.join()


if __name__ == '__main__':
    main()
一个简洁的多线程图片下载示例(图片链接从文件中读取)
import requests
from urllib import request
import os
from queue import Queue
import threading
import pandas, time
img_queue_over = False
all_down_num = 0
class Procuder(threading.Thread):
    """Producer thread: puts every image URL on the queue, then signals completion.

    Completion is signalled by setting the module-level ``img_queue_over`` flag.
    """

    # Not referenced by this class in the code shown here.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' }

    def __init__(self, img_queue,df_downimg_url_list, *args, **kwargs):
        """Store the queue and the iterable of URLs; extra arguments go to ``threading.Thread``."""
        super(Procuder, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.df_downimg_url_list = df_downimg_url_list

    def run(self):
        """Enqueue all URLs, report the queue size and set ``img_queue_over``."""
        global img_queue_over
        try:
            for img_url in self.df_downimg_url_list:
                self.img_queue.put(img_url)
            print('插入队列数据量为:',self.img_queue.qsize())
        finally:
            # Set the flag even if filling the queue failed, so that code
            # waiting on it is not left waiting forever.
            img_queue_over = True
class Consumer(threading.Thread):
    """Consumer thread: downloads every URL taken from ``img_queue``.

    Files are saved under the module-level ``dir_name`` directory. The thread
    stops once the queue is empty and the module-level ``img_queue_over`` flag
    is set, and then adds its own count to the module-level ``all_down_num``.
    """

    # Seconds to wait for a URL before re-checking the stop condition.
    poll_timeout = 1
    # Shared by all consumers; guards updates of the global all_down_num.
    _total_lock = threading.Lock()

    def __init__(self, img_queue, *args, **kwargs):
        """Store the queue; extra arguments go to ``threading.Thread``."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.down_num = 0  # images downloaded by this thread

    def run(self):
        """Download queued images until the producer is done and the queue is empty."""
        from queue import Empty  # the file only imports ``Queue`` at top level

        while True:
            if self.img_queue.empty() and img_queue_over:
                self._report_finished()
                break
            try:
                # A timed get() lets the loop come back to the stop check. A
                # bare get() blocks forever when another consumer takes the
                # last URL between the empty() check and the get().
                img_url = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                continue
            try:
                request.urlretrieve(img_url, os.path.join(dir_name,os.path.basename(img_url)))
            except (OSError, ValueError, TypeError) as exc:
                # URLError is an OSError, a malformed URL raises ValueError and
                # a non-string entry raises TypeError. Skip that image rather
                # than letting the whole thread die.
                print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} 线程{self.name}报告:下载失败 {img_url}:{exc}')
                continue
            self.down_num += 1
            # down_num is at least 1 here, so this fires at 100, 200, ...
            if self.down_num % 100 == 0:
                print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} 线程{self.name}报告:当前处理了{self.down_num}张图片')

    def _report_finished(self):
        """Add this thread's count to the global total and print the summary."""
        global all_down_num
        with self._total_lock:
            # += on a global is a read-modify-write; the lock keeps concurrent
            # consumers from losing updates.
            all_down_num += self.down_num
            total = all_down_num
        print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} 线程{self.name}报告:处理完毕,其共下载数量为:{self.down_num},所有线程共处理{total}张图片')
        active_threads_num = threading.active_count()
        # With at most two live threads the final banner is printed
        # (presumably the main thread plus this one -- confirm).
        if active_threads_num <= 2:
            print()
            print()
            print(f'~!! ^_^ !!~线程 {self.name} 报告:~程序处理完毕~~{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}当前共下载图片量为:{total}--]:当前活跃线程数量 {active_threads_num}')
def main():
    """Read the task table, pick the image URLs to fetch and start the threads.

    Rows are kept when ``Message`` equals "成功" and
    ``in_datainlog_and_dataoutlog_and_inspectiondata`` is False; their
    ``ImagePath`` values are the URLs handed to the producer.
    """
    url_queue = Queue(12000)
    table = pandas.read_csv(r"近期未回调结果_审查3.csv",encoding="gbk")
    selected_rows = table.query('Message == "成功" and in_datainlog_and_dataoutlog_and_inspectiondata == False ')
    df_downimg_url_list = selected_rows['ImagePath'].values.tolist()
    print(f"共有{len(df_downimg_url_list)}张图片要下载")

    # A single producer thread.
    Procuder(url_queue,df_downimg_url_list).start()

    # Pause before starting the consumers (presumably to let the producer
    # fill the queue first -- confirm).
    time.sleep(5)

    for _ in range(8):
        Consumer(url_queue).start()


if __name__ == '__main__':
    # Module-level on purpose: Consumer.run reads dir_name as a global.
    dir_name = "图片集0713"
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
        print("文件夹创建成功")
    main()