【日常点滴016】python多线程

浪淘三千

已于 2023-07-13 14:35:06 修改

阅读量799

点赞数

文章标签： python 爬虫 开发语言

于 2023-06-01 16:42:35 首次发布

本文链接：https://blog.csdn.net/weixin_43521165/article/details/130991483

版权

python多线程

函数式多线程

可以在创建线程对象时，通过 args 参数传入线程执行函数的参数（必须是一个元组）。例如：

import threading

def worker(h):
    """Print a start and a finish message tagged with the current thread's name and *h*."""
    thread_name = threading.current_thread().name
    print(f'Worker thread started with name {thread_name} {h}')
    # The thread's actual work would go here.
    print(f'Worker thread finished with name {thread_name} {h}')

# Positional arguments for the target are passed as a tuple; the trailing comma
# makes ('thread1',) a one-element tuple rather than a parenthesised string.
t = threading.Thread(target=worker, args=('thread1',))
t.start()
# There is no join() here, so the main thread does not wait for the worker and
# this line may be printed before the worker's "finished" message.
print('Main thread finished')

如果需要在线程中设置名称，可以通过 Thread 对象的 name 参数来指定，不传入的话，会使用默认的名字
自定义线程名字方式如下：

# name= sets a custom thread name. worker() takes one required argument, so
# args= must still be supplied; without it the thread raises TypeError as soon
# as it is started.
t = threading.Thread(target=worker, name='my_thread', args=('thread1',))

在线程中，获取线程名字方式

# current_thread() returns the Thread object of the caller; .name is its name.
print(
    '当前线程名字：',
    threading.current_thread().name,
)

综合应用（类调用式多线程）

（代码有点老了，怕丢失，发在csdn上，仅做多线程代码了解。
改改结构应该还能用到很多网站上）

import requests  # 斗图网 多线程爬虫
from lxml import etree
from urllib import request
import os
from queue import Queue
import threading

class Procuder(threading.Thread):
    """Producer thread: fetch listing pages and queue the images found on them.

    Each thread repeatedly takes a page URL from ``page_queue``, downloads and
    parses the page, and puts one ``(img_url, filename)`` tuple per image on
    ``img_queue``.  The thread exits once ``page_queue`` has no more URLs.

    Args:
        page_queue: Queue of listing-page URLs to fetch.
        img_queue: Queue that receives ``(img_url, filename)`` tuples.
        *args, **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' }
    # Seconds to wait for a page; without a timeout a stalled server would
    # hang this thread forever.
    request_timeout = 10

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Procuder, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Process page URLs until ``page_queue`` is drained."""
        from queue import Empty

        while True:
            # get_nowait() takes the item and detects emptiness in one step.
            # Checking empty() and then calling a blocking get() lets another
            # thread grab the last URL in between, blocking this one forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Download *url* and queue every non-GIF image found in the page content."""
        response = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        html = etree.HTML(response.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        self._queue_images(imgs)

    def _queue_images(self, imgs):
        """Put ``(img_url, filename)`` on ``img_queue`` for each element in *imgs*.

        Each element only needs a ``get(attribute_name)`` method.  Elements
        without a ``data-original`` URL are skipped; when ``alt`` is missing
        the file name falls back to the URL's base name.
        """
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                continue
            # Split the URL into stem and extension so the saved file keeps its type.
            stem, suffix = os.path.splitext(os.path.basename(img_url))
            alt = img.get('alt') or stem
            self.img_queue.put((img_url, alt + suffix))

class Consumer(threading.Thread):
    """Consumer thread: download the images queued by the producers.

    Each thread repeatedly takes an ``(img_url, filename)`` tuple from
    ``img_queue`` and saves the image under ``save_dir``.  It exits when no
    image arrives within ``poll_timeout`` seconds and ``page_queue`` is empty.

    Args:
        page_queue: Queue of listing-page URLs; only its emptiness is checked.
        img_queue: Queue of ``(img_url, filename)`` tuples to download.
        *args, **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    # Directory the images are written to.
    save_dir = 'images100'
    # Seconds to wait for a new image before checking whether work is finished.
    # This also gives producers that are still parsing a page a grace period;
    # a page that takes longer than this to produce images can still be missed.
    poll_timeout = 3

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until both queues are exhausted."""
        from queue import Empty

        while True:
            # A timed get() replaces "check empty(), then block on get()":
            # with several consumers another thread can take the last item
            # between the check and the get, blocking this thread forever.
            try:
                img_url, filename = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            self._download(img_url, filename)

    def _download(self, img_url, filename):
        """Save *img_url* as *filename* inside ``save_dir``; report failures without raising."""
        # Created lazily so the target directory exists before the first write.
        os.makedirs(self.save_dir, exist_ok=True)
        try:
            request.urlretrieve(img_url, os.path.join(self.save_dir, filename))
        except (OSError, ValueError) as exc:
            # OSError covers network/HTTP errors and unwritable paths;
            # ValueError is raised for malformed URLs.  One bad image must not
            # stop this worker.
            print(f'{filename} download failed: {exc}')
            return
        print(filename)

def main():
    """Queue listing pages 1-99, then start 8 producer and 8 consumer threads."""
    page_queue = Queue(100)
    img_queue = Queue(10000)

    for page_no in range(1, 100):
        page_queue.put('https://www.doutula.com/photo/list/?page=%s' % page_no)

    # Producers are started first, then consumers; both share the two queues.
    for worker_cls in (Procuder, Consumer):
        for _ in range(8):
            worker_cls(page_queue, img_queue).start()


# Run the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()

一个简洁的多线程图片下载示例（图片链接从文件中读取）

import requests     
from urllib import request
import os
from queue import Queue
import threading
import pandas, time

# Set to True by the producer's run() after it has put every URL on the queue;
# consumers read it to decide when an empty queue means the work is finished.
img_queue_over = False
# Running total of downloaded images; each consumer adds its own count to it
# when it finishes.
all_down_num = 0
class Procuder(threading.Thread):
    """Producer thread that copies a list of image URLs onto a queue.

    Args:
        img_queue: Queue that receives the image URLs.
        df_downimg_url_list: Iterable of image URLs to enqueue.
        *args, **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' }

    def __init__(self, img_queue,df_downimg_url_list, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.df_downimg_url_list = df_downimg_url_list
        self.img_queue = img_queue

    def run(self):
        """Enqueue every URL, report the queue size, then set ``img_queue_over``."""
        global img_queue_over

        enqueue = self.img_queue.put
        for url in self.df_downimg_url_list:
            enqueue(url)

        queued = self.img_queue.qsize()
        print('插入队列数据量为：', queued)
        # The flag is raised only after the last put().
        img_queue_over = True
            
        
class Consumer(threading.Thread):
    """Consumer thread: download every image URL taken from ``img_queue``.

    The thread keeps downloading until the producer has set the module-level
    ``img_queue_over`` flag and the queue is drained, then adds its own count
    to the module-level ``all_down_num`` total and prints a report.  Files are
    written into the module-level ``dir_name`` directory.

    Args:
        img_queue: Queue of image URLs to download.
        *args, **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    # Seconds to wait for a URL before re-checking whether the producer is done.
    poll_timeout = 1
    # Guards the read-modify-write on all_down_num, which is shared by all consumers.
    _total_lock = threading.Lock()

    def __init__(self, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.img_queue = img_queue
        self.down_num = 0

    def run(self):
        """Download queued images until the producer is finished and the queue is empty."""
        from queue import Empty

        while True:
            # Snapshot the flag BEFORE polling: the producer raises it only
            # after its last put(), so "flag already set, then queue empty"
            # proves nothing more will arrive.
            producer_done = img_queue_over
            # A timed get() replaces "check empty(), then block on get()",
            # which deadlocked when two consumers raced for the last item.
            try:
                img_url = self.img_queue.get(timeout=self.poll_timeout)
            except Empty:
                if producer_done:
                    break
                continue

            if not self._download(img_url):
                continue
            self.down_num += 1
            if self.down_num % 100 == 0:
                print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} 线程{self.name}报告:当前处理了{self.down_num}张图片')

        self._report_finished()

    def _download(self, img_url):
        """Save *img_url* into ``dir_name``; return True on success, False on failure."""
        target = os.path.join(dir_name, os.path.basename(img_url))
        try:
            request.urlretrieve(img_url, target)
        except (OSError, ValueError) as exc:
            # OSError covers network/HTTP errors and unwritable paths;
            # ValueError is raised for malformed URLs.  One bad URL must not
            # stop this worker.
            print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} 线程{self.name}报告:下载失败 {img_url} ({exc})')
            return False
        return True

    def _report_finished(self):
        """Add this thread's count to the shared total and print the final reports."""
        global all_down_num
        with self._total_lock:
            all_down_num += self.down_num
            total = all_down_num

        print(f'{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} 线程{self.name}报告:处理完毕，其共下载数量为：{self.down_num},所有线程共处理{total}张图片')
        active_threads_num = threading.active_count()
        # At most two live threads (presumably the main thread plus this one)
        # is treated as "this is the last consumer".
        if active_threads_num <= 2:
            print()
            print()
            print(f'~!! ^_^ !!~线程 {self.name} 报告:~程序处理完毕~~{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}当前共下载图片量为:{total}--]：当前活跃线程数量 {active_threads_num}')

    
def main():
    """Read the task CSV, select the image URLs to fetch, and start the worker threads."""
    img_queue = Queue(12000)

    task_table_path = r"近期未回调结果_审查3.csv"
    task_table = pandas.read_csv(task_table_path,encoding="gbk")
    # Keep only the rows matching the query below; their ImagePath values are the URLs.
    pending_rows = task_table.query('Message == "成功" and in_datainlog_and_dataoutlog_and_inspectiondata == False ')
    df_downimg_url_list = pending_rows['ImagePath'].values.tolist()
    print(f"共有{len(df_downimg_url_list)}张图片要下载")

    # One producer fills the queue; consumers are started five seconds later.
    Procuder(img_queue,df_downimg_url_list).start()
    time.sleep(5)
    for _ in range(8):
        Consumer(img_queue).start()


if __name__ == '__main__':
    # dir_name is a module-level name that Consumer reads when saving files.
    dir_name = "图片集0713"
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
        print("文件夹创建成功")
    main()