Python下的多线程处理

最新推荐文章于 2022-11-02 20:04:18 发布

houzeyu666

最新推荐文章于 2022-11-02 20:04:18 发布

阅读量8.9k

点赞数 9

分类专栏：学习

本文链接：https://blog.csdn.net/houzeyu666/article/details/82781636

版权

学习专栏收录该内容

48 篇文章 2 订阅

订阅专栏

########多线程#########

## 线程的概念与多线程的引入

一个标准的线程由线程ID，当前指令指针(PC），寄存器集合和堆栈组成。另外，线程是进程中的一个实体，一个进程里面必然会有一个主线程，是被系统独立调度和分派的基本单位，线程自己不拥有系统资源，只拥有一点儿在运行中必不可少的资源，但它可与同属一个进程的其它线程共享进程所拥有的全部资源。

多线程，是指从软件或者硬件上实现多个线程并发执行的技术。具有多线程能力的计算机因有硬件支持而能够在同一时间执行多于一个线程，进而提升整体处理性能。

## python中多线程的操作

## 创建线程

# 1. 管理线程的模块: _thread, threading
# 2. _thread创建多线程: _thread.start_new_thread(线程需要执行的任务,(任务需要的参数, 元组数据类型 ))
# 3. threading创建多线程第一种方式:实例化一个对象(Thread)
t1 = threading.Thread(target=任务函数名, args=(x1,x2), name='threadName')
# 4. 启动线程: t.start()

"""
创建线程的模块
_thread模块
threading模块(常用)

"""

import threading

import time


def job():
    """Sleep for one second, then report on the current threading state.

    Prints a fixed status line, the number of threads that are currently
    alive, and the ``Thread`` object executing this call.
    """
    time.sleep(1)
    print("正在执行的任务")
    # Number of Thread objects that are alive right now.
    alive = threading.active_count()
    print("当前线程的个数:", alive)
    # The Thread object this call is running on.
    me = threading.current_thread()
    print("当前线程的信息:", me)

if __name__ == "__main__":
    # Run the task once on the main thread first.
    job()
    # Thread is a class: build one instance per worker and override only the
    # defaults we care about (args already defaults to an empty tuple).
    t1, t2 = (threading.Thread(target=job, name=label) for label in ("Job1", "Job2"))
    # start() launches each thread; t1 is started before t2.
    for worker in (t1, t2):
        worker.start()

## 多线程与join方法

- join方法: 在某个线程对象上调用join()时，调用者（这里是主线程）会被阻塞，直到该线程执行结束后才继续向下执行；已经启动的其它子线程不受影响，仍然并发运行。

import threading
import time


def readBook(name):
    """Wait one second, then print that the book *name* is being read."""
    time.sleep(1)
    message = "正在读%s" % name
    print(message)


def music(name):
    """Wait one second, then print that the song *name* is being sung."""
    time.sleep(1)
    message = "正在唱%s" % name
    print(message)


if __name__ == "__main__":
    # Thread names are meant to be strings. The original passed the function
    # objects themselves, which only yields a stringified function repr.
    t1 = threading.Thread(target=readBook, name="readBook", args=("python36",))
    t2 = threading.Thread(target=music, name="music", args=("双截棍",))
    t1.start()
    t2.start()
    # No join() here: the main thread does not wait for t1/t2, so the
    # timestamp below is printed while both workers are still sleeping.
    print(time.ctime())

使用join方法：

import threading
import time


def readBook(name):
    """Pause for one second and then announce the book *name* being read."""
    time.sleep(1)
    text = "正在读%s" % name
    print(text)


def music(name):
    """Pause for one second and then announce the song *name* being sung."""
    time.sleep(1)
    text = "正在唱%s" % name
    print(text)


if __name__ == "__main__":
    # Thread names are meant to be strings. The original passed the function
    # objects themselves, which only yields a stringified function repr.
    t1 = threading.Thread(target=readBook, name="readBook", args=("python36",))
    t2 = threading.Thread(target=music, name="music", args=("双截棍",))
    t1.start()
    t2.start()
    # join() blocks the main thread until the given worker has finished, so
    # the timestamp is printed only after both workers have produced output.
    t1.join()
    t2.join()
    print(time.ctime())

## 守护线程setDaemon

# 当主线程执行结束时，让还没有执行完的子线程随之强制结束: setDaemon(True)（新版本Python推荐直接设置线程的daemon属性）

import threading
import time

# 任务1：
def music(name):
    """Print a "listening to *name*" line twice, one second apart."""
    rounds = 2
    for _ in range(rounds):
        time.sleep(1)
        line = "正在听音乐%s" % (name)
        print(line)
# 任务2：
def code(name):
    """Print a "writing code *name*" line twice, two seconds apart."""
    rounds = 2
    for _ in range(rounds):
        time.sleep(2)
        line = "正在编写代码%s" % (name)
        print(line)

if __name__ == '__main__':
    start_time = time.time()
    # Sequential alternative, kept for comparison:
    # music("中国梦")
    # code("爬虫")

    t1 = threading.Thread(target=music, args=("中国梦",))
    t2 = threading.Thread(target=code, args=("爬虫", ))
    # Declare both workers as daemon threads: once the main thread finishes,
    # daemon threads are terminated along with it. The flag must be set
    # before start(). The ``daemon`` attribute is used because setDaemon()
    # is deprecated in current Python releases.
    t1.daemon = True
    t2.daemon = True
    t1.start()
    t2.start()

    # Elapsed time on the main thread only; it does not wait for t1/t2.
    print(time.time() - start_time)

由此可见：运行结果中不会显示子线程的信息。因为t1,t2均被设置为守护线程，主线程打印完耗时后立即结束，守护线程也随之被终止，来不及输出任何内容。

## 多线程应用下的批量管理主机

pass

等待后续补充内容

## 多线程下的获取IP地理位置

import json
import threading
from urllib.request import urlopen

import time


def job(ip, timeout=10):
    """Look up and print the country and city reported for *ip*.

    Sends a request to the ip.taobao.com lookup endpoint, parses the JSON
    reply and prints ``"<ip>:"`` followed by the country and city fields.

    Args:
        ip: IP address to look up, as a string.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the calling thread indefinitely.

    Raises:
        Any exception raised by ``urlopen``, ``json.loads`` or the key
        lookups propagates unchanged to the caller.
    """
    url = "http://ip.taobao.com/service/getIpInfo.php?ip=%s" % ip
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(url, timeout=timeout) as response:
        # Decode as UTF-8 so that Chinese text in the reply is readable.
        text = response.read().decode("utf-8")
    # The reply is a JSON object; the fields of interest live under 'data'.
    d = json.loads(text)['data']
    country = d['country']
    city = d['city']
    print("%s:" % ip, country, city)


def many_thread():
    """Look up six IPs with one thread per IP and print the elapsed time."""
    start_time = time.time()
    ips = ['172.25.254.40', '8.8.8.8'] * 3
    threads = []
    for ip in ips:
        worker = threading.Thread(target=job, args=(ip,))
        threads.append(worker)
        worker.start()

    # Wait for every lookup to finish before stopping the clock.
    for worker in threads:
        worker.join()
    elapsed = time.time() - start_time
    print("使用多线程的运行时间为%s" % elapsed)


def no_thread():
    """Look up the same six IPs one after another and print the elapsed time."""
    start_time = time.time()
    ips = ['172.25.254.40', '8.8.8.8'] * 3
    # Sequential baseline: each lookup starts only after the previous one ends.
    for address in ips:
        job(address)
    elapsed = time.time() - start_time
    print("未使用线程的运行时间为%s" % elapsed)


if __name__ == "__main__":
    # Run the threaded version first, then the sequential one, so the two
    # printed timings can be compared.
    many_thread()
    no_thread()

## 创建线程的第二种方法(继承)

## 类的继承来实现多线程

import threading


class Job(threading.Thread):
    """Thread subclass whose work is defined by overriding ``run``."""

    def __init__(self, jobname):
        """Initialise the thread and remember *jobname* on the instance."""
        # Let Thread set up its own state before adding ours.
        super().__init__()
        self.jobname = jobname

    def run(self):
        """Task body; this is what runs in the new thread after ``start()``."""
        print("this is a job")

# Build one Job thread and launch it; start() arranges for run() to be
# invoked in the new thread.
t1 = Job("new job")
t1.start()

## 利用类的继承实现多线程获取IP信息

import threading
import json
import time
from urllib.error import HTTPError
from urllib.request import urlopen


class IPthread(threading.Thread):
    """Thread that looks up and prints the location reported for one IP."""

    # Lookup endpoint; ``%s`` is replaced with the IP address. Kept as a
    # class attribute so a different endpoint can be substituted.
    api_url = "http://ip.taobao.com/service/getIpInfo.php?ip=%s"

    def __init__(self, jobname, ip, request_timeout=10):
        """Store the job label, the IP to look up and the request timeout.

        Args:
            jobname: Free-form label stored on the instance.
            ip: IP address to look up, as a string.
            request_timeout: Seconds to wait for the HTTP request, so that a
                stalled connection cannot block the thread indefinitely.
        """
        super(IPthread, self).__init__()
        self.jobname = jobname
        self.ip = ip
        self.request_timeout = request_timeout

    def run(self):
        """Fetch the lookup result for ``self.ip`` and print country and city.

        Network failures and unparsable replies are reported with a printed
        error line instead of an uncaught exception in the thread.
        """
        url = self.api_url % (self.ip)
        try:
            # The context manager closes the response even if read() fails.
            with urlopen(url, timeout=self.request_timeout) as response:
                raw = response.read()
        except OSError:
            # HTTPError and URLError both derive from OSError, as do socket
            # timeouts, so this covers every network-level failure.
            print("Error: %s获取地理位置网络错误" % (self.ip))
            return

        try:
            # Decode as UTF-8 so Chinese text in the reply is readable, then
            # pull the fields of interest out of the 'data' object.
            d = json.loads(raw.decode('utf-8'))["data"]
            country = d['country']
            city = d['city']
        except (ValueError, KeyError, TypeError):
            # Not JSON, not UTF-8, or missing the expected fields.
            print("Error: %s返回的数据无法解析" % (self.ip))
            return

        print("%s" % (self.ip), country, city)


def use_thread():
    """Start one IPthread per address, wait for all of them, print the time."""
    start_time = time.time()
    ips = ['172.25.254.40', '8.8.8.8'] * 3
    threads = []
    for address in ips:
        worker = IPthread(jobname="爬虫", ip=address)
        threads.append(worker)
        worker.start()

    # Block until every lookup thread has finished.
    for worker in threads:
        worker.join()
    elapsed = time.time() - start_time
    print("运行时间为%s" % elapsed)


if __name__ == "__main__":
    # Script entry point: run the threaded IP lookup demo.
    use_thread()

## 线程同步之线程锁

# 多个线程对同一个数据进行修改时，可能出现不可预料的情况：由于多个线程并发执行，对共享数据的读写会互相交错，导致结果出错。所以我们在执行时需要加入线程锁，保证同一时刻只有一个线程修改数据，使不同线程之间不会产生数据干扰

import threading

def add(lock, times=10000000):
    """Increment the module-level ``money`` counter *times* times under *lock*.

    Args:
        lock: Lock guarding ``money``; it is held for the whole loop.
        times: Number of increments to perform (defaults to the original
            hard-coded ten million).
    """
    global money
    # ``with`` releases the lock even if the update raises; the original
    # acquire()/release() pair would leave the lock held forever in that case.
    with lock:
        for _ in range(times):
            money += 1

def reduce(lock, times=10000000):
    """Decrement the module-level ``money`` counter *times* times under *lock*.

    Args:
        lock: Lock guarding ``money``; it is held for the whole loop.
        times: Number of decrements to perform (defaults to the original
            hard-coded ten million).
    """
    global money
    # ``with`` releases the lock even if the update raises; the original
    # acquire()/release() pair would leave the lock held forever in that case.
    with lock:
        for _ in range(times):
            money -= 1

if __name__ == '__main__':
    # Shared counter modified by both worker threads.
    money = 0
    # One lock instance is handed to both workers.
    lock = threading.Lock()
    t1 = threading.Thread(target=add, args=(lock, ))
    t2 = threading.Thread(target=reduce, args=(lock, ))
    for worker in (t1, t2):
        worker.start()
    # Wait for both workers before reading the final value.
    for worker in (t1, t2):
        worker.join()

    print("最终金额：%s" % money)

运行结果：
最终金额：0 # 不会产生数据干扰导致错误

## GIL全局解释器锁的概念

# python使用多线程，是个好主意么? 为什么?
    - GIL(全局解释器锁)
    - python解释器默认每次只允许一个线程执行
    执行过程:
    1). 设置GIL
    2). 切换到线程去运行对应的任务;
    3). 运行中切换线程条件
        - 执行完了
        - time.sleep()
        - 获取其他信息才能继续执行, eg: 从网络上获取网页信息等;
    4). 把线程设置为睡眠状态
    5). 解锁GIL
    6). 再次重复执行上述内容;
# python解释器: CPython解释器, Jython解释器, PyPy解释器

# 方法的选择：
Python并不支持真正意义上的多线程。Python中提供了多线程包，但是如果你想通过多线程提高代码的速度，
使用多线程包并不是个好主意。Python中有一个被称为Global Interpreter Lock（GIL）的东西，
它会确保任何时候你的多个线程中，只有一个被执行。线程的执行速度非常之快，会让你误以为线程是并行执行的，
但是实际上都是轮流执行。经过GIL这一道关卡处理，会增加执行的开销。这意味着，如果你想提高代码的运行速度，
使用threading包并不是一个很好的方法。
# I/O密集型操作: 多线程操作
# CPU/计算密集型：多进程操作

import threading

from 多进程与多线程._timeit import mytime


def job(li):
    """CPU-bound dummy task: add up *li* and throw the total away."""
    _ = sum(li)

@mytime
def use_thread():
    """Run the summing job on five threads and wait for all of them.

    The threads are joined before returning so that whatever ``mytime``
    measures around this call includes the work itself; the original returned
    right after start(), i.e. possibly before any job had finished.
    """
    li = range(1, 10000)
    threads = []
    for _ in range(5):
        t = threading.Thread(target=job, args=(li, ))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
@mytime
def use_no_thread():
    """Run the summing job five times in a row on the calling thread."""
    li = range(1, 10000)
    repeats = 5
    for _ in range(repeats):
        job(li)


if __name__ == "__main__":
    # Run the threaded and the sequential variant back to back; both are
    # wrapped by the mytime decorator.
    use_thread()
    use_no_thread()

## 队列与多线程

# 1). 理论上多线程执行任务，会产生一些数据，为其他程序执行作铺垫;
# 2). 多线程是不能返回任务执行结果的，因此需要一个容器来存储多线程产生的数据
# 3). 这个容器如何选择? list(栈，队列), tuple(x), set(x), dict(x)，此处选择队列来实现

import threading
from queue import Queue

from 多进程与多线程._timeit import mytime


def job(l, queue):
    """Sum the numbers in *l* and publish the total on *queue*."""
    total = sum(l)
    # A thread target cannot return a value, so the result goes to the queue.
    queue.put(total)


@mytime
def use_thread():
    """Sum four lists on four threads and print the totals from the queue."""
    # Queue that receives one result per worker thread.
    q = Queue()
    li = [[1, 5, 7, 3, 6, 2], [5, 23, 4, 6], [7, 8, 93, 2], [1, 2, 3, 4]]
    threads = []
    for numbers in li:
        worker = threading.Thread(target=job, args=(numbers, q))
        threads.append(worker)
        worker.start()

    for worker in threads:
        worker.join()
    # Every worker has finished, so exactly one result per list is waiting.
    result = [q.get() for _ in li]
    print(result)


if __name__ == "__main__":
    # Script entry point: run the queue-based threading demo.
    use_thread()

## 多线程方式实现生产者与消费者模型

# 需求：给定200个ip地址，可能开放端口为80， 443， 7001， 7002， 8000， 8080
以http://ip:port形式访问页面以判断是否正常访问.

1). 构建所有的url地址；===存储到一个数据结构中
2). 依次判断url地址是否可以成功访问

import threading
from queue import Queue
from  urllib.request import urlopen


def create_data():
    """(Re)create ips.txt with the 200 addresses 172.25.254.1 to .200."""
    lines = ("172.25.254.%d\n" % host for host in range(1, 201))
    with open("ips.txt", "w") as f:
        f.writelines(lines)

# Runs at module level (no __main__ guard): writes ips.txt, the file that
# create_url() and Producer.run() open.
create_data()
def create_url():
    """Build every ``http://ip:port`` combination for the IPs in ips.txt.

    Returns:
        A list of URL strings, ordered by IP and then by port. Blank lines in
        the file are skipped so they cannot produce URLs such as
        ``http://:80``.
    """
    portlist = [80, 443, 7001, 7002, 8000, 8080]
    with open("ips.txt") as f:
        ips = [line.strip() for line in f]
    urls = ["http://%s:%s" % (ip, port) for ip in ips if ip for port in portlist]
    return urls


class Producer(threading.Thread):
    """Thread that fills a queue with ``http://ip:port`` URLs from ips.txt."""

    def __init__(self, queue):
        """Remember the queue that produced URLs are pushed onto."""
        super().__init__()
        self.queue = queue

    def run(self):
        """Read every IP from ips.txt and enqueue one URL per IP/port pair."""
        ports = [80, 443, 7001, 7002, 8000, 8080]
        with open("ips.txt") as f:
            addresses = [line.strip() for line in f]
        # Each URL is handed to the queue as soon as it has been built.
        candidates = ("http://%s:%s" % (address, port)
                      for address in addresses
                      for port in ports)
        for candidate in candidates:
            self.queue.put(candidate)


class Consumer(threading.Thread):
    """Worker thread that probes URLs taken from a shared queue.

    The original implementation handled exactly one URL per thread, so a
    fixed pool of consumers left most of the queue unprocessed. This version
    keeps pulling URLs until the queue has stayed empty for ``idle_timeout``
    seconds.
    """

    def __init__(self, queue, idle_timeout=1.0, request_timeout=5.0):
        """Store the queue and the two timeouts.

        Args:
            queue: Queue of URL strings to probe.
            idle_timeout: Seconds to wait for a new URL before the thread
                decides the work is done and exits.
            request_timeout: Seconds to wait for each HTTP request, so one
                unreachable host cannot block the thread indefinitely.
        """
        super(Consumer, self).__init__()
        self.queue = queue
        self.idle_timeout = idle_timeout
        self.request_timeout = request_timeout

    def run(self):
        """Probe queued URLs and print whether each one could be opened."""
        # Imported locally: the surrounding script only imports ``Queue``.
        from queue import Empty

        while True:
            # Fetching the URL happens outside the request's try block so the
            # error message below can never refer to an unbound ``url``.
            try:
                url = self.queue.get(timeout=self.idle_timeout)
            except Empty:
                return
            try:
                response = urlopen(url, timeout=self.request_timeout)
            except Exception:
                # Any failure (bad URL, refused connection, timeout, HTTP
                # error) counts as "not reachable" for this check.
                print("%s unknown url" % url)
            else:
                response.close()
                print("%s is ok" % url)


if __name__ == "__main__":
    # Shared FIFO channel between the producer and the consumers.
    queue = Queue()
    # A single producer thread fills the queue with candidate URLs.
    p = Producer(queue)
    p.start()
    # Thirty consumer threads are created and started one after another.
    consumer_total = 30
    for _ in range(consumer_total):
        c = Consumer(queue)
        c.start()

## ThreadPool 线程池

# 注意: python3.2版本以后才可以使用;
from concurrent.futures import ThreadPoolExecutor

from concurrent.futures import ThreadPoolExecutor

import time

"""
ThreadPoolExecutor类的部分源码：
class ThreadPoolExecutor(_base.Executor):

    # Used to assign unique thread names when thread_name_prefix is not supplied.
    _counter = itertools.count().__next__

    def __init__(self, max_workers=None, thread_name_prefix=''):
     ...

    def submit(self, fn, *args, **kwargs):
     ...
"""


def Job():
    """Print a marker line and return the string ``"hello"``."""
    outcome = "hello"
    print("this is a job")
    return outcome


if __name__ == "__main__":
    # A pool of up to 10 worker threads. The ``with`` block shuts the
    # executor down on exit; the original never released it.
    with ThreadPoolExecutor(max_workers=10) as pool:
        # submit() schedules the callable and returns a Future for its result.
        f1 = pool.submit(Job)
        f2 = pool.submit(Job)
        # done() reports whether the task has finished at this moment.
        print(f1.done())
        time.sleep(1)
        print(f2.done())
        # result() blocks until the task has finished and returns its value.
        print(f1.result())
        print(f2.result())

## 线程池与map函数

from urllib.error import HTTPError
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
import time
# Pages fetched by the demo: three URLs repeated ten times (30 entries).
URLS = ['http://httpbin.org', 'http://example.com/',
        'https://api.github.com/'] * 10
def get_page(url, timeout=3):
    """Download *url* and report how many bytes its body contains.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the request. The original accepted this
            parameter but never passed it to ``urlopen``.

    Returns:
        ``{'url': url, 'len': n}`` where ``n`` is the body length in bytes,
        or 0 when the request fails at the network or HTTP level.
    """
    try:
        # The context manager closes the response even if read() fails.
        with urlopen(url, timeout=timeout) as response:
            content = response.read()
    except OSError:
        # HTTPError and URLError both derive from OSError, as do socket
        # timeouts, so unreachable hosts no longer escape as exceptions.
        return {'url': url, 'len': 0}
    return {'url': url, 'len': len(content)}


# 方法1： submit提交任务
# start_time = time.time()
# pool = ThreadPoolExecutor(max_workers=20)
# futuresObj = [pool.submit(get_page, url) for url in URLS]
#
# # 注意: 传递的是包含future对象的序列, as_completed返回已经执行完任务的future对象，
# # 直到所有的future对应的任务执行完成， 循环结束;
# # for finish_fs in as_completed(futuresObj):
# #     print(finish_fs.result() )
#
# for future in futuresObj:
#     print(future.result())
#
# print("执行时间:%s" %(time.time()-start_time))
#

# 方法2：通过map方式执行
# map() runs get_page over URLS on up to 20 worker threads and yields the
# results in input order. The ``with`` block shuts the executor down once the
# loop is finished; the original never released it.
with ThreadPoolExecutor(max_workers=20) as pool:
    for res in pool.map(get_page, URLS):
        print(res)

## 应用练习

需求：使用生产者消费者模型，多线程爬取指定个url网页信息，并多线程写入mysql数据库中;
要求如下:
    - 理论上url地址信息通过其他程序获取, 此处从一个文件urls.txt中读取；
    - 如果url地址访问不成功， bytesCount存储为0；
    - 数据库存储的表头要求： id(唯一标识码)， url, bytesCount
    - 获取url中字节数最多的10个url(自行查找相关的SQL语句);

# 附加：对比多进程和多线程的速度;

部分代码如下：

# 创建一些指定的网页信息
import threading
from queue import Queue
from urllib.request import urlopen
import pymysql


def create_web():
    """Append the demo host list (four fixed hosts plus six IPs) to urls.txt."""
    fixed_hosts = ["www.taobao.com\n", "www.baidu.com\n", "www.qq.com\n", "172.25.254.40\n"]
    numbered_hosts = ["172.25.254.%d\n" % last for last in range(1, 7)]
    # "a+" appends: calling this again adds the same entries a second time.
    with open("urls.txt", "a+") as f:
        f.writelines(fixed_hosts + numbered_hosts)


# 创建生产者类
class Producer(threading.Thread):
    """Producer thread: turns the hosts in urls.txt into queued URLs."""

    def __init__(self, queue):
        """Remember the queue that produced URLs are pushed onto."""
        super().__init__()
        self.queue = queue

    def run(self):
        """Read every host from urls.txt and enqueue ``http://host:80``."""
        ports = [80]
        with open("urls.txt") as f:
            hosts = [line.strip() for line in f]
        pending = ("http://%s:%s" % (host, port) for host in hosts for port in ports)
        for target in pending:
            self.queue.put(target)


class Consumer(threading.Thread):
    """Consumer thread: fetches queued URLs and records their body sizes.

    The original handled a single URL per thread and only toggled
    ``bytesCount`` between 1 and 0. This version drains the queue and stores
    the real number of bytes read, using 0 for URLs that cannot be fetched.
    """

    def __init__(self, queue, idle_timeout=1.0, request_timeout=5.0):
        """Store the queue, the timeouts and empty result state.

        Args:
            queue: Queue of URL strings to fetch.
            idle_timeout: Seconds to wait for a new URL before the thread
                decides the work is done and exits.
            request_timeout: Seconds to wait for each HTTP request.
        """
        super(Consumer, self).__init__()
        self.queue = queue
        self.idle_timeout = idle_timeout
        self.request_timeout = request_timeout
        # Byte count of the most recently processed URL (0 = failed / none).
        self.bytesCount = 0
        # One (url, bytesCount) pair per processed URL, in processing order.
        self.results = []

    def run(self):
        """Fetch every queued URL; return the byte count of the last one."""
        # Imported locally: the surrounding script only imports ``Queue``.
        from queue import Empty

        while True:
            # Taken outside the request's try block so the messages below can
            # never refer to an unbound ``url``.
            try:
                url = self.queue.get(timeout=self.idle_timeout)
            except Empty:
                return self.bytesCount
            try:
                # The context manager closes the response after reading.
                with urlopen(url, timeout=self.request_timeout) as response:
                    self.bytesCount = len(response.read())
            except Exception:
                # Any failure is recorded as a size of 0.
                print("%s is error" % url)
                self.bytesCount = 0
            else:
                print("%s is ok" % url)
            self.results.append((url, self.bytesCount))


if __name__ == "__main__":
    # Queue shared by the producer and all consumers.
    queue = Queue()
    create_web()
    # The producer runs to completion first, so the queue is fully populated
    # before any consumer starts.
    p = Producer(queue)
    p.start()
    p.join()
    # Ten consumer threads.
    threads = []
    for _ in range(10):
        t = Consumer(queue)
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

## 完整版

"""
使用生产者消费者模型， 多线程爬取指定个url网页信息，并多线程写入mysql数据库中;
要求如下:
    - 理论上url地址信息通过其他程序获取, 此处从一个文件urls.txt中读取；
    - 如果url地址访问不成功， bytesCount存储为0；
    - 数据库存储的表头要求： id(唯一标识码)， url,  bytesCount
    - 获取url中字节数最多的10个url(自行查找相关的SQL语句);

    # 附加： 对比多进程和多线程的速度;
"""
import threading
from threading import Thread
from urllib.request import urlopen
from queue import Queue
import pymysql


# def create_ip():
#     with open('urls.txt', 'a') as f:
#         f.write('www.baidu.com\n' 'www.taobao.com\n' 'www.163.com\n' 'www.xunlei.com\n' 'www.qq.com\n')
#         for i in range(5):
#             f.write("172.25.254.%d\n" % (i + 1))


# create_ip()
class Producer(threading.Thread):
    """Producer thread: downloads each URL and queues ``(url, bytesCount)``."""

    def __init__(self, queue, request_timeout=5):
        """Store the output queue and the per-request timeout.

        Args:
            queue: Queue that receives one ``(url, bytesCount)`` tuple per URL.
            request_timeout: Seconds to wait for each HTTP request, so one
                unreachable host cannot block the thread indefinitely.
        """
        super(Producer, self).__init__()
        self.queue = queue
        self.request_timeout = request_timeout

    def run(self):
        """Fetch every host listed in urls.txt and enqueue its body size.

        A URL that cannot be fetched is queued with a size of 0. The
        try/except sits inside the loop: the original wrapped the whole loop,
        so the first failing URL silently dropped every URL after it.
        """
        port = [80]
        with open('urls.txt') as f:
            hosts = [line.strip() for line in f]
        # Blank lines are skipped so they cannot become "http://:80".
        urls = ['http://%s:%s' % (host, i) for host in hosts if host for i in port]
        for url in urls:
            print("%s正在获取网页内容" % url)
            try:
                # The context manager closes the response after reading.
                with urlopen(url, timeout=self.request_timeout) as response:
                    bytesCount = len(response.read())
            except Exception:
                bytesCount = 0
            self.queue.put((url, bytesCount))


class Consumer(threading.Thread):
    """Consumer thread: writes queued ``(url, bytesCount)`` rows to the DB."""

    # All Consumer instances are handed the same cursor in the script below,
    # so statement execution is serialised through one class-wide lock.
    _db_lock = threading.Lock()

    def __init__(self, queue, conn, cur, idle_timeout=10):
        """Store the queue, the database handles and the idle timeout.

        Args:
            queue: Queue yielding ``(url, bytesCount)`` tuples.
            conn: Database connection (kept on the instance, not used here).
            cur: Cursor whose ``execute(sql, args)`` performs the insert.
            idle_timeout: Seconds to wait for a new item before the thread
                assumes the producers are finished and exits. The original
                looped forever, so join() on a consumer never returned.
        """
        super(Consumer, self).__init__()
        self.queue = queue
        self.conn = conn
        self.cur = cur
        self.idle_timeout = idle_timeout

    def run(self):
        """Insert every queued row until the queue stays empty."""
        # Imported locally: the surrounding script only imports ``Queue``.
        from queue import Empty

        # Placeholders are filled in by the driver. The original formatted the
        # values into the SQL text itself, which left the URL unquoted
        # (a syntax error) and open to SQL injection.
        insert_data = 'insert into URL(url, bytesCount) VALUES(%s, %s);'
        while True:
            try:
                url, bytesCount = self.queue.get(timeout=self.idle_timeout)
            except Empty:
                return
            print("获取列队中的内容", url, bytesCount)
            try:
                print("正在向数据库写入数据")
                with self._db_lock:
                    self.cur.execute(insert_data, (url, bytesCount))
            except Exception:
                print("插入%s %d失败" % (url, bytesCount))
            else:
                print("插入%s %d成功" % (url, bytesCount))


if __name__ == "__main__":
    # Connect to the database.
    # NOTE(review): host and credentials are hard-coded for the demo; move
    # them to configuration before using this anywhere else.
    conn = pymysql.connect(host='localhost', user='root', password='redhat', db='houzeyu', charset='utf8',
                           autocommit=True)
    # Create the cursor that is shared by all consumers.
    cur = conn.cursor()
    print("正在连接数据库")
    try:
        try:
            print("正在创建数据库表")
            create_sql = 'create table URL(ID int PRIMARY KEY auto_increment, url varchar(50), bytesCount INT);'
            cur.execute(create_sql)
        except pymysql.MySQLError:
            # The script treats a failing CREATE TABLE as "table already there".
            print("数据库表已经存在")
        else:
            print("数据库表创建成功!")

        queue = Queue()
        # One producer only: every producer reads the whole of urls.txt, so
        # the original pair of producers queued each URL twice.
        p1 = Producer(queue)
        p1.start()

        # The original wrote ``for url in 'urls.txt'``, which loops over the
        # eight characters of that string literal rather than over the file.
        # The resulting thread count is kept, but stated explicitly.
        consumer_count = 8
        threads = []
        for _ in range(consumer_count):
            c = Consumer(queue, conn, cur)
            threads.append(c)
            c.start()

        p1.join()
        # Returns only once each consumer's run() has finished.
        for thread in threads:
            thread.join()
        print("执行结束")
    finally:
        # Release the database resources even if something above failed.
        cur.close()
        conn.close()

#############################

houzeyu666

关注

9
点赞
踩
38

收藏

觉得还不错? 一键收藏
4
评论
Python下的多线程处理

########多线程######### ## 线程的概念与多线程的引入一个标准的线程由线程ID，当前指令指针(PC），寄存器集合和堆栈组成。另外，线程是进程中的一个实体，一个进程里面必然会有一个主线程，是被系统独立调度和分派的基本单位，线程自己不拥有系统资源，只拥有一点儿在运行中必不可少的资源，但它可与同属一个进程的其它线程共享进程所拥有的全部资源...
复制链接

扫一扫