#2: The Road to Advanced Python Crawlers --- Processes and Threads

Multiprocessing

os.fork() from the os module only works on Unix/Linux; the multiprocessing module is cross-platform.

  1. With fork(), one call returns twice: in the parent process it returns the new child's pid, and in the child it returns 0. os.getpid() gives the current process's pid, and os.getppid() gives the parent's pid.
    import os

    if __name__ == '__main__':
        print("current process is {}".format(os.getpid()))
        pid = os.fork()          # returns twice: the child's pid in the parent, 0 in the child
        if pid < 0:
            print("an error occurred in fork")
        elif pid == 0:
            print("parent {} created me, pid {}".format(os.getppid(), os.getpid()))
        else:
            print("current pid {} created a new child pid {}".format(os.getpid(), pid))

     

  2. Use the Process class: p = Process(target=func, args=(...,)). p.start() launches the child process, and p.join() blocks the main process until that child finishes.
    from multiprocessing import Process

    def fun1(name):
        print('testing %s multiprocessing' % name)

    if __name__ == '__main__':
        process_list = []
        for i in range(5):                              # start 5 child processes running fun1
            p = Process(target=fun1, args=('Python',))  # instantiate a Process object
            p.start()
            process_list.append(p)

        for p in process_list:                          # wait for every child to finish
            p.join()

        print('test finished')

     

  3. multiprocessing.Pool represents a pool of worker processes and is meant for running many tasks: while the pool has an idle worker a new task starts immediately; once the pool is full, new tasks wait for a worker to free up. A sketch of collecting return values follows the example below.
    import os
    import time
    from multiprocessing import Pool

    def task(name):
        print("task {} pid: {} is running...".format(name, os.getpid()))
        time.sleep(2)
        print("task {} ends.".format(name))

    if __name__ == '__main__':
        print("current pid {}".format(os.getpid()))
        p = Pool(processes=3)                 # at most 3 workers run at the same time
        for i in range(5):
            p.apply_async(task, args=(i,))    # submit tasks without blocking
        print("waiting for subprocesses to finish...")
        p.close()                             # no more tasks may be submitted
        p.join()                              # wait for all submitted tasks to finish
        print("END")

     

  4. Inter-process communication usually goes through a Queue or a Pipe. A Queue can be shared by several processes: put() inserts data, and with block=True it waits up to timeout before raising queue.Full (with block=False it raises immediately when the queue is full); get() retrieves data the same way, raising queue.Empty on timeout.
    import os
    import time
    from multiprocessing import Process, Queue

    def proc_send(q, stris):
        for s in stris:
            q.put(s, block=True, timeout=4)   # wait up to 4s if the queue is full
            print("PID: {} send {} to Queue... time: {}".format(os.getpid(), s, time.time()))
            # time.sleep(random.random())

    def proc_recv(q):
        print("PID: {} is reading... time: {}".format(os.getpid(), time.time()))
        while True:
            s = q.get(block=True, timeout=2)  # wait up to 2s if the queue is empty
            print("PID: {} got {} from Queue... time: {}".format(os.getpid(), s, time.time()))

    if __name__ == "__main__":
        q = Queue(maxsize=16)
        proc1 = Process(target=proc_send, args=(q, ["a", "b", "c", "d"],))
        proc2 = Process(target=proc_send, args=(q, ["1", "2", "3", "4"],))
        proc3 = Process(target=proc_recv, args=(q,))

        proc1.start()
        proc2.start()
        proc3.start()

        proc1.join()
        proc2.join()

        proc3.terminate()   # the receiver loops forever, so stop it once the senders are done
    
    # Pipe version
    
    import os
    import time
    import random
    from multiprocessing import Process, Pipe

    def send(p, strs):
        for s in strs:
            p.send(s)
            print("PID: {} send {} to Pipe... time: {}".format(os.getpid(), s, time.time()))
            time.sleep(random.random())

    def recv(p):
        print("Receiving...")
        while True:
            s = p.recv()
            print("PID: {} got {} from Pipe... time: {}".format(os.getpid(), s, time.time()))

    if __name__ == "__main__":
        p = Pipe(duplex=False)   # p[0] is the receive end, p[1] is the send end
        proc1 = Process(target=send, args=(p[1], ["test" + str(i) for i in range(10)],))
        proc2 = Process(target=recv, args=(p[0],))

        proc1.start()
        proc2.start()

        proc1.join()
        proc2.terminate()   # the receiver loops forever, so stop it once the sender is done
    
    

     

Multithreading

Python provides the low-level thread module (_thread in Python 3) and the higher-level threading module.

1. Two ways to create threads: pass a function as target when constructing a threading.Thread instance and call start(), or subclass threading.Thread and override __init__ and run().

# approach 1: create a Thread with a target function

import threading
import time
import random

def thread_run(vals):
    print("thread {} is running...".format(threading.current_thread().name))
    for val in vals:
        print("thread {} : val {}".format(threading.current_thread().name, val))
        time.sleep(random.random())
    print("thread {} ends...".format(threading.current_thread().name))

print("{} is running...".format(threading.current_thread().name))
t1 = threading.Thread(target=thread_run, name="thread_1", args=(['a', 'b', 'c'],))
t2 = threading.Thread(target=thread_run, name="thread_2", args=(['1', '2', '3', '4'],))

t1.start()
t2.start()

t1.join()
t2.join()
print("{} ends...".format(threading.current_thread().name))

# approach 2: subclass threading.Thread and override __init__ and run
class myThread(threading.Thread):
    def __init__(self, vals, name=None):
        threading.Thread.__init__(self, name=name)
        self.vals = vals

    def run(self):
        print("thread {} is running...".format(threading.current_thread().name))
        print("copy: {}".format(self.name))
        for val in self.vals:
            print("thread {} : val {}".format(threading.current_thread().name, val))
            time.sleep(random.random())

print("{} is running...".format(threading.current_thread().name))
t1 = myThread(name="thread_1", vals=['a', 'b', 'c'])
t2 = myThread(name="thread_2", vals=['1', '2', '3', '4'])

t1.start()
t2.start()

t1.join()
t2.join()
print("{} ends...".format(threading.current_thread().name))

2. Thread synchronization

Lock and RLock both provide acquire() and release(). A Lock can be acquired only once and must be released before it can be acquired again; an RLock adds a recursion counter, so the thread that already holds it can acquire it several times and then release it the same number of times (see the small sketch after the example below).

import threading
import time
import random

num = 0
mylock = threading.RLock()

class myThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global num
        while True:
            time.sleep(random.random())
            mylock.acquire()
            print("{} locked number : {}".format(threading.current_thread().name, num))
            if num >= 4:
                # mylock.acquire()   # an RLock could be re-acquired here by the same thread
                # num += 7
                # mylock.release()
                mylock.release()
                print("{} released number : {}".format(threading.current_thread().name, num))
                break
            num += 1
            print("{} released number : {}".format(threading.current_thread().name, num))
            mylock.release()

print("{} is running...".format(threading.current_thread().name))
t1 = myThread()
t2 = myThread()

t1.start()
t2.start()

t1.join()
t2.join()
print("{} end...".format(threading.current_thread().name))

Coroutines

Not covered in detail here; the gevent library is commonly used, and a minimal sketch is shown below.
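
A minimal gevent sketch, assuming gevent is installed; the task names and sleep times are made up for illustration:

import gevent

def fetch(name, delay):
    print("greenlet {} start".format(name))
    gevent.sleep(delay)        # cooperative sleep: yields control to other greenlets
    print("greenlet {} done".format(name))

# spawn greenlets and wait for all of them to finish
jobs = [gevent.spawn(fetch, "url_" + str(i), 0.5) for i in range(3)]
gevent.joinall(jobs)
# for real blocking I/O (sockets, etc.) gevent.monkey.patch_all() is the usual first step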

Distributed processes

The managers submodule of multiprocessing can distribute work across several machines; in essence the processes still communicate through a queue, with a local queue exposed over the network.

# taskManager: run this on the master machine first; workers connect to port 8001
import time
from multiprocessing import Queue
from multiprocessing.managers import BaseManager


task_queue = Queue()
result_queue = Queue()

class QueueManager(BaseManager):
    pass

# expose the two local queues over the network under these names
QueueManager.register('get_task_queue', callable=lambda: task_queue)
QueueManager.register('get_result_queue', callable=lambda: result_queue)

manager = QueueManager(address=('', 8001), authkey=b'lrz')

manager.start()

task = manager.get_task_queue()
result = manager.get_result_queue()

for url in ['THISURL_' + str(i) for i in range(10)]:
    print("put task {}...".format(url))
    task.put(url)

print("try to get results...")
for i in range(10):
    res = result.get(timeout=5)
    print("got result {}...".format(res))

manager.shutdown()


# taskWorker: run on one or more worker machines after taskManager has started

import time
from multiprocessing.managers import BaseManager

class QueueManager(BaseManager):
    pass

# register by name only; the actual queues live on the manager's side
QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')

server_address = '127.0.0.1'   # replace with the manager machine's IP when distributed
print("Connecting to the server {}...".format(server_address))
manager = QueueManager(address=(server_address, 8001), authkey=b'lrz')

manager.connect()

task = manager.get_task_queue()
result = manager.get_result_queue()

while not task.empty():
    url = task.get()
    print("run task: download {}...".format(url))
    time.sleep(1)                                # pretend to do some work
    result.put("{} success ...".format(url))

print("worker exit.")


# for Linux

 
