#2: The Road to Advanced Python Crawlers --- Processes and Threads

Multiprocessing

os.fork() from the os module only works on Unix/Linux; the multiprocessing module is cross-platform.

  1. With fork(), one call returns twice: in the parent process it returns the new child's pid, and in the child it returns 0. os.getpid() gives the current process's pid, and os.getppid() gives the parent's pid.
    import os

    if __name__ == '__main__':
        print("current process is {}".format(os.getpid()))
        pid = os.fork()          # returns twice: the child's pid in the parent, 0 in the child
        if pid < 0:
            print("an error occurred in fork")
        elif pid == 0:
            print("parent {} created me, pid {}".format(os.getppid(), os.getpid()))
        else:
            print("current pid {} created a new child pid {}".format(os.getpid(), pid))

     

  2. Use the Process class: p = Process(target=func, args=(...,)). p.start() launches the child process, and p.join() blocks the main process until that child finishes.
    from multiprocessing import Process

    def fun1(name):
        print('testing %s multiprocessing' % name)

    if __name__ == '__main__':
        process_list = []
        for i in range(5):                              # start 5 child processes running fun1
            p = Process(target=fun1, args=('Python',))  # instantiate a Process object
            p.start()
            process_list.append(p)

        for p in process_list:                          # wait for every child to finish
            p.join()

        print('test finished')

     

  3. multiprocessing.Pool represents a pool of worker processes and is meant for running many tasks: while the pool has an idle worker a new task starts immediately; once the pool is full, new tasks wait for a worker to free up. A sketch of collecting return values follows the example below.
    import os
    import time
    from multiprocessing import Pool

    def task(name):
        print("task {} pid: {} is running...".format(name, os.getpid()))
        time.sleep(2)
        print("task {} ends.".format(name))

    if __name__ == '__main__':
        print("current pid {}".format(os.getpid()))
        p = Pool(processes=3)                 # at most 3 workers run at the same time
        for i in range(5):
            p.apply_async(task, args=(i,))    # submit tasks without blocking
        print("waiting for subprocesses to finish...")
        p.close()                             # no more tasks may be submitted
        p.join()                              # wait for all submitted tasks to finish
        print("END")

     

  4. Inter-process communication usually goes through a Queue or a Pipe. A Queue can be shared by several processes: put() inserts data, and with block=True it waits up to timeout before raising queue.Full (with block=False it raises immediately when the queue is full); get() retrieves data the same way, raising queue.Empty on timeout.
    import os
    import time
    from multiprocessing import Process, Queue

    def proc_send(q, stris):
        for s in stris:
            q.put(s, block=True, timeout=4)   # wait up to 4s if the queue is full
            print("PID: {} send {} to Queue... time: {}".format(os.getpid(), s, time.time()))
            # time.sleep(random.random())

    def proc_recv(q):
        print("PID: {} is reading... time: {}".format(os.getpid(), time.time()))
        while True:
            s = q.get(block=True, timeout=2)  # wait up to 2s if the queue is empty
            print("PID: {} got {} from Queue... time: {}".format(os.getpid(), s, time.time()))

    if __name__ == "__main__":
        q = Queue(maxsize=16)
        proc1 = Process(target=proc_send, args=(q, ["a", "b", "c", "d"],))
        proc2 = Process(target=proc_send, args=(q, ["1", "2", "3", "4"],))
        proc3 = Process(target=proc_recv, args=(q,))

        proc1.start()
        proc2.start()
        proc3.start()

        proc1.join()
        proc2.join()

        proc3.terminate()   # the receiver loops forever, so stop it once the senders are done
    
    # Pipe version
    
    import os
    import time
    import random
    from multiprocessing import Process, Pipe

    def send(p, strs):
        for s in strs:
            p.send(s)
            print("PID: {} send {} to Pipe... time: {}".format(os.getpid(), s, time.time()))
            time.sleep(random.random())

    def recv(p):
        print("Receiving...")
        while True:
            s = p.recv()
            print("PID: {} got {} from Pipe... time: {}".format(os.getpid(), s, time.time()))

    if __name__ == "__main__":
        p = Pipe(duplex=False)   # p[0] is the receive end, p[1] is the send end
        proc1 = Process(target=send, args=(p[1], ["test" + str(i) for i in range(10)],))
        proc2 = Process(target=recv, args=(p[0],))

        proc1.start()
        proc2.start()

        proc1.join()
        proc2.terminate()   # the receiver loops forever, so stop it once the sender is done
    
    

     

Multithreading

Python provides the low-level thread module (_thread in Python 3) and the higher-level threading module.

1. Two ways to create threads: pass a function as target when constructing a threading.Thread instance and call start(), or subclass threading.Thread and override __init__ and run().

# approach 1: create a Thread with a target function

import threading
import time
import random

def thread_run(vals):
    print("thread {} is running...".format(threading.current_thread().name))
    for val in vals:
        print("thread {} : val {}".format(threading.current_thread().name, val))
        time.sleep(random.random())
    print("thread {} ends...".format(threading.current_thread().name))

print("{} is running...".format(threading.current_thread().name))
t1 = threading.Thread(target=thread_run, name="thread_1", args=(['a', 'b', 'c'],))
t2 = threading.Thread(target=thread_run, name="thread_2", args=(['1', '2', '3', '4'],))

t1.start()
t2.start()

t1.join()
t2.join()
print("{} ends...".format(threading.current_thread().name))

# approach 2: subclass threading.Thread and override __init__ and run
class myThread(threading.Thread):
    def __init__(self, vals, name=None):
        threading.Thread.__init__(self, name=name)
        self.vals = vals

    def run(self):
        print("thread {} is running...".format(threading.current_thread().name))
        print("copy: {}".format(self.name))
        for val in self.vals:
            print("thread {} : val {}".format(threading.current_thread().name, val))
            time.sleep(random.random())

print("{} is running...".format(threading.current_thread().name))
t1 = myThread(name="thread_1", vals=['a', 'b', 'c'])
t2 = myThread(name="thread_2", vals=['1', '2', '3', '4'])

t1.start()
t2.start()

t1.join()
t2.join()
print("{} ends...".format(threading.current_thread().name))

2. Thread synchronization

Lock and RLock both provide acquire() and release(). A Lock can be acquired only once and must be released before it can be acquired again; an RLock adds a recursion counter, so the thread that already holds it can acquire it several times and then release it the same number of times (see the small sketch after the example below).

import threading
import time
import random

num = 0
mylock = threading.RLock()

class myThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        global num
        while True:
            time.sleep(random.random())
            mylock.acquire()
            print("{} locked number : {}".format(threading.current_thread().name, num))
            if num >= 4:
                # mylock.acquire()   # an RLock could be re-acquired here by the same thread
                # num += 7
                # mylock.release()
                mylock.release()
                print("{} released number : {}".format(threading.current_thread().name, num))
                break
            num += 1
            print("{} released number : {}".format(threading.current_thread().name, num))
            mylock.release()

print("{} is running...".format(threading.current_thread().name))
t1 = myThread()
t2 = myThread()

t1.start()
t2.start()

t1.join()
t2.join()
print("{} end...".format(threading.current_thread().name))

Coroutines

Not covered in detail here; the gevent library is commonly used, and a minimal sketch is shown below.
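
A minimal gevent sketch, assuming gevent is installed; the task names and sleep times are made up for illustration:

import gevent

def fetch(name, delay):
    print("greenlet {} start".format(name))
    gevent.sleep(delay)        # cooperative sleep: yields control to other greenlets
    print("greenlet {} done".format(name))

# spawn greenlets and wait for all of them to finish
jobs = [gevent.spawn(fetch, "url_" + str(i), 0.5) for i in range(3)]
gevent.joinall(jobs)
# for real blocking I/O (sockets, etc.) gevent.monkey.patch_all() is the usual first step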

Distributed processes

The managers submodule of multiprocessing can distribute work across several machines; in essence the processes still communicate through a queue, with a local queue exposed over the network.

# taskManager: run this on the master machine first; workers connect to port 8001
import time
from multiprocessing import Queue
from multiprocessing.managers import BaseManager


task_queue = Queue()
result_queue = Queue()

class QueueManager(BaseManager):
    pass

# expose the two local queues over the network under these names
QueueManager.register('get_task_queue', callable=lambda: task_queue)
QueueManager.register('get_result_queue', callable=lambda: result_queue)

manager = QueueManager(address=('', 8001), authkey=b'lrz')

manager.start()

task = manager.get_task_queue()
result = manager.get_result_queue()

for url in ['THISURL_' + str(i) for i in range(10)]:
    print("put task {}...".format(url))
    task.put(url)

print("try to get results...")
for i in range(10):
    res = result.get(timeout=5)
    print("got result {}...".format(res))

manager.shutdown()


# taskWorker: run on one or more worker machines after taskManager has started

import time
from multiprocessing.managers import BaseManager

class QueueManager(BaseManager):
    pass

# register by name only; the actual queues live on the manager's side
QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')

server_address = '127.0.0.1'   # replace with the manager machine's IP when distributed
print("Connecting to the server {}...".format(server_address))
manager = QueueManager(address=(server_address, 8001), authkey=b'lrz')

manager.connect()

task = manager.get_task_queue()
result = manager.get_result_queue()

while not task.empty():
    url = task.get()
    print("run task: download {}...".format(url))
    time.sleep(1)                                # pretend to do some work
    result.put("{} success ...".format(url))

print("worker exit.")


# for Linux

 
