# Process + coroutine crawler implemented with gevent (written for Python 2).
# Library imports follow.
import gevent
from multiprocessing.pool import Pool
import requests
import time
from collections import deque
from functools import wraps
# Crawl targets, consumed (popped from the right) by createMultiProcess().
# NOTE(review): the name `list` shadows the builtin; it is kept because
# createMultiProcess() reads this exact name, but prefer the non-shadowing
# URL_LIST alias below in new code.
list = [
    "http://www.baidu.com",
    "http://www.qq.com",
    "http://www.weibo.com",
    "http://www.tencent.com",
    "http://www.jd.com",
    "http://www.meituan.com",
    "http://www.douban.com",
    "http://www.hao123.com",
    "http://www.vip.com",
    "http://www.sohu.com",
    "http://www.alibaba.com",
    "http://www.jumei.com",
]
# Backward-compatible, non-shadowing alias for the same object.
URL_LIST = list
def getUrlContent(url):
    '''
    Fetch the page at *url* and return the requests Response object.

    Fixes: use HTTP GET instead of POST (the original issued a POST, which
    is the wrong verb for fetching these homepages and is rejected by most
    of them), and bound the request with a timeout so a stalled server
    cannot block its greenlet forever.

    :param url: absolute URL to fetch
    :return: requests.Response for the completed request
    :raises requests.RequestException: on connection failure or timeout
    '''
    content = requests.get(url, timeout=10)
    print(content)
    return content
def coroutine(*url_args):
    '''
    Fetch every URL in *url_args* concurrently.

    Spawns one gevent greenlet per URL (each running getUrlContent) and
    blocks until all of them have finished.
    '''
    print(url_args)
    greenlets = [gevent.spawn(getUrlContent, target) for target in url_args]
    gevent.joinall(greenlets)
def printTime(func):
    '''
    Decorator that reports how long each call to *func* takes.

    Prints the elapsed wall-clock time in seconds after the call and
    returns the wrapped function's result unchanged.

    :param func: callable to time
    :return: wrapped callable with identical signature and result
    '''
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Fix: forward keyword arguments too — the original wrapper only
        # accepted positional args, so decorated functions raised TypeError
        # when called with keywords.
        startTime = time.time()
        r = func(*args, **kwargs)
        endTime = time.time()
        print("time %s" % str(endTime - startTime))
        return r
    return wrapper
@printTime
def createMultiProcess(processNumber, coroutineNumber):
    '''
    Dispatch the crawl across a pool of worker processes.

    URLs are drained from the module-level `list` in chunks of
    *coroutineNumber*; each chunk is handed to one pool task, which runs
    coroutine() to fetch its URLs concurrently with gevent.

    Fixes: size the pool with *processNumber* (the original called Pool()
    with no argument, so the parameter never limited parallelism), and
    loop until the deque is empty (the original ran exactly processNumber
    iterations, silently dropping any URLs beyond
    processNumber * coroutineNumber — with (2, 4) only 8 of 12 URLs were
    ever fetched).

    :param processNumber: number of worker processes in the pool
    :param coroutineNumber: URLs (greenlets) handled per pool task
    '''
    urlList = deque(list)
    p = Pool(processNumber)
    while urlList:
        # Carve off up to coroutineNumber URLs for this task.
        urlBlock = []
        for _ in range(coroutineNumber):
            if urlList:
                urlBlock.append(urlList.pop())
        p.apply_async(coroutine, args=tuple(urlBlock))
    p.close()
    p.join()
# Entry point: 2 worker processes, each task covering up to 4 URLs.
if __name__ == '__main__':
    createMultiProcess(processNumber=2, coroutineNumber=4)