在运行环境一致的情况下,比较协程、线程和单线程的运行速度差别。
协程:
# Monkey patch: swaps Python's blocking library calls for cooperative,
# non-blocking ones. gevent's documentation asks for this to run as early as
# possible, before other modules are imported, so it is done ahead of
# `import requests` rather than after it.
from gevent import monkey
monkey.patch_all()

import re
import time

import gevent
import requests

from guanjianzi import keylist as keys

# One HTTP session, used by every request in this script.
s = requests.Session()
def getlist(url):
    """Fetch one index page and scan every article it lists, concurrently.

    Each ``<li class="li">`` entry on the page yields a (date, href, link
    text) triple. One greenlet running ``jix`` is spawned per entry, and the
    function returns once all of them have finished.

    Args:
        url: Address of the index page; article hrefs are resolved against it.
    """
    # Function-scope import so this block brings its own dependency.
    from urllib.parse import urljoin

    html = s.get(url).content.decode()
    res = r'<li class="li"><font class="date">(.*?)</font><a href="(.*?)" target="_blank">(.*?)</a><span class="new"></span></li>'
    li = re.findall(res, html)
    # Resolve each href against the page URL. The previous approach glued
    # url + href together and stripped "/index.html.", which only produced a
    # valid address for "./"-relative links on a base ending in "index.html".
    sps = [gevent.spawn(jix, x, urljoin(url, y), z) for x, y, z in li]
    gevent.joinall(sps)
def jix(x, y, z):
    """Download one article and print how often each keyword occurs in it.

    Args:
        x: First captured field of the list entry (the ``date`` element).
        y: URL of the article to download.
        z: Third captured field of the list entry (the link text).

    Prints ``y``, ``z``, a dict mapping each matched keyword to its count,
    the sum of those counts, and ``x``.
    """
    page = s.get(y).content.decode()
    hits = {}
    total = 0
    for keyword in keys:
        occurrences = page.count(keyword)
        # Keywords that never appear are left out of the report.
        if occurrences <= 0:
            continue
        hits[keyword] = occurrences
        total += occurrences
    print(y, z, hits, total, x)
if __name__ == '__main__':
    # Index pages to crawl.
    # NOTE(review): the second entry ends in "index.htmm", which looks like a
    # typo for "index.html" -- confirm before changing it.
    urls = [
        'http://www.ndrc.gov.cn/xwzx/xwfb/index.html',
        'http://www.ndrc.gov.cn/zwfwzx/zxgg/index.htmm',
        'http://www.ndrc.gov.cn/zwfwzx/xzxknew/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbl/index.html',
        'http://www.ndrc.gov.cn/zcfb/gfxwj/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbgg/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbghwb/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbtz/index.html',
        'http://www.ndrc.gov.cn/zcfb/jd/index.html',
        'http://www.ndrc.gov.cn/yjzq/index.html',
    ]
    start = time.time()
    # One greenlet per index page; wait for all of them before stopping the clock.
    li = [gevent.spawn(getlist, page_url) for page_url in urls]
    gevent.joinall(li)
    end = time.time()
    print('运行时间', end - start)
下面是使用线程池的版本,开了五个线程:
import requests
from guanjianzi import keylist as keys
import re
from conMySql import ConDb
from multiprocessing.dummy import Pool as ThreadPool
import time
# Timestamp taken at import time, before the session and ConDb() below are created.
start = time.time()
# One HTTP session, used by every getlist() call.
s=requests.Session()
# ConDb comes from the project-local conMySql module; in the visible code it
# is only referenced by the commented-out SQL statements.
con=ConDb()
def getlist(url):
    """Fetch one index page, then download and keyword-scan each listed article.

    Each ``<li class="li">`` entry on the page yields a (date, href, link
    text) triple. For every entry the article is downloaded, each keyword in
    ``keys`` is counted, and one line is printed: page title, article URL,
    link text, the dict of matched keywords with their counts, the total
    count, and the date.

    Args:
        url: Address of the index page; article hrefs are resolved against it.
    """
    # Function-scope import so this block brings its own dependency.
    from urllib.parse import urljoin

    html = s.get(url).content.decode()
    res = r'<li class="li"><font class="date">(.*?)</font><a href="(.*?)" target="_blank">(.*?)</a><span class="new"></span></li>'
    li = re.findall(res, html)
    # A page without a <title> would raise IndexError on [0]; fall back to an
    # empty source name so the rest of the page is still processed.
    titles = re.findall(r"<title>(.*?)</title>", html)
    title = titles[0] if titles else ''
    for x, y, z in li:
        # Resolve the href against the page URL instead of concatenating and
        # stripping "/index.html.", which only worked for "./"-relative links
        # on a base ending in "index.html".
        y = urljoin(url, y)
        text = s.get(y).content.decode()
        guanjianzi = {}
        num = 0
        for key in keys:  # count how often each keyword occurs in the article
            count = text.count(key)
            if count > 0:
                guanjianzi[key] = count  # record keyword -> occurrences
                num += count
        print(title, y, z, guanjianzi, num, x)
        # NOTE(review): the disabled insert below builds SQL with str.format;
        # if it is re-enabled, prefer query parameters if ConDb supports them.
        #sql=''' insert into urllist(source,urls,titles,keyname,keysum,date1) values('{}','{}','{}',"{}",'{}','{}') '''.format(title,y,z,guanjianzi,num,x)
        #con.runSql(sql)
if __name__ == '__main__':
    # Index pages to crawl.
    # NOTE(review): the second entry ends in "index.htmm", which looks like a
    # typo for "index.html" -- confirm before changing it.
    urls = [
        'http://www.ndrc.gov.cn/xwzx/xwfb/index.html',
        'http://www.ndrc.gov.cn/zwfwzx/zxgg/index.htmm',
        'http://www.ndrc.gov.cn/zwfwzx/xzxknew/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbl/index.html',
        'http://www.ndrc.gov.cn/zcfb/gfxwj/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbgg/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbghwb/index.html',
        'http://www.ndrc.gov.cn/zcfb/zcfbtz/index.html',
        'http://www.ndrc.gov.cn/zcfb/jd/index.html',
        'http://www.ndrc.gov.cn/yjzq/index.html',
    ]
    # Restart the clock right before dispatching work so the measurement
    # covers only the crawl, the same span the coroutine script times.
    start = time.time()
    t = ThreadPool(5)  # five worker threads
    pending = [(url, t.apply_async(getlist, args=(url,))) for url in urls]
    t.close()
    t.join()
    end = time.time()
    print('', end - start)
    # apply_async keeps a worker's exception inside its AsyncResult; calling
    # get() is what surfaces it, so report failures instead of losing them.
    for url, result in pending:
        try:
            result.get()
        except Exception as exc:
            print('failed:', url, repr(exc))
    #sql1='select max(bat) from urllist limit 1'
    #bat=con.runSql(sql1)[0][0]
    #bat=int(bat)+1
    # print(bat)
    #sql2="update urllist set bat='{}'".format(bat)
    #con.runSql(sql2)
运行时间如下:
线程池把任务(每个 URL)分配给线程的方式,与协程只按 URL 派发任务时的速度相当;但是协程进行二次封装(在 getlist 里再为每篇文章派生一个协程)之后,速度比线程池快了一倍。