在某些应用场景下,想要提高Python的并发能力,可以使用多线程或者协程,比如网络爬虫、数据库操作等IO密集型的场景。下面对比Python单线程、多线程和协程在网络爬虫场景下的速度。
一,单线程。
单线程代码
#!/usr/bin/env python
# coding:utf8
# Author: hz_oracle

import time

import MySQLdb
import gevent  # not used by the single-threaded listing; kept from the original
import requests

class DbHandler(object):
    """Small wrapper around a MySQLdb connection that loads picture URLs."""

    def __init__(self, host, port, user, pwd, dbname):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = dbname

    def db_conn(self):
        """Open the connection and create a cursor.

        Returns:
            1 on success, 0 if connecting raised an exception.
        """
        try:
            self.conn = MySQLdb.connect(host=self.host, port=self.port,
                                        user=self.user, passwd=self.pwd,
                                        db=self.db, charset="utf8")
            self.cursor = self.conn.cursor()
            return 1
        except Exception:
            return 0

    def get_urls(self, limitation):
        """Return up to ``limitation`` values of the ``pic`` column.

        The number of fetched URLs is printed. On any failure (including
        calling this before a successful ``db_conn``) the error is printed
        and an empty list is returned.
        """
        sql = "select pic from picurltable limit %s"
        try:
            # Pass the limit as a query parameter instead of formatting it
            # into the SQL text; int() keeps it a numeric literal.
            self.cursor.execute(sql, (int(limitation),))
            urls_list = [line[0] for line in self.cursor.fetchall()]
            print(len(urls_list))
        except Exception as e:
            print(u"数据库查询失败:%s" % e)
            return []
        return urls_list

    def db_close(self):
        """Close the connection opened by ``db_conn``."""
        self.conn.close()

def get_pic(url):
    """Download ``url`` and save the response body under ``./picture/``.

    The file name is the second-to-last ``/``-separated piece of the URL
    with ``.jpg`` appended.

    Returns:
        "ok" once the file has been written, "" if the download raised.
    """
    try:
        pic_obj = requests.get(url).content
    except Exception:
        print(u"图片出错")
        return ""
    filename = url.split('/')[-2]
    file_path = "./picture/" + filename + '.jpg'
    # ``with`` closes the file even when the write fails; ``open`` replaces
    # the Python-2-only ``file`` builtin.
    with open(file_path, 'wb') as fp:
        fp.write(pic_obj)
    return "ok"

def main():
    """Download up to 100 pictures one after another and print the elapsed seconds."""
    start_time = time.time()
    db_obj = DbHandler(host='127.0.0.1', port=3306, user='root', pwd='123456', dbname='pic')
    if not db_obj.db_conn():
        # Without a connection there is no cursor to query and nothing to close.
        print(u"数据库连接失败")
        return
    try:
        url_list = db_obj.get_urls(100)
    finally:
        db_obj.db_close()
    # Explicit loop rather than map(): map() is lazy on Python 3 and would
    # never call get_pic there.
    for url in url_list:
        get_pic(url)
    costtime = time.time() - start_time
    print(costtime)
    print("download END")

if __name__ == "__main__":
    main()
运行结果
100
45.1282339096
download END
单线程情况下,下载100张图片花了45秒。
再来看多线程的情况。
#!/usr/bin/env python
# coding:utf8
# Author: hz_oracle

import threading
import time

try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # the same module under its Python 3 name

import MySQLdb
import gevent  # not used by the multi-threaded listing; kept from the original
import requests

lock1 = threading.RLock()  # not referenced by the code in this listing
url_queue = Queue.Queue()  # URLs waiting to be downloaded; filled by DbHandler.get_urls
urls_list = list()  # not referenced by the code in this listing


class DbHandler(object):
    """Small wrapper around a MySQLdb connection that queues picture URLs."""

    def __init__(self, host, port, user, pwd, dbname):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = dbname

    def db_conn(self):
        """Open the connection and create a cursor.

        Returns:
            1 on success, 0 if connecting raised an exception.
        """
        try:
            self.conn = MySQLdb.connect(host=self.host, port=self.port,
                                        user=self.user, passwd=self.pwd,
                                        db=self.db, charset="utf8")
            self.cursor = self.conn.cursor()
            return 1
        except Exception:
            return 0

    def get_urls(self, limitation):
        """Put up to ``limitation`` values of the ``pic`` column on ``url_queue``.

        Returns:
            1 on success, 0 after printing the error if anything failed
            (including calling this before a successful ``db_conn``).
        """
        sql = "select pic from picurltable limit %s"
        try:
            # Pass the limit as a query parameter instead of formatting it
            # into the SQL text; int() keeps it a numeric literal.
            self.cursor.execute(sql, (int(limitation),))
            for line in self.cursor.fetchall():
                url_queue.put(line[0])
        except Exception as e:
            print(u"数据库查询失败:%s" % e)
            return 0
        return 1

    def db_close(self):
        """Close the connection opened by ``db_conn``."""
        self.conn.close()


class MyThread(threading.Thread):
    """Worker thread that downloads exactly one URL taken from ``url_queue``."""

    def __init__(self):
        super(MyThread, self).__init__()

    def run(self):
        """Take one URL off the queue, download it and save it under ``./picture/``.

        Returns "" if the download raised, otherwise None.
        """
        try:
            # Non-blocking get: a blocking get() would hang this thread
            # forever whenever there are more threads than queued URLs.
            url = url_queue.get_nowait()
        except Queue.Empty:
            return None
        try:
            pic_obj = requests.get(url).content
        except Exception:
            print(u"图片出错")
            return ""
        filename = url.split('/')[-2]
        file_path = "./picture/" + filename + '.jpg'
        # ``with`` closes the file even when the write fails.
        with open(file_path, 'wb') as fp:
            fp.write(pic_obj)
        return None


def main():
    """Download up to 100 pictures with one thread per URL and print the elapsed seconds."""
    start_time = time.time()
    db_obj = DbHandler(host='127.0.0.1', port=3306, user='root', pwd='123456', dbname='pic')
    if not db_obj.db_conn():
        # Without a connection there is no cursor to query and nothing to close.
        print(u"数据库连接失败")
        return
    try:
        db_obj.get_urls(100)
    finally:
        db_obj.db_close()
    # One thread per URL actually queued (100 when the table has enough rows).
    # No consumer is running yet, so qsize() is exact here.
    workers = [MyThread() for _ in range(url_queue.qsize())]
    for worker in workers:
        worker.start()
    # join() sleeps until each worker ends; polling active_count() in a tight
    # loop would burn CPU and compete with the downloads being timed.
    for worker in workers:
        worker.join()
    costtime = time.time() - start_time
    print(costtime)
    print("download END")

if __name__ == "__main__":
    main()
运行结果
15.408192873
download END
启用100个线程发现只要花15秒即可完成任务,100个线程可能不是最优的方案,但较单线程有很明显的提升。接着再来看协程。
协程代码
#!/usr/bin/env python
# coding:utf8
# Author: hz_oracle

# Patch the standard library first, so that the modules imported below
# (requests in particular) pick up the cooperative versions.
from gevent import monkey; monkey.patch_all()

import threading  # not used by the coroutine listing; kept from the original
import time

try:
    import Queue  # Python 2 module name; not used by the coroutine listing
except ImportError:
    import queue as Queue  # the same module under its Python 3 name

import MySQLdb
import gevent
import requests


class DbHandler(object):
    """Small wrapper around a MySQLdb connection that loads picture URLs."""

    def __init__(self, host, port, user, pwd, dbname):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = dbname

    def db_conn(self):
        """Open the connection and create a cursor.

        Returns:
            1 on success, 0 if connecting raised an exception.
        """
        try:
            self.conn = MySQLdb.connect(host=self.host, port=self.port,
                                        user=self.user, passwd=self.pwd,
                                        db=self.db, charset="utf8")
            self.cursor = self.conn.cursor()
            return 1
        except Exception:
            return 0

    def get_urls(self, limitation):
        """Return up to ``limitation`` values of the ``pic`` column.

        On any failure (including calling this before a successful
        ``db_conn``) the error is printed and an empty list is returned.
        """
        sql = "select pic from picurltable limit %s"
        try:
            # Pass the limit as a query parameter instead of formatting it
            # into the SQL text; int() keeps it a numeric literal.
            self.cursor.execute(sql, (int(limitation),))
            urls_list = [line[0] for line in self.cursor.fetchall()]
        except Exception as e:
            print(u"数据库查询失败:%s" % e)
            return []
        return urls_list

    def db_close(self):
        """Close the connection opened by ``db_conn``."""
        self.conn.close()


def get_pic(url):
    """Download ``url`` and save the response body under ``./picture/``.

    The file name is the second-to-last ``/``-separated piece of the URL
    with ``.jpg`` appended.

    Returns:
        "ok" once the file has been written, "" if the download raised.
    """
    try:
        pic_obj = requests.get(url).content
    except Exception:
        print(u"图片出错")
        return ""
    filename = url.split('/')[-2]
    file_path = "./picture/" + filename + '.jpg'
    # ``with`` closes the file even when the write fails; ``open`` replaces
    # the Python-2-only ``file`` builtin.
    with open(file_path, 'wb') as fp:
        fp.write(pic_obj)
    return "ok"

def main():
    """Download up to 100 pictures with one greenlet per URL and print the elapsed seconds."""
    start_time = time.time()
    db_obj = DbHandler(host='127.0.0.1', port=3306, user='root', pwd='123456', dbname='pic')
    if not db_obj.db_conn():
        # Without a connection there is no cursor to query and nothing to close.
        print(u"数据库连接失败")
        return
    try:
        url_list = db_obj.get_urls(100)
    finally:
        db_obj.db_close()
    # Spawn one greenlet per URL, then wait for all of them to finish.
    gevent.joinall([gevent.spawn(get_pic, url) for url in url_list])
    costtime = time.time() - start_time
    print(costtime)
    print("download END")

if __name__ == "__main__":
    main()
运行结果
10.6234440804
download END
使用协程只花了10秒多,是三种方法中最快的。
总结:
三种方法中,单线程最慢,多线程次之,而协程最快。 不过如果对多线程进行优化,也可能变快,这里不讨论。