在网上看到了好多关于python的thread的问题,关于多线程的爬虫,我发现网上一些论坛的代码实际上是单线程的,而我也是刚刚开始搞多线程,这是我的理解,请各位高手多多指教,前面是论坛代码,后面是我自己写的代码,新手,还请指点一下,一起进步~
论坛代码(自己研究了一下,大体是这样):
# encoding: UTF-8
import threading
import urllib,urllib2
import Queue
import bs4
from bs4 import BeautifulSoup
import time
# Sites to crawl; main() puts each URL on `queue`.
hosts = [
    'http://www.baidu.com',
    'http://www.weibo.com',
    'http://www.renren.com',
]
# Unbounded FIFO queues (maxsize=0 means "no size limit").
# `queue` carries URLs to the downloader thread; `out_queue` carries the
# downloaded page bodies to the parser thread.
queue = Queue.Queue(maxsize=0)
out_queue = Queue.Queue(maxsize=0)
class ContentThread(threading.Thread):
    """Worker thread that downloads pages.

    Repeatedly takes a URL from ``queue``, fetches it with
    ``urllib.urlopen`` and puts the raw response body on ``out_queue``.
    """

    def __init__(self, queue, out_queue):
        """Remember the input queue (URLs) and the output queue (page bodies)."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Fetch URLs forever; the loop has no exit condition of its own."""
        import sys  # local import: only needed for error reporting

        while True:
            host = self.queue.get()
            try:
                url = urllib.urlopen(host)
                try:
                    chunk = url.read()
                finally:
                    # Release the connection even when reading fails.
                    url.close()
                self.out_queue.put(chunk)
            except Exception as exc:
                # This loop is the thread's top-level boundary: report the
                # failed download and keep serving the remaining URLs
                # instead of letting the only worker die.
                sys.stderr.write(
                    '%s: failed to fetch %s: %s\n' % (self.name, host, exc))
            finally:
                # Always acknowledge the item; otherwise queue.join() in the
                # caller would block forever after a single failure.
                self.queue.task_done()
class match(threading.Thread):
    """Worker thread that parses downloaded pages and prints their titles.

    Repeatedly takes a page body from ``out_queue``, parses it with
    BeautifulSoup and prints the ``<title>`` tags followed by this
    thread's name.
    """

    def __init__(self, out_queue):
        """Remember the queue that delivers page bodies to parse."""
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        """Parse pages forever; the loop has no exit condition of its own."""
        import sys  # local import: only needed for error reporting

        while True:
            chunk = self.out_queue.get()
            try:
                soup = BeautifulSoup(chunk)
                # Single-argument print(...) produces the same output as the
                # print statement under Python 2.
                print(soup.findAll(['title']))
                print(self.name)
            except Exception as exc:
                # This loop is the thread's top-level boundary: report the
                # failure and continue with the next page.
                sys.stderr.write(
                    '%s: failed to parse page: %s\n' % (self.name, exc))
            finally:
                # Always acknowledge the item; otherwise out_queue.join() in
                # the caller would block forever after a single failure.
                self.out_queue.task_done()
# Reference timestamp for the elapsed-time report, taken at module load.
start = time.time()


def main():
    """Run the two-stage crawl and wait until both queues are drained.

    Starts one ContentThread (download) and one match thread (parse), feeds
    every URL in ``hosts`` into ``queue``, then blocks on both queues.
    """
    downloader = ContentThread(queue, out_queue)
    # Daemon thread: its endless loop must not keep the process alive.
    downloader.daemon = True
    downloader.start()

    for url in hosts:
        queue.put(url)

    parser = match(out_queue)
    parser.daemon = True
    parser.start()

    # Wait for every URL to be fetched, then for every page to be parsed.
    queue.join()
    out_queue.join()
if __name__ == "__main__":
    # Run the crawl, then report the wall-clock time since `start`.
    main()
    elapsed = time.time() - start
    print('爬虫耗时: %s秒' % elapsed)
这个代码在task_done()之后加上time.sleep(10)会有明显的堵塞情况。输出thread name也是显示相同的线程名字。
这是我写的多线程爬虫:
# encoding: UTF-8
import threading
import urllib,urllib2
import Queue
import bs4
from bs4 import BeautifulSoup
import time
# Unbounded FIFO queue (maxsize=0 means "no size limit") shared by the
# producer function and every worker thread.
queue = Queue.Queue(maxsize=0)
# Sites to crawl; one worker thread is started per entry.
hosts = [
    'http://www.baidu.com',
    'http://www.weibo.com',
    'http://www.renren.com',
]
class MyThread(threading.Thread):
    """One-shot worker: fetch a single URL and print its ``<title>`` tags.

    Each instance takes exactly one URL from the module-level ``queue``,
    downloads it, prints the thread name, the titles found and the time
    elapsed since the module-level ``start`` timestamp, then finishes.
    """

    def __init__(self):
        """Bind the worker to the shared module-level queue."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Handle one URL; blocks until one is available on the queue."""
        host = self.queue.get()
        # Single-argument print(...) produces the same output as the print
        # statement under Python 2.
        print(self.name)
        url = urllib.urlopen(host)
        try:
            chunk = url.read()
        finally:
            # Close the response explicitly instead of leaving the socket
            # open until garbage collection.
            url.close()
        soup = BeautifulSoup(chunk)
        print(soup.findAll(['title']))
        print('爬虫耗时: %s秒' % (time.time() - start))
# Reference timestamp for the per-thread elapsed-time report.
start = time.time()


def host():
    """Start one MyThread per URL in ``hosts`` and enqueue that URL.

    For every entry a new worker is started first, then the URL is put on
    the shared ``queue`` and echoed to stdout.
    """
    for url in hosts:
        worker = MyThread()
        worker.start()
        msg = str(url)
        queue.put(msg)
        print(msg)
if __name__ == "__main__":
    # Script entry point: start one worker per URL.
    host()
这个在输出时间之后再sleep不会出现堵塞情况,输出thread name也是显示不同的线程名字
论坛代码(自己研究了一下,大体是这样):
# encoding: UTF-8
import threading
import urllib,urllib2
import Queue
import bs4
from bs4 import BeautifulSoup
import time
# Sites to crawl; main() puts each URL on `queue`.
hosts = [
    'http://www.baidu.com',
    'http://www.weibo.com',
    'http://www.renren.com',
]
# Unbounded FIFO queues (maxsize=0 means "no size limit").
# `queue` carries URLs to the downloader thread; `out_queue` carries the
# downloaded page bodies to the parser thread.
queue = Queue.Queue(maxsize=0)
out_queue = Queue.Queue(maxsize=0)
class ContentThread(threading.Thread):
    """Worker thread that downloads pages.

    Repeatedly takes a URL from ``queue``, fetches it with
    ``urllib.urlopen`` and puts the raw response body on ``out_queue``.
    """

    def __init__(self, queue, out_queue):
        """Remember the input queue (URLs) and the output queue (page bodies)."""
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        """Fetch URLs forever; the loop has no exit condition of its own."""
        import sys  # local import: only needed for error reporting

        while True:
            host = self.queue.get()
            try:
                url = urllib.urlopen(host)
                try:
                    chunk = url.read()
                finally:
                    # Release the connection even when reading fails.
                    url.close()
                self.out_queue.put(chunk)
            except Exception as exc:
                # This loop is the thread's top-level boundary: report the
                # failed download and keep serving the remaining URLs
                # instead of letting the only worker die.
                sys.stderr.write(
                    '%s: failed to fetch %s: %s\n' % (self.name, host, exc))
            finally:
                # Always acknowledge the item; otherwise queue.join() in the
                # caller would block forever after a single failure.
                self.queue.task_done()
class match(threading.Thread):
    """Worker thread that parses downloaded pages and prints their titles.

    Repeatedly takes a page body from ``out_queue``, parses it with
    BeautifulSoup and prints the ``<title>`` tags followed by this
    thread's name.
    """

    def __init__(self, out_queue):
        """Remember the queue that delivers page bodies to parse."""
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        """Parse pages forever; the loop has no exit condition of its own."""
        import sys  # local import: only needed for error reporting

        while True:
            chunk = self.out_queue.get()
            try:
                soup = BeautifulSoup(chunk)
                # Single-argument print(...) produces the same output as the
                # print statement under Python 2.
                print(soup.findAll(['title']))
                print(self.name)
            except Exception as exc:
                # This loop is the thread's top-level boundary: report the
                # failure and continue with the next page.
                sys.stderr.write(
                    '%s: failed to parse page: %s\n' % (self.name, exc))
            finally:
                # Always acknowledge the item; otherwise out_queue.join() in
                # the caller would block forever after a single failure.
                self.out_queue.task_done()
# Reference timestamp for the elapsed-time report, taken at module load.
start = time.time()


def main():
    """Run the two-stage crawl and wait until both queues are drained.

    Starts one ContentThread (download) and one match thread (parse), feeds
    every URL in ``hosts`` into ``queue``, then blocks on both queues.
    """
    downloader = ContentThread(queue, out_queue)
    # Daemon thread: its endless loop must not keep the process alive.
    downloader.daemon = True
    downloader.start()

    for url in hosts:
        queue.put(url)

    parser = match(out_queue)
    parser.daemon = True
    parser.start()

    # Wait for every URL to be fetched, then for every page to be parsed.
    queue.join()
    out_queue.join()
if __name__ == "__main__":
    # Run the crawl, then report the wall-clock time since `start`.
    main()
    elapsed = time.time() - start
    print('爬虫耗时: %s秒' % elapsed)
这个代码在task_done()之后加上time.sleep(10)会有明显的堵塞情况。输出thread name也是显示相同的线程名字。
这是我写的多线程爬虫:
# encoding: UTF-8
import threading
import urllib,urllib2
import Queue
import bs4
from bs4 import BeautifulSoup
import time
# Unbounded FIFO queue (maxsize=0 means "no size limit") shared by the
# producer function and every worker thread.
queue = Queue.Queue(maxsize=0)
# Sites to crawl; one worker thread is started per entry.
hosts = [
    'http://www.baidu.com',
    'http://www.weibo.com',
    'http://www.renren.com',
]
class MyThread(threading.Thread):
    """One-shot worker: fetch a single URL and print its ``<title>`` tags.

    Each instance takes exactly one URL from the module-level ``queue``,
    downloads it, prints the thread name, the titles found and the time
    elapsed since the module-level ``start`` timestamp, then finishes.
    """

    def __init__(self):
        """Bind the worker to the shared module-level queue."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Handle one URL; blocks until one is available on the queue."""
        host = self.queue.get()
        # Single-argument print(...) produces the same output as the print
        # statement under Python 2.
        print(self.name)
        url = urllib.urlopen(host)
        try:
            chunk = url.read()
        finally:
            # Close the response explicitly instead of leaving the socket
            # open until garbage collection.
            url.close()
        soup = BeautifulSoup(chunk)
        print(soup.findAll(['title']))
        print('爬虫耗时: %s秒' % (time.time() - start))
# Reference timestamp for the per-thread elapsed-time report.
start = time.time()


def host():
    """Start one MyThread per URL in ``hosts`` and enqueue that URL.

    For every entry a new worker is started first, then the URL is put on
    the shared ``queue`` and echoed to stdout.
    """
    for url in hosts:
        worker = MyThread()
        worker.start()
        msg = str(url)
        queue.put(msg)
        print(msg)
if __name__ == "__main__":
    # Script entry point: start one worker per URL.
    host()
这个在输出时间之后再sleep不会出现堵塞情况,输出thread name也是显示不同的线程名字