The multithreaded crawler uses a depth-limited algorithm; here we optimize the code from the previous post: 爬虫实战(维基xx)-CSDN博客.
The code is as follows:
import requests
import re
import time
import threading
# https://www.britannica.com/topic/Wikipedia

# Shared state used by all crawler threads
g_mutex = threading.Condition()  # lock protecting the shared lists and counter
g_pages = []      # HTML of pages fetched at the current depth
g_queueURL = []   # URLs waiting to be crawled
g_existURL = []   # URLs already crawled (or failed)
g_writecount = 0  # number of link records written to title2.txt
class Crawler:
    def __init__(self, url, threadnum):
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []

    def craw(self):
        global g_queueURL, g_pages
        g_queueURL.append(self.url)  # seed the queue with the start URL
        depth = 1
        while depth < 3:
            print('searching depth', depth, '\n')
            self.downloadAll()
            self.updateQueueURL()
            g_pages = []  # clear the fetched pages before the next depth level
            depth += 1
    def downloadAll(self):
        # launch threads in batches of threadnum until the queue is drained
        global g_queueURL
        i = 0
        while i < len(g_queueURL):
            j = 0
            while j < self.threadnum and i + j < len(g_queueURL):
                self.download(g_queueURL[i + j], j)  # was a mistaken recursive call to downloadAll
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)  # wait up to 30 s for each thread in the batch
            self.threadpool = []
        g_queueURL = []
    def download(self, url, tid):
        crawthread = CrawlerThread(url, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
    def updateQueueURL(self):  # after finishing one depth level, rebuild the queue
        global g_queueURL, g_existURL
        newUrlList = []
        for content in g_pages:
            newUrlList += self.getUrl(content)
        # queue only the URLs we have not crawled yet
        g_queueURL = list(set(newUrlList) - set(g_existURL))

    def getUrl(self, content):
        # parse wiki article links out of a fetched page
        link_list = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', content)
        unique_list = list(set(link_list))
        return unique_list
class CrawlerThread(threading.Thread):
    def __init__(self, url, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.tid = tid
    def run(self):
        global g_mutex, g_writecount
        try:
            print(self.tid, "crawl ", self.url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
            r = requests.get("https://en.wikipedia.org/wiki/" + self.url, headers=headers)
            html = r.text
            link_list2 = re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', html)
            unique_list2 = list(set(link_list2))
            for eachone in unique_list2:
                # the counter and the output file are shared, so guard them with the lock
                g_mutex.acquire()
                g_writecount += 1
                content2 = "No." + str(g_writecount) + "\tThread" + str(self.tid) + "\t" + self.url + '->' + eachone + '\n'
                with open('title2.txt', "a+") as f:
                    f.write(content2)
                g_mutex.release()
        except Exception as e:
            g_mutex.acquire()
            g_existURL.append(self.url)
            g_mutex.release()
            print('Failed downloading and saving', self.url)
            print(e)
            return None
        g_mutex.acquire()
        g_pages.append(html)  # store the page HTML so updateQueueURL can parse it (was self.url)
        g_existURL.append(self.url)
        g_mutex.release()
if __name__ == '__main__':
    url = 'wiki'
    threadnum = 5
    crawler = Crawler(url, threadnum)
    crawler.craw()
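As a quick sanity check of the link-extraction regular expression, the snippet below (my addition, not part of the original code) runs it against a single hard-coded anchor tag; the sample HTML string is invented for illustration:

import re

# Invented sample: one Wikipedia-style anchor tag
sample = '<a href="/wiki/Python_(programming_language)" title="Python">Python</a>'
# The group captures the article title after /wiki/; titles containing
# ':', '#', '=', '<' or '>' (special pages, anchors) are skipped.
print(re.findall('<a href="/wiki/([^:#=<>]*?)".*?</a>', sample))
# prints: ['Python_(programming_language)']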
Multithreading really does speed up the crawler, and raising the thread count can speed it up further, although the returns diminish once network I/O becomes the bottleneck.
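To see the effect of the thread count, a rough timing harness like the sketch below could be used (my sketch, not from the original post; the thread counts tried and the resetting of the shared globals between runs are assumptions). Because the work here is network-bound, CPython's GIL is not the limiting factor, but the gains flatten out once bandwidth or the lock around the output file becomes the bottleneck:

import time

# Hypothetical benchmark: assumes it runs in the same module,
# after the Crawler and CrawlerThread classes are defined.
for n in (1, 5, 10):
    g_pages.clear()     # reset shared state so each run starts fresh
    g_queueURL.clear()
    g_existURL.clear()
    start = time.time()
    Crawler('wiki', n).craw()
    print(n, 'threads:', round(time.time() - start, 1), 'seconds')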