# Control-host program
'''The host dispatches crawl URLs; crawler (worker) nodes fetch the pages and return the results to the host.'''
# This revision targets the large speed gap between dispatching URLs and crawling
# them, which let the dispatch queue pile up and eat memory. The new scheme:
#  - when the dispatch queue holds more than 200 URLs, pause sending new tasks;
#  - when new_urls exceeds 200 entries, spill them in batches to a temporary file to cut memory use;
#  - when old_urls exceeds 200 entries, persist them to a local file.
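# Process layout (three local processes wired through the queues created in __main__):
#   send_url      -> url_q (network)    -> crawler nodes
#   crawler nodes -> result_q (network) -> data_manager
#   data_manager  -> data_q             -> data_saves (writes page data to disk)
#   data_manager  -> urlmanager_q       -> send_url (feeds extracted links back)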
import pickle, sys, tempfile, time
from multiprocessing import Process, Queue
from multiprocessing.managers import BaseManager
class url_manager(object):
def __init__(self):
self.new_urls = self.load_process('newurls.txt')
self.old_urls = self.load_process('oldurls.txt')
    def add_new_url(self, url):  # checking membership in old_urls alone is enough; new_urls is a set and dedupes on its own
if url not in self.old_urls:
self.new_urls.add(url)
    def add_new_urls(self, urls):
        if urls is not None:
            for i in urls:
                self.add_new_url(i)
def has_new_url(self):
return len(self.new_urls) != 0
def get_new_url(self):
a = self.new_urls.pop()
self.old_urls.add(a)
return a
    # def md_url(self, url):  # optional md5 fingerprint for URLs (would need: import hashlib)
    #     a = hashlib.md5()
    #     a.update(bytes(url, encoding='utf-8'))
    #     return a.hexdigest()
    def save_process(self, path, data):
        """Append one pickled snapshot of `data` to `path`."""
        with open(path, 'ab+') as f:
            pickle.dump(data, f)
    def load_process(self, path):
        """Load saved progress; merge every pickled set appended to the file."""
        print('loading progress from file: %s' % path)
        data = set()
        try:
            with open(path, 'rb') as f:
                while True:  # save_process appends, so the file may hold several pickled sets
                    data |= pickle.load(f)
        except FileNotFoundError:
            print('file not created yet:', path)
        except EOFError:
            pass  # reached the end of the file
        return data
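# Illustrative url_manager flow (the URL below is a hypothetical placeholder):
#   m = url_manager()
#   m.add_new_urls(['https://example.com/a'])
#   while m.has_new_url():
#       url = m.get_new_url()  # moves the URL from new_urls into old_urls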
class data_save(object):
def __init__(self):
        self.date = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
        self.filepath = 'baike%s.txt' % self.date
        self.urlpath = 'url%s.txt' % self.date
self.data = []
    def data_saving(self, path, datas):
        """Buffer records; flush to disk every sixth record or on the 'end' sentinel."""
        if datas != 'end':  # do not write the sentinel itself
            self.data.append(datas)
        if len(self.data) > 5 or datas == 'end':
            with open(path, 'a+', encoding='utf-8') as f:
                for i in self.data:
                    f.write(i)
                    f.write('\n')  # one record per line
            self.data = []
class controller(object):  # wires the network queues to the local control processes
def __init__(self):
self.url_manag = url_manager()
self.dataing = data_save()
def multi_processmanager(self, url_q, result_q):
BaseManager.register('get_task_queue', callable=url_q)
BaseManager.register('get_result_queue', callable=result_q)
manager = BaseManager(address=('127.0.0.1', 8100), authkey='baike'.encode())
manager.start()
return manager
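    # A minimal sketch of the matching crawler-node side (an assumption based on
    # the registrations above; not part of this file): a worker registers the same
    # queue names, connects with the same address/authkey, then consumes tasks and
    # pushes back (data, urls) pairs.
    #
    #   from multiprocessing.managers import BaseManager
    #   class WorkerManager(BaseManager):
    #       pass
    #   WorkerManager.register('get_task_queue')
    #   WorkerManager.register('get_result_queue')
    #   m = WorkerManager(address=('127.0.0.1', 8100), authkey='baike'.encode())
    #   m.connect()
    #   task_q, result_q = m.get_task_queue(), m.get_result_queue()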
    def send_url1(self, url_q, send_url_q, root_url):
        # Legacy dispatcher: feed received URLs into url_manager and forward them to
        # the crawler nodes (the __main__ block below uses send_url instead).
self.url_manag.add_new_url(root_url)
num1 = 0
while True:
            if not send_url_q.empty():  # move newly received URLs into new_urls for crawling
urls = send_url_q.get()
if urls == 'end':
                    self.url_manag.save_process(self.dataing.urlpath, self.url_manag.old_urls)  # persist the crawled URLs
break
self.url_manag.add_new_urls(urls)
if self.url_manag.has_new_url():
old_url = self.url_manag.get_new_url()
                url_q.put(old_url)  # push onto the network queue for the crawler nodes
num1 += 1
print(num1, 'is running:', old_url)
    def data_manager(self, result_q, send_data_q, send_url_q):
        # Dispatch results that crawler nodes push onto the network result queue:
        # page data goes to the local data queue (saved to disk) and extracted URLs
        # go to the URL queue (forwarded on to url_manager).
        while True:
            if not result_q.empty():
                data = result_q.get()  # each result holds two parts: page data and extracted URLs
                if data[0] == 'end' or data[1] == 'end':
                    send_data_q.put('end')  # propagate the sentinel to the storage process
                    send_url_q.put('end')   # propagate the sentinel to the send_url process
                    break
                send_data_q.put(data[0])  # page data -> storage process
                if data[1] != 'Null':
                    send_url_q.put(data[1])  # extracted URLs -> send_url process
    def data_saves(self, data_q):  # storage process: write crawled page data to disk
while True:
if not data_q.empty():
data1 = data_q.get()
if data1 == 'end':
break
self.dataing.data_saving(self.dataing.filepath, data1)
    def send_url(self, url_q, send_url_q, root_url):  # dispatcher that also spills new/old URLs to local files
        self.url_manag.add_new_url(root_url)
        num1, num2, num3 = 0, 0, 0
        temp = tempfile.TemporaryFile()  # temp file that buffers spilled new URLs; removed automatically on close
        urls = []
while True:
if self.url_manag.has_new_url():
old_url = self.url_manag.get_new_url()
                url_q.put(old_url)  # push onto the network queue for the crawler nodes
num1 += 1
print(num1, 'is sending:', old_url)
            if not send_url_q.empty():  # move newly received URLs into new_urls for crawling
urls = send_url_q.get()
                if urls == 'end':  # crawl finished: persist the crawled URLs to disk and stop
self.url_manag.save_process(self.dataing.urlpath, self.url_manag.old_urls)
self.url_manag.old_urls = set()
break
                elif urls != []:
                    if num2 < 10:  # early in the crawl, feed URLs straight into the manager
                        self.url_manag.add_new_urls(urls)
                        num2 += 1
                        continue
                    else:
                        # Spill new URLs to the temp file, one hex-encoded pickle per
                        # line: raw pickle bytes can contain b'\n', which would break
                        # the readlines()-based framing used below, and a pickle that
                        # spanned lines made pickle.loads fail with 'run out of input'.
                        if len(urls) > 8:  # store large batches URL by URL
                            for i in urls:
                                temp.write(pickle.dumps(i).hex().encode())
                                temp.write(b'\n')
                        else:
                            temp.write(pickle.dumps(urls).hex().encode())
                            temp.write(b'\n')
            if url_q.qsize() < 100:  # dispatch queue is running low: refill from the spilled URLs
                temp.seek(0)
                lines = temp.readlines()  # also leaves the file position at EOF, so later writes still append
                if num3 < len(lines):
                    urldata = lines[num3]  # consume spilled records one line per pass
                    num3 += 1
                    url1 = pickle.loads(bytes.fromhex(urldata.decode().strip()))
                    if isinstance(url1, list):
                        self.url_manag.add_new_urls(url1)
                    else:
                        self.url_manag.add_new_urls([url1])
            if len(self.url_manag.old_urls) > 100:  # old_urls has grown large: persist it to disk and reset
self.url_manag.save_process(self.dataing.urlpath, self.url_manag.old_urls)
self.url_manag.old_urls = set()
url_q = Queue()     # network queue: control node -> crawler nodes (crawl tasks)
result_q = Queue()  # network queue: crawler nodes -> control node (page data)
def url_q1():
return url_q
def result_q1():
return result_q
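# Design note: url_q1 and result_q1 are plain module-level functions rather than
# lambdas because the callables handed to BaseManager.register must be picklable
# when the manager server starts in a separate process (e.g. under the 'spawn'
# start method on Windows). Rationale inferred from the code, not stated in it.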
if __name__ == '__main__':
    sys.setrecursionlimit(1000000)  # without this the crawler easily hits RecursionError (e.g. when pickling deep structures)
    data_q = Queue()        # local queue for page data (title, referrer, ...) headed for disk
    urlmanager_q = Queue()  # local queue feeding extracted URLs back to url_manager
    url = r'https://baike.baidu.com/item/%E5%8C%96%E5%AD%A6/127240'
    url1 = r'https://baike.baidu.com/item/%E8%87%AA%E7%84%B6%E7%A7%91%E5%AD%A6/260539'  # alternate seed URL (currently unused)
a = controller()
manag = a.multi_processmanager(url_q1, result_q1)
    url_queue = manag.get_task_queue()
    result_queue = manag.get_result_queue()  # obtain proxies for the network queues
p1 = Process(target=a.send_url, args=(url_queue, urlmanager_q, url,))
p2 = Process(target=a.data_manager, args=(result_queue, data_q, urlmanager_q,))
p3 = Process(target=a.data_saves, args=(data_q,))
p1.start()
p2.start()
p3.start()
p1.join()
p2.join()
p3.join()