# server
import codecs
import queue
import re
import select
import socket
from multiprocessing import Pool, cpu_count

import redis
from pymongo import MongoClient
# Address the listening socket is bound to.
host = '192.168.1.107'

# Every socket handed to select(): the listening socket plus one per client.
ConnectionList = []

# Maximum number of bytes requested by a single recv() call.
Recv_buffer = 4096000

# Client socket -> number of cores that client currently has available.
Client_Status = {}

# Client socket -> number shown for that client in log messages.
Client_Num = {}

# Redis connection used as the "already seen" set: URL -> sequence number.
redis1 = redis.Redis(host='localhost', port=6379, db=0)
# Number of clients accepted so far; the next client number is Num + 1.
Num = 0


class Distributed_Web_Crawler:
    """Coordinator of a distributed crawl of movie.douban.com.

    The server listens on a TCP port.  Each client first sends its number of
    usable cores; the server then sends it space-terminated URLs (one per
    free core) and receives fetched pages back.  Pages are stored in MongoDB
    (``Web.data``), and every new link found in a page is recorded in Redis
    and queued for crawling.

    Note that constructing an instance starts the event loop and therefore
    never returns.
    """

    _URL_PREFIX = 'https://movie.douban.com/'
    # Marker the incoming text is split on to obtain individual pages.
    _PAGE_DELIMITER = 'Page_ContentPPPPPP///'
    # Site links up to the closing double quote; the group is the path part.
    _LINK_PATTERN = re.compile(r'https://movie\.douban\.com/(.*?)"')

    def __init__(self, port):
        """Bind to ``(host, port)``, seed the crawl and run :meth:`main`.

        Args:
            port: TCP port to listen on.
        """
        self.url_num = 1
        self.queue = queue.Queue()

        # One client serves both collections instead of one client per page.
        mongo = MongoClient()
        self.db = mongo.CrawSpider.content
        self._pages = mongo.Web.data

        # Per-connection state that must outlive a single recv() call.
        self._peers = {}     # client socket -> (ip, port) captured at accept
        self._decoders = {}  # client socket -> incremental UTF-8 decoder

        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.server_socket.bind((host, port))
        self.server_socket.listen(10)

        # Kept for compatibility.  Work is done inline because the handlers
        # mutate in-process state (queue, client registries) that worker
        # processes would not share.  max() keeps single-core hosts working.
        self.pool = Pool(max(1, cpu_count() - 1))

        ConnectionList.append(self.server_socket)
        print("服务器运行在端口:" + str(port))

        address = 'https://movie.douban.com/'
        self.queue.put(address)
        redis1.set(address, 0)
        self.main()

    def main(self):
        """Run the select() loop forever: dispatch URLs, accept, receive."""
        while True:
            # ConnectionList always holds the listening socket, so "> 1"
            # means at least one client is connected.
            if not self.queue.empty() and len(ConnectionList) > 1:
                self.task_manage()

            readable, _, _ = select.select(ConnectionList, [], [])
            for sock in readable:
                if sock is self.server_socket:
                    self._accept_client()
                else:
                    self._serve_client(sock)

    def _accept_client(self):
        """Accept one client and register it after reading its core count.

        The handshake read blocks until the client sends something.  A client
        that disconnects or sends a non-integer is closed and ignored.
        """
        global Num
        conn, addr = self.server_socket.accept()
        try:
            cores = int(conn.recv(Recv_buffer).decode('utf8'))
        except (OSError, ValueError):
            # ValueError also covers UnicodeDecodeError and an empty payload.
            conn.close()
            return

        # A running counter keeps client numbers unique after disconnects.
        Num += 1
        ConnectionList.append(conn)
        Client_Status[conn] = cores
        Client_Num[conn] = Num
        self._peers[conn] = addr
        # Bytes of a character split across two recv() calls are buffered
        # by the decoder instead of raising.
        self._decoders[conn] = codecs.getincrementaldecoder('utf8')(
            errors='replace')
        print('客户端' + addr[0] + ':' + str(addr[1]) + '已连接,核心数:'
              + str(cores) + '\n编号为' + str(Client_Num[conn]))

    def _serve_client(self, sock):
        """Read pages sent by ``sock`` and process them; drop it on EOF.

        NOTE(review): the free-core count grows by the number of pieces the
        text splits into, empty pieces included, as in the original code.
        Confirm this against the framing the client actually uses.
        """
        try:
            data = sock.recv(Recv_buffer)
        except OSError:
            # A reset connection is treated like an orderly close.
            data = b''
        if not data:
            self._drop_client(sock)
            return

        text = self._decoders[sock].decode(data)
        if not text:
            # Only part of a multi-byte character has arrived so far.
            return

        contents = text.split(self._PAGE_DELIMITER)
        Client_Status[sock] = Client_Status[sock] + len(contents)
        print('编号' + str(Client_Num[sock]) + '可用核心'
              + str(Client_Status[sock]))
        for content in contents:
            if content:
                self.web_page_resolution(content)

    def _drop_client(self, sock):
        """Close ``sock`` and remove it from every registry."""
        addr = self._peers.pop(sock, None)
        if addr is not None:
            print('客户端' + addr[0] + ':' + str(addr[1]) + '断开连接')
        sock.close()
        Client_Status.pop(sock, None)
        Client_Num.pop(sock, None)
        self._decoders.pop(sock, None)
        if sock in ConnectionList:
            ConnectionList.remove(sock)

    def web_page_resolution(self, content):
        """Store one page and queue every link in it not seen before.

        Args:
            content: Decoded text of a fetched page.
        """
        self._pages.insert_one({'page_content': content})
        for url in self._extract_links(content):
            if redis1.get(url) is None:
                redis1.set(url, self.url_num)
                self.queue.put(url)
                self.url_num += 1

    @classmethod
    def _extract_links(cls, content):
        """Return the absolute site URLs found in ``content``, in order."""
        return [cls._URL_PREFIX + path
                for path in cls._LINK_PATTERN.findall(content)]

    def task_manage(self):
        """Send each connected client one queued URL per free core.

        Each client gets its own batch, every URL followed by a single
        space.  If sending fails, the batch is put back on the queue and the
        client is dropped.
        """
        # Iterate over a copy: a failed send removes the client from the list.
        for client in list(ConnectionList):
            if client is self.server_socket:
                continue
            batch = self._take_urls(self.queue, Client_Status[client])
            if not batch:
                continue
            payload = ''.join(url + ' ' for url in batch).encode('utf8')
            try:
                client.sendall(payload)
            except OSError:
                for url in batch:
                    self.queue.put(url)
                self._drop_client(client)
                continue
            Client_Status[client] = Client_Status[client] - len(batch)

    @staticmethod
    def _take_urls(pending, limit):
        """Pop up to ``limit`` items from ``pending`` and return them."""
        batch = []
        while len(batch) < limit and not pending.empty():
            batch.append(pending.get())
        return batch


if __name__ == "__main__":
    port = 8888
    Distributed_Web_Crawler(port)