Python 分布式爬虫

# server
import queue
import re
import select
import socket
from multiprocessing import Pool, cpu_count

import redis
from pymongo import MongoClient

# Address the listening socket is bound to.
host = '192.168.1.107'
# Every socket passed to select(): the listening socket plus one per client.
ConnectionList = []

# Maximum number of bytes requested from a single recv() call.
Recv_buffer = 4096000
# client socket -> free task slots; seeded with the core count the client
# reports on connect, decremented per URL dispatched, incremented as page
# contents come back.
Client_Status = {}

# client socket -> number used to identify the client in log output.
Client_Num = {}

# URL -> sequence number; a missing key means the URL has not been seen yet.
redis1 = redis.Redis(host='localhost', port=6379, db=0)

Num = 0  # module-level counter kept for compatibility; nothing here reads it


class Distributed_Web_Crawler:
    """Coordinator for a set of crawler clients connected over TCP.

    The server keeps a queue of URLs to fetch, hands them out to connected
    clients according to how many free slots each client has, receives the
    fetched page contents back, stores them in MongoDB and extracts new
    links, using Redis to remember which URLs were already queued.

    Note: the constructor calls :meth:`main`, which loops forever, so
    creating an instance blocks the calling thread.
    """

    # First URL placed in the queue; also the prefix of every followed link.
    SEED_URL = 'https://movie.douban.com/'
    # Marker the clients put between page contents in one transmission.
    PAGE_DELIMITER = 'Page_ContentPPPPPP///'
    # Captures the path of each same-site link up to the closing quote.
    _LINK_PATTERN = re.compile(r'https://movie\.douban\.com/(.*?)"')

    def __init__(self, port):
        """Bind the listening socket on ``(host, port)`` and start serving.

        Args:
            port: TCP port to listen on.

        Raises:
            OSError: if the socket cannot be bound or put into listen mode.
        """
        self.url_num = 1
        self.queue = queue.Queue()

        # One client for the whole process instead of one per stored page.
        mongo = MongoClient()
        self.db = mongo.CrawSpider.content  # kept as-is; unused in this class
        self._pages = mongo.Web.data

        self._client_addr = {}      # client socket -> (ip, port) from accept()
        self._next_client_num = 1   # monotonic, so numbers are never reused

        # No worker pool is created: the previous code passed the *result*
        # of already-executed calls to Pool.apply_async, so all work ran in
        # this process anyway, and the handlers mutate in-process state
        # (queue, sockets) that worker processes could not share.
        self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.server_socket.bind((host, port))
            self.server_socket.listen(10)
        except OSError:
            self.server_socket.close()
            raise

        ConnectionList.append(self.server_socket)
        print("服务器运行在端口:" + str(port))

        self.queue.put(self.SEED_URL)
        redis1.set(self.SEED_URL, 0)

        self.main()

    def main(self):
        """Run the event loop: dispatch queued URLs, then service sockets."""
        while True:
            # Only dispatch when there is work and at least one client
            # besides the listening socket.
            if not self.queue.empty() and len(ConnectionList) > 1:
                self.task_manage()

            readable, _, _ = select.select(ConnectionList, [], [])
            for sock in readable:
                if sock is self.server_socket:
                    self._accept_client()
                else:
                    self._read_client(sock)

    def _accept_client(self):
        """Accept a connection and register it using its reported core count."""
        conn, addr = self.server_socket.accept()
        try:
            # The first message from a client is its core count as text.
            cores = int(conn.recv(Recv_buffer).decode('utf8'))
        except (OSError, ValueError):
            # ValueError also covers undecodable bytes and an empty message.
            print('客户端' + addr[0] + ':' + str(addr[1]) + '握手失败,已拒绝')
            conn.close()
            return

        ConnectionList.append(conn)
        self._client_addr[conn] = addr
        Client_Status[conn] = cores
        Client_Num[conn] = self._next_client_num
        self._next_client_num += 1
        print('客户端' + addr[0] + ':' + str(addr[1]) + '已连接,核心数:'
              + str(cores) + '\n编号为' + str(Client_Num[conn]))

    def _read_client(self, sock):
        """Process page contents sent by a client, or drop it if it closed."""
        try:
            data = sock.recv(Recv_buffer)
        except OSError:
            data = b''  # treat a reset connection like an orderly close
        if not data:
            self._drop_client(sock)
            return

        # NOTE(review): recv() may return a partial page; there is no
        # per-client reassembly buffer. errors='replace' only keeps a
        # multi-byte character split across reads from crashing the server.
        contents = data.decode('utf8', errors='replace').split(
            self.PAGE_DELIMITER)
        # NOTE(review): empty segments are counted as freed slots too, as
        # before — confirm against the client's framing.
        Client_Status[sock] = int(Client_Status[sock]) + len(contents)
        print('编号' + str(Client_Num[sock]) + '可用核心'
              + str(Client_Status[sock]))
        for content in contents:
            if content:
                self.web_page_resolution(content)

    def _drop_client(self, sock):
        """Close a client socket and remove it from all bookkeeping tables."""
        addr = self._client_addr.pop(sock, ('unknown', 0))
        print('客户端' + addr[0] + ':' + str(addr[1]) + '断开连接')
        sock.close()
        Client_Status.pop(sock, None)
        Client_Num.pop(sock, None)
        if sock in ConnectionList:
            ConnectionList.remove(sock)

    def web_page_resolution(self, content):
        """Store a fetched page and queue every link not seen before.

        Args:
            content: decoded text of one fetched page.
        """
        self._pages.insert_one({'page_content': content})

        for path in self._LINK_PATTERN.findall(content):
            url = self.SEED_URL + path
            if redis1.get(url) is None:
                redis1.set(url, self.url_num)
                self.queue.put(url)
                self.url_num += 1

    def task_manage(self):
        """Hand queued URLs to clients, up to each client's free slots.

        Each client receives only its own batch, encoded as URLs each
        followed by a single space. If sending fails, the batch is put
        back in the queue and the client is dropped.
        """
        # Iterate over a copy: a failed send removes the client from the list.
        for client in list(ConnectionList):
            if client is self.server_socket:
                continue

            free_slots = int(Client_Status[client])
            batch = []
            while free_slots > 0 and not self.queue.empty():
                batch.append(self.queue.get())
                free_slots -= 1
            if not batch:
                continue

            payload = ''.join(url + ' ' for url in batch)
            try:
                # sendall: plain send() may transmit only part of the data.
                client.sendall(payload.encode('utf8'))
            except OSError:
                for url in batch:
                    self.queue.put(url)
                self._drop_client(client)
                continue
            Client_Status[client] = free_slots


if __name__ == "__main__":
    port = 8888
    Distributed_Web_Crawler(port)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值