- Solves remote management of URL-consumer spiders through Scrapyd: starting them, checking their status, shutting them down, counting running jobs, and so on
- Wraps the Scrapyd JSON API in a small Python class
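
The wrapper only touches a handful of Scrapyd's JSON API endpoints. For orientation, here is a minimal sketch of the raw calls it builds on (the host address is a placeholder; `project` and `spider` match the defaults used below):

```python
import requests

host = 'http://192.168.11.1:6800'   # one Scrapyd instance (placeholder address)
project, spider = 'yuqing_spider', 'consumer_spider'

# Liveness: any 2xx response from the web root means the daemon is up.
requests.get(host, timeout=1)

# Schedule a run; the response JSON carries the new job's 'jobid'.
requests.post(f'{host}/schedule.json', data={'project': project, 'spider': spider})

# Pending/running/finished jobs for a project.
requests.get(f'{host}/listjobs.json', params={'project': project})

# Cancel a running job by id.
requests.post(f'{host}/cancel.json', data={'project': project, 'job': '<jobid>'})
```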
```python
import requests
import json


class Scrapyd:
    """A thin wrapper around the Scrapyd JSON API for a pool of servers."""

    def __init__(self, server_list):
        self.server_list, _ = self.update_server(server_list)

    def update_server(self, server_list):
        """Probe each host:port and keep only the reachable Scrapyd daemons."""
        stop_list = []
        self.server_list = []
        for server in server_list:
            try:
                r = requests.get(f'http://{server}', timeout=1)
                r.raise_for_status()
            except requests.RequestException:
                stop_list.append(server)
            else:
                self.server_list.append(server)
        print(f"scrapyd running: {self.server_list}, stop: {stop_list}")
        return self.server_list, stop_list

    def server(self):
        return self.server_list

    def spider_list(self, project='yuqing_spider'):
        """Fetch running/pending jobs for the project from every live server."""
        result_dict = {}
        for server in self.server_list:
            res = requests.get(f'http://{server}/listjobs.json', params={'project': project})
            if res.status_code == 200:
                res_dict = json.loads(res.text)
                result_dict[server] = {
                    'node_name': res_dict['node_name'],
                    'running': res_dict['running'],
                    'pending': res_dict['pending'],
                }
        return result_dict

    def spider2server(self, project='yuqing_spider'):
        """Map each running job id to the server it runs on."""
        jobs_dict = {}
        for server, jobs in self.spider_list(project=project).items():
            jobs_dict.update({job['id']: server for job in jobs['running']})
        return jobs_dict

    def server2spider(self, project='yuqing_spider'):
        """Map each server to the ids of the jobs running on it."""
        return {server: [job['id'] for job in jobs['running']]
                for server, jobs in self.spider_list(project=project).items()}

    def spider_count(self, server=':'):
        """Count running jobs, filtered by a host substring.

        The default ':' occurs in every 'host:port' key, so it counts all servers.
        """
        return sum(len(jobs['running'])
                   for host, jobs in self.spider_list().items() if server in host)

    def spider_node_name(self):
        """Node names reported by every live server."""
        return [jobs['node_name'] for jobs in self.spider_list().values()]

    def spider_start(self, count=1, project='yuqing_spider', spider='consumer_spider'):
        """Schedule `count` jobs round-robin over the live servers.

        The index only advances on success, so a failed request retries the
        same server; the loop bound caps the total number of attempts.
        Returns {jobid: server} for every job that was scheduled.
        """
        result_dict = {}
        post_data = {'project': project, 'spider': spider}
        start_count = 0
        for _ in range(len(self.server_list) * count):
            server = self.server_list[start_count % len(self.server_list)]
            res = requests.post(f'http://{server}/schedule.json', data=post_data)
            if res.status_code == 200:
                result_dict[json.loads(res.text)['jobid']] = server
                start_count += 1
                if start_count >= count:
                    break
        return result_dict

    def spider_close(self, job_id_list=None, project='yuqing_spider'):
        """Cancel the given job ids; with no argument, cancel every running job.

        A mutable default argument ([]) is avoided on purpose; unknown ids and
        failed cancellations both end up in lose_list.
        """
        ok_list, lose_list = [], []
        jobs_dict = self.spider2server(project=project)
        if not job_id_list:
            job_id_list = list(jobs_dict.keys())
        for job_id in job_id_list:
            if jobs_dict.get(job_id):
                post_data = {'project': project, 'job': job_id}
                res = requests.post(f'http://{jobs_dict[job_id]}/cancel.json', data=post_data)
                if res.status_code == 200:
                    ok_list.append(job_id)
                    continue
            lose_list.append(job_id)
        return ok_list, lose_list
```
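
The three inspection helpers only reshape `listjobs.json` responses. Roughly, with made-up hosts and job ids:

```python
s = Scrapyd(['192.168.11.1:6800'])

s.spider_list()
# -> {'192.168.11.1:6800': {'node_name': 'node-1',
#                           'running': [{'id': 'ab12cd', 'spider': 'consumer_spider'}],
#                           'pending': []}}

s.spider2server()  # -> {'ab12cd': '192.168.11.1:6800'}    job id -> server
s.server2spider()  # -> {'192.168.11.1:6800': ['ab12cd']}  server -> job ids
```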
A quick smoke test of server discovery and the job mapping:

```python
if __name__ == '__main__':
    s = Scrapyd(['192.168.11.1:6800'])
    s.update_server(['192.168.11.1:6800', '192.168.11.19:6800'])
    print(s.spider2server())
```
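
`spider_start` and `spider_close` are not exercised above; a sketch of a full start/count/cancel cycle (the hosts and the job count are illustrative):

```python
s = Scrapyd(['192.168.11.1:6800', '192.168.11.19:6800'])

# Launch 4 consumer jobs, spread round-robin over the live servers.
jobs = s.spider_start(count=4, spider='consumer_spider')
print(jobs)                            # {'<jobid>': '192.168.11.1:6800', ...}

print(s.spider_count())                # all running jobs
print(s.spider_count('192.168.11.1'))  # jobs on one host only

# Cancel everything that is still running.
ok, lose = s.spider_close()
print(f"cancelled: {ok}, failed: {lose}")
```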
The URL-consumer Scrapy spider itself is referenced separately.
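
Since that reference is not included here, this is only a guess at the consumer's shape: a spider that drains URLs from a queue filled by a producer. The Redis host, key name, and parse logic below are all illustrative assumptions.

```python
# Hypothetical consumer: pops URLs off a Redis list until it is empty.
import redis
import scrapy


class ConsumerSpider(scrapy.Spider):
    name = 'consumer_spider'

    def start_requests(self):
        r = redis.Redis(host='192.168.11.1', port=6379)  # placeholder host
        while True:
            url = r.lpop('yuqing:start_urls')            # hypothetical queue key
            if url is None:
                break                                    # queue drained, job finishes
            yield scrapy.Request(url.decode(), callback=self.parse)

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}
```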