# coding=utf8
from socket import *
import struct, sys, string, logging, json, pdb
sys.path.append("../server/")
reload(sys)
sys.setdefaultencoding('utf8')
import os
import hashlib
import time
import gearman
from ftp_utlis import Ftp
import zipfile
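# This script runs a Gearman worker: it receives a "request_crawler" job, downloads
# the packed crawl task list over FTP, unpacks it, launches crawler.py once per task
# line, and notifies the next stage ("download_corpus_from_oss") when all tasks finish.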
GEARMAN_HOST="cloud.tmlsystem.com"
GEARMAN_PORT=9095
norm_path1=os.path.abspath('.')
norm_path2=os.path.abspath('..')
# Configure logging
logger=logging.getLogger('gearman.py')
logger.setLevel(logging.INFO)
rq=time.strftime('%Y%m%d%H%M',time.localtime(time.time()))
log_path=os.path.dirname(os.getcwd())+'/Log/'
log_name=log_path+rq+'.log'
logfile=log_name
fh=logging.FileHandler(logfile,mode='w')
fh.setLevel(logging.DEBUG)
formatter=logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)
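# Worker subclass that prints a notice when a job starts, then delegates to the
# default GearmanWorker behaviour.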
class MyGearmanWorker(gearman.GearmanWorker):
    def on_job_execute(self, current_job):
        print "Job started"
        print "===================\n"
        return super(MyGearmanWorker, self).on_job_execute(current_job)
# Unpack the compressed file downloaded from FTP
def unpacked_file(remote_path, local_path):
    z = zipfile.ZipFile(remote_path, 'r')
    z.extractall(path=local_path)
    z.close()
# Launch crawl jobs according to the contents of the unpacked file
def download_file(local_path, sha1):
    # Reset the mark file used to track completed crawl tasks
    with open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1), 'w') as w:
        w.write(' ')
    count = len(open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1), 'r').readlines())
    f = open(local_path, 'r')
    lines = f.readlines()
    # Read the file line by line (each line is one crawl task)
    for line in lines:
        count += 1
        #time.sleep(10)
        # Run crawler.py from the command line
        os.system('nohup python3 {}/crawler.py -t BaiduKeySearch -k {} -m 6 -f {} &'.format(norm_path1, str(line.strip()), str(sha1)))
        # Poll until this crawl task has finished
        while True:
            with open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1), 'r') as q:
                d = q.readlines()
            time.sleep(10)
            if len(d) == count:
                break
    f.close()
# After the crawler has finished all tasks, notify the next stage
def inform(sha1, GEARMAN_HOST, GEARMAN_PORT):
    with open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1)) as f:
        lines = f.readlines()
    lines = map(lambda x: x.strip(), lines)
    params = {'corpus_sha1': sha1, 'corpus_file_list': lines}
    worker_name = "download_corpus_from_oss"
    gm_client = gearman.GearmanClient(['%s:%s' % (GEARMAN_HOST, GEARMAN_PORT)])
    job_request = gm_client.submit_job(worker_name, json.dumps(params), background=True)
def task_callback(gearman_worker, gearman_job):
"""
*函数名 : task_callback
*函数功能描述 : gearman操作,部署规则模型
*函数参数 :
*函数返回值 :
*函数创建日期 : 2018-03-05
*函数修改日期 :
*修改人 :
"""
    # pdb.set_trace()  # debugging breakpoint, left disabled
    data = json.loads(gearman_job.data)
    remote_path = data["crawler_list_path"]  # remote path of the packed task list
    corpus_sha1 = data['corpus_sha1']
    # Local path for the packed task file downloaded from the FTP server
    time_stamp = time.time()
    local_packed_file = '{}/local_packed/{}'.format(norm_path2, str(int(time_stamp)))
    # Local path to unpack the task file into
    local_unpacked_path = '{}/local_unpacked'.format(norm_path2)
    logger.info('start downloading')
    ftp = Ftp(host='ftp.tmlsystem.com', port=8085, user='tml', pwd='mt12345PKU')
    ftp.connect_server()
    ftp.DownLoadFile(local_packed_file, remote_path)
    logger.info('start unpacking')
    unpacked_file(local_packed_file, local_unpacked_path)
    for filename in os.listdir(local_unpacked_path):
        try:
            download_file('{}/{}'.format(local_unpacked_path, filename), corpus_sha1)
        except Exception, e:
            logger.warning('download failed {}'.format(filename))
        os.remove('{}/{}'.format(local_unpacked_path, filename))
    print('start informing')
    inform(corpus_sha1, GEARMAN_HOST, GEARMAN_PORT)
    logger.info("task done!")
my_worker = MyGearmanWorker(['cloud.tmlsystem.com:9095'])
my_worker.register_task("request_crawler", task_callback)
my_worker.work()
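# A minimal sketch of how a client might submit work to this worker. The payload keys
# match what task_callback reads; the path and sha1 values below are placeholders,
# not taken from the original source.
#
#   import json, gearman
#   client = gearman.GearmanClient(['cloud.tmlsystem.com:9095'])
#   payload = {'crawler_list_path': '/remote/path/to/task_list.zip',
#              'corpus_sha1': 'example_sha1'}
#   client.submit_job('request_crawler', json.dumps(payload), background=True)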