# coding=utf8
from socket import *
import struct, sys, string, logging, json, pdb
sys.path.append("../server/")
reload(sys)
sys.setdefaultencoding('utf8')
import os
import hashlib
import time
import gearman
from ftp_utlis import Ftp
import zipfile
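# This script runs a Gearman worker: it receives a "request_crawler" job, downloads
# the packed crawl task list over FTP, unpacks it, launches crawler.py once per task
# line, and notifies the next stage ("download_corpus_from_oss") when all tasks finish.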
GEARMAN_HOST="cloud.tmlsystem.com"
GEARMAN_PORT=9095
norm_path1=os.path.abspath('.')
norm_path2=os.path.abspath('..')
# Configure logging
logger=logging.getLogger('gearman.py')
logger.setLevel(logging.INFO)
rq=time.strftime('%Y%m%d%H%M',time.localtime(time.time()))
log_path=os.path.dirname(os.getcwd())+'/Log/'
log_name=log_path+rq+'.log'
logfile=log_name
fh=logging.FileHandler(logfile,mode='w')
fh.setLevel(logging.DEBUG)
formatter=logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)
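# Worker subclass that prints a notice when a job starts, then delegates to the
# default GearmanWorker behaviour.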
class MyGearmanWorker(gearman.GearmanWorker):
    def on_job_execute(self, current_job):
        print "Job started"
        print "===================\n"
        return super(MyGearmanWorker, self).on_job_execute(current_job)
# Unpack the compressed file downloaded from FTP
def unpacked_file(remote_path, local_path):
    z = zipfile.ZipFile(remote_path, 'r')
    z.extractall(path=local_path)
    z.close()
# Launch crawl jobs according to the contents of the unpacked file
def download_file(local_path, sha1):
    # Reset the mark file used to track completed crawl tasks
    with open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1), 'w') as w:
        w.write(' ')
    count = len(open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1), 'r').readlines())
    f = open(local_path, 'r')
    lines = f.readlines()
    # Read the file line by line (each line is one crawl task)
    for line in lines:
        count += 1
        #time.sleep(10)
        # Run crawler.py from the command line
        os.system('nohup python3 {}/crawler.py -t BaiduKeySearch -k {} -m 6 -f {} &'.format(norm_path1, str(line.strip()), str(sha1)))
        # Poll until this crawl task has finished
        while True:
            with open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1), 'r') as q:
                d = q.readlines()
            time.sleep(10)
            if len(d) == count:
                break
    f.close()
# After the crawler has finished all tasks, notify the next stage
def inform(sha1, GEARMAN_HOST, GEARMAN_PORT):
    with open('{}/crawled_data/bdks_mark/{}'.format(norm_path1, sha1)) as f:
        lines = f.readlines()
    lines = map(lambda x: x.strip(), lines)
    params = {'corpus_sha1': sha1, 'corpus_file_list': lines}
    worker_name = "download_corpus_from_oss"
    gm_client = gearman.GearmanClient(['%s:%s' % (GEARMAN_HOST, GEARMAN_PORT)])
    job_request = gm_client.submit_job(worker_name, json.dumps(params), background=True)
def task_callback(gearman_worker, gearman_job):
"""
*函数名 : task_callback
*函数功能描述 : gearman操作,部署规则模型
*函数参数 :
*函数返回值 :
*函数创建日期 : 2018-03-05
*函数修改日期 :
*修改人 :
"""
    # pdb.set_trace()  # debugging breakpoint, left disabled
    data = json.loads(gearman_job.data)
    remote_path = data["crawler_list_path"]  # remote path of the packed task list
    corpus_sha1 = data['corpus_sha1']
    # Local path for the packed task file downloaded from the FTP server
    time_stamp = time.time()
    local_packed_file = '{}/local_packed/{}'.format(norm_path2, str(int(time_stamp)))
    # Local path to unpack the task file into
    local_unpacked_path = '{}/local_unpacked'.format(norm_path2)
    logger.info('start downloading')
    ftp = Ftp(host='ftp.tmlsystem.com', port=8085, user='tml', pwd='mt12345PKU')
    ftp.connect_server()
    ftp.DownLoadFile(local_packed_file, remote_path)
    logger.info('start unpacking')
    unpacked_file(local_packed_file, local_unpacked_path)
    for filename in os.listdir(local_unpacked_path):
        try:
            download_file('{}/{}'.format(local_unpacked_path, filename), corpus_sha1)
        except Exception, e:
            logger.warning('download failed {}'.format(filename))
        os.remove('{}/{}'.format(local_unpacked_path, filename))
    print('start informing')
    inform(corpus_sha1, GEARMAN_HOST, GEARMAN_PORT)
    logger.info("task done!")
my_worker = MyGearmanWorker(['cloud.tmlsystem.com:9095'])
my_worker.register_task("request_crawler", task_callback)
my_worker.work()
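# A minimal sketch of how a client might submit work to this worker. The payload keys
# match what task_callback reads; the path and sha1 values below are placeholders,
# not taken from the original source.
#
#   import json, gearman
#   client = gearman.GearmanClient(['cloud.tmlsystem.com:9095'])
#   payload = {'crawler_list_path': '/remote/path/to/task_list.zip',
#              'corpus_sha1': 'example_sha1'}
#   client.submit_job('request_crawler', json.dumps(payload), background=True)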