Python:用多进程方式对各 app 自动化建模并推送上线

multiprocessing 模块同时支持本地和远程并发,它使用子进程代替线程,从而有效避开全局解释器锁(Global Interpreter Lock,GIL)带来的影响。因此,multiprocessing 模块允许程序员充分利用机器上的多个 CPU 核心。该模块在 Linux 和 Windows 上都可以运行。

HDFSAppid.py

# -*- coding: utf-8 -*-
import os
import datetime
import sys
import sys, getopt

# Date (YYYY-MM-DD) of the HDFS click directory that summary_app() lists.
# NOTE(review): the original label said "today", but the expression subtracts
# one day, i.e. it evaluates to yesterday — confirm which one is intended.
date = (datetime.datetime.now()-datetime.timedelta(1)).strftime("%Y-%m-%d")
# Module-level result list; summary_app() refreshes it in place and returns it.
app_ids = []


def summary_app():
    """Return the app ids that have a click directory on HDFS for ``date``.

    Runs ``hdfs dfs -ls`` on the click directory of ``date`` and takes the
    last path component of every listed entry as an app id. The first output
    line is skipped (``hdfs dfs -ls`` prints a "Found N items" header there).

    The module-level ``app_ids`` list is cleared and refilled on every call,
    so calling this function more than once no longer accumulates duplicate
    ids. The same list object is returned each time.

    Returns:
        list: app id strings; empty when the listing produced no entries.
    """
    command = '/data/hadoop-2.9.0/bin/hdfs dfs -ls hdfs://ip地址:端口/ctrp/click/{}'.format(date)
    # Close the pipe deterministically instead of leaking it.
    with os.popen(command) as pipe:
        listing = pipe.readlines()
    # Slice-assign so the shared module-level list object keeps its identity.
    app_ids[:] = [
        entry.strip().split(' ')[-1].split('/')[-1]
        for entry in listing[1:]
        if entry.strip()  # ignore blank lines rather than emitting '' as an id
    ]
    return app_ids

fengff_click.py,对上述app_ids的list,分别执行每个app_id,对点击率建模

# -*- coding: utf-8 -*-
import os
import datetime
import sys
import sys, getopt
import multiprocessing
from HDFSAppid import summary_app
import time

"""今天"""
# Today's date (YYYY-MM-DD). run_model uses it in the HDFS sample path and as
# the name of the per-run working directory.
date = datetime.datetime.now().strftime("%Y-%m-%d")
# Date three days before now. run_model removes the working directory named
# after this date if it exists.
last_date = (datetime.datetime.now()-datetime.timedelta(3)).strftime('%Y-%m-%d')
# Local file that receives the concatenated HDFS part-* sample files.
lib_file = 'orign.libffm'
# Shuffled copy of lib_file (written by `shuf`).
libffm_file = 'click.libffm'
# Train / eval / test files assembled from the pieces produced by `split`.
libffm_train = 'click_train.libffm'
libffm_eval = 'click_eval.libffm'
libffm_test = 'click_test.libffm'
# The string below reads: "Check whether Hadoop holds the target data; if so,
# download it to the server; if not, send an email alert."
# NOTE(review): nothing in the visible code sends an email; run_model simply
# returns False when the data is not ready.
"""判断hadoop里是否有目标数据,有:下载到服务器上;无:邮件提醒"""



def run_model(app_id):
    """Build and evaluate the click-rate FFM model for a single app id.

    Steps:
      1. Prepare ``/data/marq/fengff/<|app_id|>/click_fm/<date>/model`` and
         remove the ``last_date`` run directory if it exists.
      2. Check the HDFS sample directory for ``date``/``app_id``; continue
         only when its first listed entry is the ``_SUCCESS`` marker.
      3. Download the samples and the schema, shuffle the samples, and cut
         them into chunks of one fifth of the line count: chunk 1 is the
         eval set, chunk 2 the test set, the remaining chunks the train set.
      4. Run ``ffm-train`` and ``ffm-predict`` and write the prediction
         output as the last line of ``click_<app_id>.log``.

    Side effects: changes the current working directory of the calling
    process (finally to ``/data/fengsj/bin/libff``) and shells out to
    ``hdfs``, ``wc``, ``shuf``, ``split``, ``cat``, ``rm`` and the ffm tools.

    Args:
        app_id: app id as listed on HDFS; must be convertible with ``int()``.

    Returns:
        bool: True when training and prediction were launched; False when the
        HDFS data is missing / not marked ``_SUCCESS`` or when there are too
        few samples (fewer than 5) to split.
    """
    t1 = time.time()
    # Directory names use the absolute value of the id; HDFS paths use the raw id.
    app_dir_name = str(abs(int(app_id)))

    def run_url():
        """Create the per-app / per-date working directories if missing."""
        if not os.path.exists('/data/marq/fengff/{}'.format(app_dir_name)):
            os.chdir('/data/marq/fengff/')
            os.system('mkdir {}'.format(app_dir_name))
        if not os.path.exists('/data/marq/fengff/{}/click_fm'.format(app_dir_name)):
            os.chdir('/data/marq/fengff/{}/'.format(app_dir_name))
            os.system('mkdir click_fm')
        os.chdir('/data/marq/fengff/{}/click_fm/'.format(app_dir_name))
        os.system('mkdir {}'.format(date))
        # Drop the old run directory so disk usage stays bounded.
        if os.path.exists('/data/marq/fengff/{}/click_fm/{}'.format(app_dir_name, last_date)):
            os.system('rm -rf {}'.format(last_date))
        # Everything below relies on the cwd being today's run directory.
        os.chdir('/data/marq/fengff/{}/click_fm/{}'.format(app_dir_name, date))
        os.system('mkdir model')

    run_url()
    # The process output and results go to a per-app log so that each app can
    # later be pushed online based on its own result.
    with open('click_{}.log'.format(app_id), 'w+') as ff:
        click_dir = os.getcwd()  # working directory holding this run's data
        ff.write("当前数据的工作目录路径:"+str(click_dir)+'\n')

        with os.popen(
                '/data/hadoop-2.9.0/bin/hdfs dfs -ls hdfs://10.80.1.161:8020/ctrp/ifs-1-5-0/sample/ffm/click/{}/{}/'.format(
                    date, str(app_id))) as listing:
            file_list = listing.readlines()
        # An empty or missing directory yields no entry line. Treat it like
        # "data not ready" instead of raising IndexError, which would abort
        # the whole pool.map() for every other app.
        if len(file_list) < 2:
            return False
        # presumably the listing is sorted so that _SUCCESS comes first — verify
        target_file = file_list[1].strip().split(' ')[-1]
        if "_SUCCESS" not in target_file:
            return False

        os.system(
            '/data/hadoop-2.9.0/bin/hdfs dfs -cat hdfs://10.80.1.161:8020/ctrp/ifs-1-5-0/sample/ffm/click/{}/{}/part-* > {}'.format(
                date, str(app_id), os.path.join(click_dir, lib_file)))
        os.system(
            '/data/hadoop-2.9.0/bin/hdfs dfs -cat hdfs://10.80.1.161:8020/ctrp/ifs-1-5-0/sample/ffm/click/{}/{}_schema.json > {}/click_schema.json'.format(
                date, str(app_id), os.path.join(click_dir, 'model')))
        with os.popen('wc -l {}'.format(lib_file)) as counter:
            lines = counter.readlines()
        ff.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'\n')
        ff.write("点击率全样本共有{}行".format(str(lines))+'\n')
        os.system('shuf {} -o {}'.format(lib_file, libffm_file))
        # `wc -l` prints "<count> <file>"; keep the count only.
        total = int(''.join(lines).split(' ')[0])
        b = int(total // 5)
        ff.write("每等份有{}行".format(str(b)) + '\n')
        # Fewer than 5 samples gives a chunk size of 0, which `split`
        # rejects; there is nothing meaningful to train on.
        if b == 0:
            return False

        # Chunks of b lines: xaa..xae, plus xaf holding the remainder when
        # the line count is not a multiple of 5.
        os.system('split -{} {}'.format(b, libffm_file))
        os.system('cat xaa > {}'.format(libffm_eval))
        os.system('cat xab > {}'.format(libffm_test))
        os.system('cat xac xad xae xaf > {}'.format(libffm_train))
        os.system('rm -f xaa xab xac xad xae xaf')

        os.chdir("/data/fengsj/bin/libff")  # switch to the directory holding the ffm binaries
        os.system(
            './ffm-train -l 0.00001 -k 8 -t 50 -r 0.05 -c 0.05 -s 24 -autostop 4 -p {} {} {}/clkoutput.txt'.format(
                os.path.join(click_dir, libffm_eval), os.path.join(click_dir, libffm_train),
                os.path.join(click_dir, 'model')))
        with os.popen('./ffm-predict {} {}/clkoutput.txt {}/predict_output.txt {} 0.05'.format(
                os.path.join(click_dir, libffm_test), os.path.join(click_dir, 'model'),
                os.path.join(click_dir, 'model'), str(b))) as predictor:
            # Reading to EOF waits for ffm-predict to finish, so the elapsed
            # time below covers the prediction step as well.
            predict_output = predictor.readlines()
        t2 = time.time()
        ff.write("本APP_id执行时间为:"+str(int(t2-t1)) + 's' +'\n')
        # Must stay the LAST line of the log: downstream parsing reads it.
        ff.write(str(predict_output))
        return True


def main():
    """Model every app id found on HDFS in parallel and report the wall time.

    The app ids come from summary_app(); each one is handed to run_model in a
    pool of 20 worker processes.
    """
    targets = summary_app()
    print("app_ids:", targets)
    started = time.time()
    # Pool of 20 worker processes.
    workers = multiprocessing.Pool(processes=20)
    workers.map(run_model, targets)
    # Stop accepting new tasks, then block until every worker has exited.
    workers.close()
    workers.join()
    elapsed = int(time.time() - started)
    print('并行执行时间:{}s'.format(str(elapsed)))


if __name__ == '__main__':
    # Original note: 0 is the name of the script being run, 1 onward are the
    # input arguments; command-line example:
    #   Python fengff_imp.py -i inputfile -o outputfile
    # NOTE(review): main() takes no arguments and nothing visible here parses
    # the command line, so this note looks stale — confirm.
    main()

submitmodel.py:针对上述各 APP 的建模情况,取各自最后的效果数据。这里需要注意自定义变量的用法。

import sys
from HDFSAppid import summary_app

def _parse_metrics(log_path):
    """Parse the last line of a model log into a ``{metric: value}`` dict.

    The last log line is the ``str()`` of a list of output lines. Each
    element looks like ``'<name> = <value>...'``. The final element is not a
    metric and is dropped.
    """
    with open(log_path) as log:
        last_line = log.readlines()[-1]
    # Take the text after the last '[', split it on commas and drop the
    # final element (the closing sentence of the tool output).
    items = last_line.split('[')[-1].split(',')[0:-1]
    metrics = {}
    for item in items:
        parts = item.split(' = ')
        target = parts[0].split("'")[1]
        if target == 'oe':
            # For 'oe', keep only the first token after the first ' = '.
            value = parts[1].split(' ')[0]
        else:
            value = parts[-1]
        metrics[target] = value
    return metrics


# NOTE(review): `ddir` (base directory) and `date` are not defined in the
# visible part of this script; they must be defined before this point.
app_ids = summary_app()
for app_id in app_ids:
    # Publish one dict per app as a module-level name (clkmap<N> / impmap<N>),
    # as before, but through globals() instead of exec().
    globals()['clkmap{}'.format(abs(int(app_id)))] = {}
    globals()['impmap{}'.format(abs(int(app_id)))] = {}

for app_id in app_ids:
    app_key = abs(int(app_id))
    clk_metrics = globals()['clkmap{}'.format(app_key)]
    imp_metrics = globals()['impmap{}'.format(app_key)]

    clk_metrics.update(_parse_metrics(
        '{}/{}/click_fm/{}/click_{}.log'.format(ddir, str(app_key), date, str(app_id))))
    imp_metrics.update(_parse_metrics(
        '{}/{}/req-imp/{}/imp_{}.log'.format(ddir, str(app_key), date, str(app_id))))

    print('app_id为{}的点击率指标为:{}'.format(app_id, clk_metrics))
    print('app_id为{}的曝光率指标为:{}'.format(app_id, imp_metrics))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值