multiprocessing
模块,同时提供本地和远程并发,使用子进程代替线程,有效避免Global Interpreter Lock
带来的影响。因此, multiprocessing
模块允许程序员充分利用机器上的多个核心。Linux 和 Windows 上都可以运行。
HDFSAppid.py
# -*- coding: utf-8 -*-
import os
import datetime
import sys
import sys, getopt
"""今天"""
date = (datetime.datetime.now()-datetime.timedelta(1)).strftime("%Y-%m-%d")
app_ids = []
"""判断hadoop里当天有哪些APP_id,组成list"""
def summary_app():
f = os.popen('/data/hadoop-2.9.0/bin/hdfs dfs -ls hdfs://ip地址:端口/ctrp/click/{}'.
format(date))
file_list = f.readlines()
for file in file_list[1:]:
app_ids.append(file.strip().split(' ')[-1].split('/')[-1])
return app_ids
fengff_click.py:对上述 app_ids 列表中的每个 app_id 分别执行建模流程,对点击率建模
# -*- coding: utf-8 -*-
import os
import datetime
import sys
import sys, getopt
import multiprocessing
from HDFSAppid import summary_app
import time
"""今天"""
date = datetime.datetime.now().strftime("%Y-%m-%d")
last_date = (datetime.datetime.now()-datetime.timedelta(3)).strftime('%Y-%m-%d')
lib_file = 'orign.libffm'
libffm_file = 'click.libffm'
libffm_train = 'click_train.libffm'
libffm_eval = 'click_eval.libffm'
libffm_test = 'click_test.libffm'
"""判断hadoop里是否有目标数据,有:下载到服务器上;无:邮件提醒"""
def run_model(app_id):
t1 = time.time()
# 判断路径下是否有相关文件,没有的话,新建,否则,不新建
def run_url():
if os.path.exists('/data/marq/fengff/{}'.format(str(abs(int(app_id))))) == False:
dir = os.chdir('/data/marq/fengff/')
os.system('mkdir {}'.format(str(abs(int(app_id)))))
if os.path.exists('/data/marq/fengff/{}/click_fm'.format(str(abs(int(app_id))))) == False:
v_dir = os.chdir('/data/marq/fengff/{}/'.format(str(abs(int(app_id)))))
os.system('mkdir click_fm')
val_dir = os.chdir('/data/marq/fengff/{}/click_fm/'.format(str(abs(int(app_id))))) # 把当前工作目录切换到dirname下
os.system('mkdir {}'.format(date))
if os.path.exists('/data/marq/fengff/{}/click_fm/{}'.format(str(abs(int(app_id))),last_date)) == True:
os.system('rm -rf {}'.format(last_date))
data_dir = os.chdir('/data/marq/fengff/{}/click_fm/{}'.format(str(abs(int(app_id))),date)) # 把当前工作目录切换到dirname下
os.system('mkdir model')
run_url()
# 对点击率执行的过程及结果输出到相应的文件里,方便后续对每个APP_id根据结果,进行上线。
with open ('click_{}.log'.format(app_id),'w+') as ff:
click_dir = os.getcwd() # 当前数据的工作目录路径
ff.write("当前数据的工作目录路径:"+str(click_dir)+'\n')
f = os.popen(
'/data/hadoop-2.9.0/bin/hdfs dfs -ls hdfs://10.80.1.161:8020/ctrp/ifs-1-5-0/sample/ffm/click/{}/{}/'.format(
date,str(app_id)))
file_list = f.readlines()
target_file = file_list[1].strip().split(' ')[-1]
if "_SUCCESS" in target_file:
os.system(
'/data/hadoop-2.9.0/bin/hdfs dfs -cat hdfs://10.80.1.161:8020/ctrp/ifs-1-5-0/sample/ffm/click/{}/{}/part-* > {}'.format(
date,str(app_id),os.path.join(click_dir, lib_file)))
os.system(
'/data/hadoop-2.9.0/bin/hdfs dfs -cat hdfs://10.80.1.161:8020/ctrp/ifs-1-5-0/sample/ffm/click/{}/{}_schema.json > {}/click_schema.json'.format(
date,str(app_id),os.path.join(click_dir,'model')))
f = os.popen('wc -l {}'.format(lib_file))
lines = f.readlines()
ff.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'\n')
ff.write("点击率全样本共有{}行".format(str(lines))+'\n')
os.system('shuf {} -o {}'.format(lib_file, libffm_file))
a = ''.join(lines)
a = int(a.split(' ')[0])
b = int(a // 5)
ff.write("每等份有{}行".format(str(b)) + '\n')
os.system('split -{} {}'.format(b, libffm_file))
os.system('cat xaa > {}'.format(libffm_eval))
os.system('cat xab > {}'.format(libffm_test))
os.system('cat xac xad xae xaf > {}'.format(libffm_train))
os.system('rm -f xaa xab xac xad xae xaf')
libff_dir = os.chdir("/data/fengsj/bin/libff") # 把当前工作目录切换到模型路径下
os.system(
'./ffm-train -l 0.00001 -k 8 -t 50 -r 0.05 -c 0.05 -s 24 -autostop 4 -p {} {} {}/clkoutput.txt'.format(
os.path.join(click_dir, libffm_eval), os.path.join(click_dir, libffm_train),
os.path.join(click_dir, 'model')))
f = os.popen('./ffm-predict {} {}/clkoutput.txt {}/predict_output.txt {} 0.05'.format(
os.path.join(click_dir, libffm_test), os.path.join(click_dir, 'model'),
os.path.join(click_dir, 'model'), str(b)))
t2 = time.time()
ff.write("本APP_id执行时间为:"+str(int(t2-t1)) + 's' +'\n')
ff.write(str(f.readlines()))
f.close()
return True
else:
ff.close()
return False
def main():
    """Build the click-rate model for every app id in parallel.

    Fetches the day's app ids from HDFS, fans run_model out over a pool of
    20 worker processes, and reports the total wall-clock time.
    """
    ids = summary_app()
    print("app_ids:", ids)
    started = time.time()
    worker_pool = multiprocessing.Pool(processes=20)  # 20-process worker pool
    worker_pool.map(run_model, ids)
    worker_pool.close()  # no further tasks will be submitted
    worker_pool.join()   # block until every worker process has exited
    finished = time.time()
    print('并行执行时间:{}s'.format(str(int(finished - started))))
if __name__ == '__main__':
    # argv[0] is the script name, argv[1:] would be inputs.  Original CLI
    # example: Python fengff_imp.py -i inputfile -o outputfile
    # NOTE(review): no arguments are actually parsed despite the getopt import.
    main()
submitmodel.py:对上述各 APP 的建模情况,分别取其日志中最后的效果数据。注意:这里的 ddir、date 是需要自行定义的变量。
import sys
from HDFSAppid import summary_app
app_ids = summary_app()

# Per-app metric maps keyed by abs(int(app_id)).  This replaces the original
# exec/eval-built dynamic variables (clkmap<NNN>, impmap<NNN>): fabricating
# variable names with exec is an anti-pattern, and running exec/eval on ids
# that come from an external system (HDFS paths) is unsafe.
clkmaps = {}
impmaps = {}

def _parse_metrics(log_path):
    """Parse the last line of a model log into a {metric_name: value} dict.

    The last line contains a bracketed, comma-separated list of
    "'name' = value" items; the final comma-separated chunk is the log's
    closing message ("wantlsd from heart!") and is dropped.  For the 'oe'
    metric only the first whitespace-separated token of the value is kept.
    NOTE(review): format inferred from the original parsing code — confirm
    against an actual log file.
    """
    with open(log_path) as f:
        last_line = f.readlines()[-1]
    metrics = {}
    for item in last_line.split('[')[-1].split(',')[0:-1]:
        target = item.split(' = ')[0].split("'")[1]
        if target == 'oe':
            value = item.split(' = ')[1:][0].split(' ')[0]
        else:
            value = item.split(' = ')[-1]
        metrics[target] = value
    return metrics

# NOTE(review): `ddir` and `date` are not defined in this script — they must
# be supplied before this runs (the accompanying text calls them out as
# user-defined variables).
for app_id in app_ids:
    key = abs(int(app_id))
    clkmaps[key] = _parse_metrics(
        '{}/{}/click_fm/{}/click_{}.log'.format(ddir, str(key), date, str(app_id)))
    impmaps[key] = _parse_metrics(
        '{}/{}/req-imp/{}/imp_{}.log'.format(ddir, str(key), date, str(app_id)))
    print('app_id为{}的点击率指标为:{}'.format(app_id, clkmaps[key]))
    print('app_id为{}的曝光率指标为:{}'.format(app_id, impmaps[key]))