机器学习_模型调用预测

概要

本文将介绍模型调用预测的步骤,这里模型使用的是LGB

代码

导包

# Imports for the whole script. `import datetime` was missing in the
# original even though `datetime.datetime` is used throughout; `time`,
# `numpy` and `pandas` are used further down the script as well.
import datetime
import time

from datetime import date, timedelta

import numpy as np
import pandas as pd

# day1 = first day of the current month, formatted yyyymmdd.
# This is the partition key for the month being predicted.
now = datetime.datetime.now()
day1 = datetime.datetime(now.year, now.month, 1)
day1 = day1.strftime("%Y%m%d")

print(day1)  # e.g. 20240201

设置时间参数

# Compute last month's first and last calendar day as yyyymmdd strings.
# Stepping one day back from the first of the current month lands on the
# last day of the previous month, which also pins down its year/month.
current = datetime.datetime.now()
first_of_month = datetime.datetime(current.year, current.month, 1)
prev_last_day = first_of_month - timedelta(days=1)
last_month_start = datetime.datetime(prev_last_day.year, prev_last_day.month, 1).strftime("%Y%m%d")
last_month_end = prev_last_day.strftime("%Y%m%d")

print(last_month_start, last_month_end)     # e.g. 20240101 20240131
print("预测的月,取月初:", day1)              # e.g. 20240201
print("上个月月初:     ", last_month_start)  # e.g. 20240101
print("上个月月末:     ", last_month_end)    # e.g. 20240131

读取数据

# Build the scoring population for the prediction month:
#   - base rows come from ku.lstable at dt = day1 (prediction month start),
#     restricted via inner join to customers present in kb.cust_info at
#     last month end whose ka_type is non-empty;
#   - left join adds duartion_is_lm per customer: 1 if last month's summed
#     duartion exceeds 60, else 0; missing customers become 0 via nvl;
#   - the outer select adds rn = row_number() ordered by customer_id desc,
#     used later to read the table back in ~1M-row batches.
# NOTE(review): `a.*(除customer_id)` is pseudo-SQL meaning "all of a's
# columns except customer_id" — it will not parse as written; replace it
# with an explicit column list before running.
# NOTE(review): assumes a global `spark` session is provided by the
# runtime (e.g. a PySpark shell / notebook) — confirm.
project = spark.sql(f"""select *
                            ,row_number() over(order by customer_id desc) as rn
                      from
                      (
                          select a.cust_no
                                  ,a.*(除customer_id)
                                  ,nvl(duartion_is_lm, 0) as duartion_is_lm
                                  ,{day1} as dt
                          from
                          (
                              select t1.*
                              from ku.lstable t1
                              inner join
                              (
                                  select customer_id
                                  from kb.cust_info
                                  where dt = {last_month_end} and ka_type != ''
                              ) t2 on t1.customer_id = t2.customer_id
                              where t1.dt = {day1}
                          ) a
                          left join
                          (
                              select customer_id
                                    ,case when sum(duartion) > 60 then 1 else 0 end as duartion_is_lm
                              from ku.duartiontable
                              where dt between {last_month_start} and {last_month_end}
                              group by customer_id
                          ) b on a.customer_id = b.customer_id
                      )""")
# Peek at the first rows to sanity-check the projection.
project.show(3)

获取跑的批数

# Persist the scoring population, then derive how many ~1M-row batches
# are needed to score it (max(rn) / 1,000,000, rounded up).
project.write.format("hive").mode("overwrite").saveAsTable("ku.cunchu")
# NOTE: the original had `del ls` here, but `ls` was never defined and
# the statement raised NameError — removed.
rn = spark.sql("""select max(rn) as max_rn from ku.cunchu""")
rn.show(3)
rn_data = rn.toPandas()
del rn
# Take the scalar max_rn instead of the raw 2-D ndarray: int() on a
# 1-element array relies on deprecated implicit conversion.
rn_data_num = int(np.ceil(rn_data["max_rn"].iloc[0] / 1000000))  # 间隔1百万跑一次
del rn_data
print("最多次数:", rn_data_num)  # e.g. 12

导入模型

import time  # was missing: time.time() is used below and at the end of the loop

import joblib

# Load the trained LightGBM model serialized with joblib.
# NOTE(review): joblib.load unpickles — only load model files from a
# trusted source.
model_joblib = joblib.load("project_joblib.pkl")
t1 = time.time()  # start timing the prediction stage

模型预测

# Score the population in ~1M-row batches and append each batch's
# predictions to ku.cunchu_result.
#
# Fixes vs. the original:
#   - the `num == 1` special case duplicated the whole body; the general
#     `between` range already yields rn 1..1000000 for num == 1, so the
#     if/else collapses to one body;
#   - `select from` was missing `*` (first branch);
#   - `.toPanas()` typo -> `.toPandas()`;
#   - pd.to_numeric keyword is `errors=`, not `error=`;
#   - y_pred got a fresh RangeIndex while df2 kept gaps from dropna,
#     so concat(axis=1) misaligned rows — reuse df2's index;
#   - the final print was missing the f-prefix, so the elapsed time was
#     never interpolated.
for num in range(1, rn_data_num + 1):
    print("开始第%d个" % num)
    # rn between 1000000*(num-1)+1 and 1000000*num selects batch `num`.
    data_rn = spark.sql(f"""select * from ku.cunchu where rn between 1000000*{num-1}+1 and 1000000*{num}""")
    df2 = data_rn.toPandas()
    del data_rn
    df2 = df2.drop(['rn'], axis=1)
    # Coerce non-numeric customer_ids to NaN, then drop those rows.
    df2['customer_id'] = pd.to_numeric(df2['customer_id'], errors="coerce")
    df2 = df2.dropna(subset=['customer_id'])
    for col in df2.columns:
        df2[col] = df2[col].astype("int")
    # Features = everything except the id and the partition key.
    X = df2.drop(['customer_id', 'dt'], axis=1)
    y_pred = model_joblib.predict(X)
    # Reuse df2's index so concat aligns predictions with their rows.
    y_pred = pd.DataFrame(y_pred, index=df2.index)

    result = pd.concat((df2, y_pred), axis=1)
    del df2
    result.rename({0: u'y_pred'}, axis=1, inplace=True)
    result = result.fillna(0)
    result = result[result["customer_id"] != 0]
    for col in result.columns:
        result[col] = result[col].astype("int")
    spark_df = spark.createDataFrame(result)
    del result
    spark_df.write.format("hive").mode("append").saveAsTable("ku.cunchu_result")
    del spark_df
    print("第%d个跑完" % num)
print(f"预测建表跑了 {time.time()-t1:.05f}s")

结果存储

# Count predictions per class for the prediction month.
df_pjzb = spark.sql(f"""select cust_no, y_pred from ku.cunchu_result where dt = {day1}""")
# NOTE(review): the original aggregated {"customer_id": "count"}, but
# customer_id is not in the projection above, which fails analysis.
# Count a projected column instead (any non-null column gives row counts).
df_pjzb2 = df_pjzb.groupBy("y_pred").agg({"cust_no": "count"}).toDF("y_pred", "cou")

查看结果分布

# Inspect the class distribution of the predictions.
df_pjzb2.show()
# Sample output:
# y_pred cou
# 0      9000
# 1      3000