Imputing Missing Values for Outlier Detection with the AutoML Tool HyperGBM (with an Example from the 2022 National Service Outsourcing Competition)

  We use problem A03 from the 2022 National Service Outsourcing Competition as the running example for the imputation process.
  The task is to find sales and price anomalies among commodities. We are given four months of commodity data (over 17 million rows) and four months of shop data (over 600,000 rows); the judging criteria emphasize time and space complexity as well as detection rate and precision. We use shop-level analysis to corroborate commodity-level anomalies and so improve confidence and precision. However, the shops' main-business field has many missing values, which seriously affects the derived-variable computation that follows.
  Shop data (partial) download: https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ (extraction code: jhnb)
  In my view, missing-value imputation is essentially a prediction problem. Since random forests performed poorly here, we use the AutoML tool HyperGBM (Chinese documentation: https://hypergbm.readthedocs.io/zh_CN/latest/example.html) to search roughly ten candidate models and keep the best one for predicting the missing values. Concretely this is a classification task: we predict a shop's main business (MAIN_BUSINESS) from its business scope (BUSINESS_SCOPE). Because business scopes vary in length, we split the dataset into 13 parts by scope length and train a separate model on each part. In the overall workflow this is the first step, missing-value imputation:
[Figure: overall workflow; missing-value imputation is step one]
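
Before the full script, here is a minimal sketch of the HyperGBM call pattern it relies on (the file name toy_train.csv is a placeholder; target, reward_metric and cv mirror the real call further down):

import pickle
from hypergbm import make_experiment

# make_experiment accepts a CSV path; HyperGBM then searches its candidate
# models and returns the best pipeline it finds.
experiment = make_experiment("toy_train.csv", target="MAIN_BUSINESS",
                             reward_metric="accuracy", cv=True)
estimator = experiment.run()

with open("model.pkl", "wb") as f:
    pickle.dump(estimator, f)   # persist the winning pipeline for reuse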

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from hypergbm import make_experiment
import os
import pickle

def normalize(file, group):
    # Min-max scale the columns selected by `group` (kept for optional scaling; not called in main below)
    min_max = MinMaxScaler(feature_range=(0, 1))
    ret = file
    ret[file.columns[group]] = min_max.fit_transform(file[file.columns[group]])
    return ret


def make_number(df_file):
    df_file = df_file.copy()    # avoid mutating the caller's slice
    # Build a frequency-ordered code book for MAIN_BUSINESS (most frequent label -> code 0)
    dic_main_bussiness_labels = df_file.value_counts("MAIN_BUSINESS").to_dict()
    no = 0
    for key in dic_main_bussiness_labels:
        dic_main_bussiness_labels[key] = no
        no += 1
    df_file["MAIN_BUSINESS"] = [dic_main_bussiness_labels[name] for name in df_file["MAIN_BUSINESS"]]   # label -> code

    list_main_scope = list()    # per-row list of scope names
    set_main_scope = set()      # every distinct scope name, numbered below
    for name in df_file["BUSINESS_SCOPE"]:
        new = str(name).split(",")   # str() guards against non-string cells (some rows hold floats/NaN)
        while "" in new:             # drop empty entries left by stray commas
            new.remove("")
        list_main_scope.append(new)
        set_main_scope.update(new)
    dic_mainscope_labels = {name: i for i, name in enumerate(set_main_scope)}   # scope name -> code
    list_num_main_scope = list()    # encoded scope vector per row
    count = list()                  # scope length per row; fill_null splits the data on this
    for item in list_main_scope:
        temp = [dic_mainscope_labels[bussiness] for bussiness in item]
        list_num_main_scope.append(temp)
        count.append(len(temp))

    df_file["BUSINESS_SCOPE"] = list_num_main_scope
    df_file["count"] = count

    return df_file.drop(columns="Unnamed: 0"), dic_main_bussiness_labels, dic_mainscope_labels  # drop the stray index column
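
# Illustrative example of make_number (hypothetical values; the scope codes
# depend on set iteration order and will differ between runs):
#   input : MAIN_BUSINESS  = ["food", "clothing", "food"]
#           BUSINESS_SCOPE = ["snacks,produce", "menswear", "snacks"]
#   output: MAIN_BUSINESS  = [0, 1, 0]           # "food" is most frequent -> code 0
#           BUSINESS_SCOPE = [[0, 1], [2], [0]]  # one integer code per scope name
#           count          = [2, 1, 1]           # scope lengths; fill_null splits on this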


def make_number_Nan(df_Nan, dict_Scope):    # encode the rows whose MAIN_BUSINESS is missing
    df_Nan = df_Nan.copy()                  # avoid mutating the caller's slice
    df_Nan.index = range(df_Nan.shape[0])   # reset to a positional index
    list_main_scope = []
    for name in df_Nan["BUSINESS_SCOPE"]:
        new = str(name).split(",")          # str() guards against non-string cells
        list_main_scope.append(new)
    list_num_main_scope = list()
    count = list()
    hema_index = list()         # rows whose scope contains "盒马" (Hema), unseen in training
    for i, item in enumerate(list_main_scope):
        temp = []
        for bussiness in item:
            try:
                temp.append(dict_Scope[bussiness])
            except KeyError:                # scope name absent from the training code book, i.e. Hema
                if bussiness != "":
                    temp.append(16)         # 16 is the next unused scope code
                    hema_index.append(i)
        list_num_main_scope.append(temp)
        count.append(len(temp))
    df_Nan["BUSINESS_SCOPE"] = list_num_main_scope
    df_Nan["count"] = count
    for i in hema_index:        # label the Hema shops directly instead of predicting them
        df_Nan.at[i, "MAIN_BUSINESS"] = 33
    df_hema = df_Nan[df_Nan["MAIN_BUSINESS"] == 33]
    df_Nan = df_Nan.drop(df_Nan.index[hema_index])      # remove the Hema rows from the to-predict set
    return df_Nan.drop(columns="Unnamed: 0"), df_hema.drop(columns="Unnamed: 0")
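
# Illustrative note on make_number_Nan: a row whose BUSINESS_SCOPE contains a
# name missing from the training code book (in this data, only "盒马"/Hema)
# gets scope code 16 and MAIN_BUSINESS = 33 directly, and is routed to df_hema
# instead of being sent to the model.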


def fill_null(df_no_Nan, df_has_Nan, dict_BussinessScope):
    # stopline = [[], [], [], [], [], [], [], [], [], [24], [], [24], [], [24], []]    # classes that appear only once per group; bad for training
    print("Rows to impute, by scope length:\n", df_has_Nan["count"].value_counts())   # lengths run 0-13
    df_predict_result = df_no_Nan               # accumulate the final result here
    for i in range(1, 14):      # scope lengths to predict run up to 13; train one model per length
        col_names = ["col" + str(col_name) for col_name in range(i)]            # one column per scope position
        train_data = df_no_Nan[df_no_Nan["count"] == i][["BUSINESS_SCOPE", "MAIN_BUSINESS"]]     # train on rows of this length
        # Drop classes with fewer than 5 examples; they cannot be learned reliably
        train_data = train_data.groupby("MAIN_BUSINESS").filter(lambda x: (len(x) >= 5))
        print(train_data.value_counts("MAIN_BUSINESS"))
        predict_data = df_has_Nan[df_has_Nan["count"] == i].copy()
        df_train_devided = train_data['BUSINESS_SCOPE'].apply(pd.Series, index=col_names)   # split the scope list into columns
        df_predict_devided = predict_data['BUSINESS_SCOPE'].apply(pd.Series, index=col_names)
        train_data = pd.concat([df_train_devided, train_data.drop(columns="BUSINESS_SCOPE")], axis=1)  # merge the columns back
        # train_data = train_data[~train_data['MAIN_BUSINESS'].isin(stopline[i])]     # drop the single-example classes
        predict_data_test = pd.concat([df_predict_devided, predict_data[["BUSINESS_SCOPE", "MAIN_BUSINESS"]].drop(columns="BUSINESS_SCOPE")], axis=1)       # rows to impute

        # Split off evaluation and test sets (fall back to an unstratified split when a class is too small to stratify)
        try:
            train_set, holdout = train_test_split(train_data, random_state=1129, test_size=0.2, stratify=train_data["MAIN_BUSINESS"])
        except ValueError:
            train_set, holdout = train_test_split(train_data, random_state=1129, test_size=0.2)
        try:
            eval_set, test_set = train_test_split(holdout, random_state=1129, test_size=0.5, stratify=holdout["MAIN_BUSINESS"])
        except ValueError:
            eval_set, test_set = train_test_split(holdout, random_state=1129, test_size=0.5)

        # Write CSVs, since make_experiment takes file paths
        train_set.to_csv("train.csv", encoding="utf-8-sig")
        eval_set.to_csv("eval.csv", encoding="utf-8-sig")
        predict_data_test.to_csv("test.csv", encoding="utf-8-sig")
        exp = make_experiment("train.csv", test_data=None, eval_data="eval.csv", target='MAIN_BUSINESS', reward_metric='accuracy', log_level='info', class_balancing='ClassWeight', cv=True)    # experiment settings
        estimator = exp.run()               # run the model search
        with open('model' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(estimator, f)       # persist the winning pipeline
        print("Done!\n\n")

        # Evaluate on the held-out test split
        test_pred = estimator.predict(test_set.drop(columns="MAIN_BUSINESS"))
        print(
            classification_report(test_set["MAIN_BUSINESS"].tolist(), pd.Series(test_pred, index=test_set.index), digits=5))

        # Impute: predict() returns the original class codes (an argmax over
        # predict_proba would return column positions, not class labels)
        result = estimator.predict(predict_data_test.drop(columns="MAIN_BUSINESS"))
        predict_data["MAIN_BUSINESS"] = result  # store the imputed labels
        predict_data.to_csv("shop_data_filled_" + str(i) + ".csv", encoding="utf-8-sig")
        df_predict_result = pd.concat([df_predict_result, predict_data], axis=0)    # append to the final result

        # Remove the temporary CSVs
        os.remove("test.csv")
        os.remove("train.csv")
        os.remove("eval.csv")
    df_predict_result.to_csv("shop_data_filled.csv", encoding="utf-8-sig")
    return df_predict_result

def main():
    df_file = pd.read_csv("../new feature/店铺数据.csv", encoding="utf-8-sig")
    # Rows with MAIN_BUSINESS present (MAIN_BUSINESS is missing in 57,871 rows,
    # BUSINESS_SCOPE in 5,045, and both in 4,928)
    df_NoNan = df_file.dropna(axis=0, how='any', subset=["MAIN_BUSINESS"])
    # Rows with MAIN_BUSINESS missing
    df_has_Nan = df_file[df_file["MAIN_BUSINESS"].isnull()]
    # Keep only the rows that lack MAIN_BUSINESS but still have a BUSINESS_SCOPE to predict from
    df_has_Nan = df_has_Nan.dropna(axis=0, how='any', subset=["BUSINESS_SCOPE"])

    # Vectorize (i.e., integer-encode both fields)
    df_NoNan_numbered, dict_MainBussiness, dict_BussinessScope = make_number(df_NoNan)
    df_has_Nan_numbered, df_hema = make_number_Nan(df_has_Nan, dict_BussinessScope)
    df_NoNan_numbered = pd.concat([df_NoNan_numbered, df_hema])     # merge the Hema rows back in
    dict_MainBussiness["盒马"] = 33     # register Hema (盒马) in both code books
    dict_BussinessScope["盒马"] = 16
    print("MAIN_BUSINESS code book:\n", dict_MainBussiness)
    print("BUSINESS_SCOPE code book:\n", dict_BussinessScope)
    dict_numbers_to_MainBussiness = {value: key for key, value in dict_MainBussiness.items()}   # reverse maps for decoding later
    dict_numbers_to_BussinessScope = {value: key for key, value in dict_BussinessScope.items()}
    print("Vectorization finished!")

    # Impute the missing values
    df_fill_null = fill_null(df_NoNan_numbered, df_has_Nan_numbered, dict_BussinessScope)
    print("Missing-value imputation finished!")


if __name__ == "__main__":
    main()

  For the last few length groups the feature dimensionality is high and the sample count is small, so the models' accuracy there is poor; because those groups contain only a handful of rows, we fill their missing values by manual labeling instead.
  [Figure: model tuning results]
Related articles
  Data overview and preprocessing: https://blog.csdn.net/Hjh1906008151/article/details/124313507
  Derived-variable computation (the missing values are imputed precisely to enable this step): https://blog.csdn.net/Hjh1906008151/article/details/124330708
  Basic outlier-detection methods: https://blog.csdn.net/Hjh1906008151/article/details/124342492
  Outlier detection with pyod: https://editor.csdn.net/md/?articleId=124340047
  Ideas for improving poor outlier-detection results: https://blog.csdn.net/Hjh1906008151/article/details/124341064
