我们以2022年全国服务外包大赛的A03题目作为示例代码演示缺失值填补过程。
问题的主要任务是找出商品的销量异常和价格异常,提供4个月的商品信息数据,共1700万余条,4个月的店铺信息数据,共60万余条,强调时间复杂度、空间复杂度、异常值识别率和准确率。我们用店铺分析辅助商品的异常,以提高可信度和准确率。但是店铺主要业务中存在较多缺失,对之后衍生变量计算有较大影响。
店铺部分数据链接:https://pan.baidu.com/s/1iAp-s2JwG_YTB35BevMNyQ 提取码:jhnb
个人认为,缺失值填补本质上是一个预测问题,因此,在随机森林算法效果不佳的情况下,我们采取一个AutoML工具hyperGBM(中文使用说明:https://hypergbm.readthedocs.io/zh_CN/latest/example.html)在10个模型中选取效果最好的模型预测缺失值。事实上这是一个分类问题,我们借助店铺的业务范围预测店铺的主要业务。因为店铺的业务范围长短不同,所以我们根据店铺的范围的长度将数据集分为13个部分,分别对这13个部分进行模型训练填补缺失值。在整体流程中这属于第一步缺失值填补:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import get_scorer, classification_report
from hypergbm import make_experiment
import os
import pickle
def normalize(file, group):
    """Min-max scale the columns of *file* selected by positional indices *group*.

    Parameters
    ----------
    file : pandas.DataFrame
        Input data.  Left unmodified: a scaled copy is returned.
    group : sequence of int
        Positional column indices of the columns to scale into [0, 1].

    Returns
    -------
    pandas.DataFrame
        Copy of *file* with the selected columns min-max scaled.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    # BUG FIX: the original did ``ret = file`` and therefore silently mutated
    # the caller's DataFrame while pretending to return a new one.
    ret = file.copy()
    cols = file.columns[group]
    ret[cols] = scaler.fit_transform(file[cols])
    return ret
def make_number(df_file):
    """Encode MAIN_BUSINESS and BUSINESS_SCOPE of *df_file* as integers, in place.

    MAIN_BUSINESS values are numbered by descending frequency (the order
    produced by ``value_counts``).  BUSINESS_SCOPE is a comma-separated string
    per row; each distinct scope name gets an integer id and every row becomes
    a list of ids.  A "count" column records the number of scopes per row
    (used later to split the data set by scope-list length for training).

    Parameters
    ----------
    df_file : pandas.DataFrame
        Must contain "MAIN_BUSINESS", "BUSINESS_SCOPE" and the stray
        "Unnamed: 0" index column written by a previous ``to_csv``.

    Returns
    -------
    tuple
        (encoded frame with "Unnamed: 0" dropped,
         {main-business name -> id},
         {scope name -> id})
    """
    # Number the main-business labels by descending frequency.
    main_labels = {name: idx for idx, name in
                   enumerate(df_file.value_counts("MAIN_BUSINESS").to_dict())}
    df_file["MAIN_BUSINESS"] = [main_labels[name] for name in df_file["MAIN_BUSINESS"]]

    scopes_per_row = []  # split scope names for every row
    scope_names = set()  # all distinct scope names, numbered below
    for raw in df_file["BUSINESS_SCOPE"]:
        # Some cells are not strings (unquoted values in the CSV), so coerce
        # with str() instead of catching AttributeError per row.
        parts = [p for p in str(raw).split(",") if p != ""]  # drop empty entries
        scopes_per_row.append(parts)
        scope_names.update(parts)

    # Assign ids 0..n-1 following the set's iteration order (same mapping the
    # original zip-with-a-set-of-ints construction produced on CPython).
    scope_labels = {name: idx for idx, name in enumerate(scope_names)}

    numbered_rows = []  # per-row list of scope ids
    count = []          # scope count per row; used later when filling NaNs
    for parts in scopes_per_row:
        ids = [scope_labels[p] for p in parts]
        numbered_rows.append(ids)
        count.append(len(ids))
    df_file["BUSINESS_SCOPE"] = numbered_rows
    df_file["count"] = count
    # "Unnamed: 0" is the index column pandas wrote when the CSV was saved.
    return df_file.drop(axis=1, columns="Unnamed: 0"), main_labels, scope_labels
def make_number_Nan(df_Nan, dict_Scope):
    """Vectorise BUSINESS_SCOPE for the rows whose MAIN_BUSINESS is missing.

    Scope names are mapped through *dict_Scope*.  Any name absent from the
    mapping (in this data set: "盒马"/Hema) gets the new scope id 16 and the
    whole row is labelled MAIN_BUSINESS = 33 directly, bypassing prediction.

    Parameters
    ----------
    df_Nan : pandas.DataFrame
        Rows with missing MAIN_BUSINESS; modified in place.
    dict_Scope : dict
        Scope-name -> id mapping produced by ``make_number``.

    Returns
    -------
    tuple
        (frame still needing prediction, frame labelled as Hema), both with
        the stray "Unnamed: 0" CSV index column dropped.
    """
    # Re-index 0..n-1 so row positions double as index labels for .at/.drop.
    df_Nan.index = range(df_Nan.shape[0])

    split_rows = []
    for raw in df_Nan["BUSINESS_SCOPE"]:
        # Some cells are not strings (unquoted values in the CSV), so coerce.
        split_rows.append(str(raw).split(","))

    numbered_rows = []  # per-row list of scope ids
    count = []          # scope count per row
    hema_rows = []      # row positions containing an unknown (Hema) scope
    for pos, parts in enumerate(split_rows):
        ids = []
        for name in parts:
            if name == "":
                continue  # skip empty entries from trailing/double commas
            if name in dict_Scope:
                ids.append(dict_Scope[name])
            else:
                # Unknown scope -> Hema; 16 is the next free scope id.
                ids.append(16)
                hema_rows.append(pos)
        numbered_rows.append(ids)
        count.append(len(ids))

    df_Nan["BUSINESS_SCOPE"] = numbered_rows
    df_Nan["count"] = count
    for pos in hema_rows:
        df_Nan.at[pos, "MAIN_BUSINESS"] = 33  # label Hema rows directly
    df_hema = df_Nan[df_Nan["MAIN_BUSINESS"] == 33]
    # Remove the directly-labelled rows from the to-predict set.
    df_Nan = df_Nan.drop(sorted(set(hema_rows)))
    return df_Nan.drop(axis=1, columns="Unnamed: 0"), df_hema.drop(axis=1, columns="Unnamed: 0")
def fill_null(df_no_Nan, df_has_Nan, dict_BussinessScope):
    """Fill missing MAIN_BUSINESS values by predicting them from BUSINESS_SCOPE.

    Rows are grouped by scope-list length ("count", 1..13) and one HyperGBM
    AutoML model is trained per group on the fully-labelled frame *df_no_Nan*,
    then used to predict MAIN_BUSINESS for the matching rows of *df_has_Nan*.
    Each trained model is pickled to ``model<i>.pkl`` and each filled group is
    written to ``填补后店铺数据<i>.csv``; the combined result is written to
    ``填补后店铺数据.csv`` and returned.

    Side effects: creates and removes temporary ``train.csv`` / ``eval.csv`` /
    ``test.csv`` in the working directory.

    Parameters
    ----------
    df_no_Nan : pandas.DataFrame
        Training rows with MAIN_BUSINESS present (already vectorised).
    df_has_Nan : pandas.DataFrame
        Rows whose MAIN_BUSINESS must be predicted.
    dict_BussinessScope : dict
        Scope-name -> id mapping (kept for interface compatibility; unused).

    Returns
    -------
    pandas.DataFrame
        *df_no_Nan* plus the newly-filled rows.
    """
    print("预测数据长度:\n", df_has_Nan["count"].value_counts())  # lengths 0-13
    df_predict_result = df_no_Nan  # accumulate the final result here
    for i in range(1, 14):  # one model per scope-list length (max length 13)
        # Split the per-row scope-id list into one column per position.
        col_names = ["col" + str(col_name) for col_name in range(i)]
        train_data = df_no_Nan[df_no_Nan["count"] == i][["BUSINESS_SCOPE", "MAIN_BUSINESS"]]
        # Drop classes with fewer than 5 samples - too rare to train on.
        train_data = train_data.groupby("MAIN_BUSINESS").filter(lambda x: (len(x) >= 5))
        print(train_data.value_counts("MAIN_BUSINESS"))
        predict_data = df_has_Nan[df_has_Nan["count"] == i]
        df_train_devided = train_data['BUSINESS_SCOPE'].apply(pd.Series, index=col_names)
        df_predict_devided = predict_data['BUSINESS_SCOPE'].apply(pd.Series, index=col_names)
        train_data = pd.concat([df_train_devided, train_data.drop(columns="BUSINESS_SCOPE", axis=1)], axis=1)
        predict_data_test = pd.concat([df_predict_devided, predict_data[["BUSINESS_SCOPE", "MAIN_BUSINESS"]].drop(columns="BUSINESS_SCOPE", axis=1)], axis=1)
        # 80/10/10 split: train / eval (fed to HyperGBM) / hold-out for the
        # classification report.  Stratify when every class is populous
        # enough; train_test_split raises ValueError otherwise.
        try:
            x_train, y_train = train_test_split(train_data, random_state=1129, test_size=0.2, stratify=train_data["MAIN_BUSINESS"])
        except ValueError:
            x_train, y_train = train_test_split(train_data, random_state=1129, test_size=0.2)
        try:
            y_train, z_train = train_test_split(y_train, random_state=1129, test_size=0.5, stratify=y_train["MAIN_BUSINESS"])
        except ValueError:
            y_train, z_train = train_test_split(y_train, random_state=1129, test_size=0.5)
        # make_experiment consumes file paths, so round-trip through CSV.
        x_train.to_csv("train.csv", encoding="utf-8-sig")
        y_train.to_csv("eval.csv", encoding="utf-8-sig")
        predict_data_test.to_csv("test.csv", encoding="utf-8-sig")
        exp = make_experiment("train.csv", test_data=None, eval_data="eval.csv", target='MAIN_BUSINESS', reward_metric='accuracy', log_level='info', class_balancing='ClassWeight', cv=True)
        estimator = exp.run()  # run the AutoML search
        # BUG FIX: the original filename was 'model<i> .pkl' (stray space).
        with open('model' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(estimator, f)
        print("完事!!\n\n")
        # Report hold-out performance for this length group.
        z_pred = estimator.predict(z_train.drop(axis=1, columns="MAIN_BUSINESS"))
        print(
            classification_report(z_train["MAIN_BUSINESS"].tolist(), pd.Series(z_pred, index=z_train.index), digits=5))
        # Predict the missing labels.
        # NOTE(review): argmax yields positional class indices; this assumes
        # the estimator's class order matches the label ids - confirm against
        # HyperGBM's classes_ attribute.
        pred_proba = estimator.predict_proba(predict_data_test)
        result = np.argmax(pred_proba, axis=1)
        predict_data["MAIN_BUSINESS"] = result  # store the filled labels
        predict_data.to_csv("填补后店铺数据" + str(i) + ".csv", encoding="utf-8-sig")
        df_predict_result = pd.concat([df_predict_result, predict_data], axis=0)
        # Clean up the temporary CSVs.
        os.remove("test.csv")
        os.remove("train.csv")
        os.remove("eval.csv")
    df_predict_result.to_csv("填补后店铺数据.csv", encoding="utf-8-sig")
    return df_predict_result
def main():
    """Load the shop data, encode its categorical columns, and fill the
    missing MAIN_BUSINESS values via per-length HyperGBM models."""
    df_file = pd.read_csv("../new feature/店铺数据.csv", encoding="utf-8-sig")
    # Rows with MAIN_BUSINESS present (57871 rows miss it; 5045 miss
    # BUSINESS_SCOPE; 4928 miss both).
    df_NoNan = df_file.dropna(axis=0, how='any', subset=["MAIN_BUSINESS"])
    # Rows missing MAIN_BUSINESS - these need prediction.
    df_has_Nan = df_file[df_file[["MAIN_BUSINESS"]].isnull().T.any()]
    # (A no-op expression that selected rows missing MAIN_BUSINESS but not
    # BUSINESS_SCOPE was removed here - its result was never used.)
    # Vectorise (assign integer ids to both categorical columns).
    df_NoNan_numbered, dict_MainBussiness, dict_BussinessScope = make_number(df_NoNan)
    df_has_Nan_numbered, df_hema = make_number_Nan(df_has_Nan, dict_BussinessScope)
    df_NoNan_numbered = pd.concat([df_NoNan_numbered, df_hema])  # merge directly-labelled Hema rows
    # Register the hand-assigned Hema ids (see make_number_Nan).
    dict_MainBussiness["盒马"] = 33
    dict_BussinessScope["盒马"] = 16
    print("主要业务编号词典:\n", dict_MainBussiness)
    print("业务范围词典:\n", dict_BussinessScope)
    # Reverse mappings (id -> name) for decoding predictions later.
    dict_numbers_to_MainBussiness = {value: key for key, value in dict_MainBussiness.items()}
    dict_numbers_to_BussinessScope = {value: key for key, value in dict_BussinessScope.items()}
    print("向量化完毕!!!")
    # Fill the missing values.
    df_fill_null = fill_null(df_NoNan_numbered, df_has_Nan_numbered, dict_BussinessScope)
    print("缺失值填补完毕!!!")
后面几组数据由于维度较高且数据量较少,预测的准确率较低,因此我们采取人工手动标注的方法填补缺失值。
模型调参结果图:
相关文章
数据概览与预处理https://blog.csdn.net/Hjh1906008151/article/details/124313507
衍生变量计算(缺失值填补就是为了计算衍生变量)https://blog.csdn.net/Hjh1906008151/article/details/124330708
异常值识别基础方法https://blog.csdn.net/Hjh1906008151/article/details/124342492
基于pyod的异常值识别方法https://editor.csdn.net/md/?articleId=124340047
异常值识别效果不佳的解决思路https://blog.csdn.net/Hjh1906008151/article/details/124341064