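# - Dependencies: numpy, pandas, sklearn, imblearn
# - Program input: abnormal_orders.txt (training data), new_abnormal_orders.csv (new data to predict)
# - Program output: predictions are printed directly
## Program
# Import libraries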
import numpy as np
import pandas as pd # pandas库
from imblearn.over_sampling import SMOTE # 过抽样处理库SMOTE
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, \
RandomForestClassifier # 四种集成分类库和投票方法库
from sklearn.model_selection import StratifiedKFold, cross_val_score # 导入交叉检验算法
from sklearn.preprocessing import OrdinalEncoder # 字符串转数值
# 函数模块
# 日期和时间拓展
def datetime_exp(data):
    """
    Expand the order date and time columns into numeric calendar features.

    ``order_date`` values (``%Y-%m-%d`` strings) yield the weekday, the day of
    the month and the month; ``order_time`` values (``%H:%M:%S`` strings) yield
    the second, minute and hour. The frame passed in is not modified.

    :param data: DataFrame containing ``order_date`` and ``order_time`` columns
    :return: copy of ``data`` with the six derived columns appended and the
        two source columns removed
    :raises KeyError: if ``order_date`` or ``order_time`` is missing
    :raises ValueError: if a value does not match the expected format
    :raises TypeError: if a value is not a string (e.g. NaN)
    """
    # pd.datetime was only an alias of datetime.datetime and has been removed
    # from pandas, so the standard-library class is used directly.
    from datetime import datetime

    date_set = [datetime.strptime(date_text, '%Y-%m-%d')
                for date_text in data['order_date']]  # parsed order dates
    time_set = [datetime.strptime(time_text, '%H:%M:%S')
                for time_text in data['order_time']]  # parsed order times

    # Work on a copy so the caller's frame does not silently gain columns.
    expanded = data.copy()
    # The insertion order below defines the feature order of the result.
    expanded['weekday_data'] = [parsed.weekday() for parsed in date_set]  # day of week (Monday=0)
    expanded['daysinmonth_data'] = [parsed.day for parsed in date_set]  # day of month
    expanded['month_data'] = [parsed.month for parsed in date_set]  # month
    expanded['second_data'] = [parsed.second for parsed in time_set]  # second
    expanded['minute_data'] = [parsed.minute for parsed in time_set]  # minute
    expanded['hour_data'] = [parsed.hour for parsed in time_set]  # hour
    return expanded.drop(['order_date', 'order_time'], axis=1)
# Load the raw order records from the comma-separated text file
raw_data = pd.read_csv('abnormal_orders.txt', sep=',')

# Data review
# Basic overview: trailing records, column dtypes and descriptive statistics
print('{:*^60}'.format('Data overview:'))
print(raw_data.tail(2))  # last 2 records of the raw data
print('{:*^60}'.format('Data dtypes:'))
print(raw_data.dtypes)  # dtype of every column
print('{:*^60}'.format('Data DESC:'))
print(raw_data.describe().round(2).T)  # descriptive statistics, one row per column

# Missing-value review
na_cols = raw_data.isna().any(axis=0)  # per column: does it contain any missing value?
print('{:*^60}'.format('NA Cols:'))
print(na_cols[na_cols])  # keep only the columns that do contain missing values
print('Total number of NA lines is: {0}'.format(
    raw_data.isna().any(axis=1).sum()))  # number of rows with at least one missing value

# Class-balance review: frequency of each value in the last column
print('{:*^60}'.format('Labesl samples count:'))
print(raw_data.iloc[:, -1].value_counts())
# Data preprocessing
# Drop every record that contains NaN, then discard the order ID column
drop_na_set = raw_data.dropna().drop(columns=['order_id'])

# Convert string-valued columns to numeric codes
convert_cols = [
    'cat', 'attribution', 'pro_id', 'pro_brand',
    'order_source', 'pay_type', 'user_id', 'city',
]  # columns to encode
# NOTE(review): this fitted encoder is reused on the new data set further down;
# with default settings OrdinalEncoder raises on categories it did not see
# during fit — confirm the new data only contains known values.
enc = OrdinalEncoder().fit(drop_na_set[convert_cols])
drop_na_set[convert_cols] = enc.transform(drop_na_set[convert_cols])

# Expand the date and time columns into numeric features
data_final = datetime_exp(drop_na_set)

# Split features/label, then cut by position: first 70% of rows for training,
# the remainder for testing (no shuffling)
num = int(0.7 * len(data_final))
y_raw = data_final['abnormal_label']
X_raw = data_final.drop(columns=['abnormal_label'])
X_train, y_train = X_raw.iloc[:num], y_raw.iloc[:num]
X_test, y_test = X_raw.iloc[num:], y_raw.iloc[num:]
# Class balancing
# Over-sample the training split with SMOTE. The seed makes the synthetic
# samples reproducible, matching the seeded estimators below.
model_smote = SMOTE(random_state=0)
# fit_resample is the current API; the old fit_sample alias has been removed
# from imbalanced-learn.
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(X_train, y_train)

# Model training
# Cross-validation
model_rf = RandomForestClassifier(max_features=0.8, random_state=0)  # random forest classifier
model_gdbc = GradientBoostingClassifier(max_features=0.8, random_state=0)  # gradient boosting classifier
estimators = [('randomforest', model_rf), ('gradientboosting', model_gdbc)]  # members of the ensemble
model_vot = VotingClassifier(estimators=estimators, voting='soft', weights=[0.9, 1.2],
                             n_jobs=-1)  # soft-voting ensemble over both classifiers
# random_state only takes effect when shuffle=True; scikit-learn raises a
# ValueError if a seed is given without shuffling.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
# NOTE(review): the scores below are for the gradient boosting model alone,
# while the voting ensemble is what gets fitted afterwards — confirm intended.
cv_score = cross_val_score(model_gdbc, x_smote_resampled, y_smote_resampled, cv=cv)
print('{:*^60}'.format('Cross val scores:'), '\n', cv_score)  # score of each fold
print('Mean scores is: %.2f' % cv_score.mean())  # mean score over all folds

# Fit the voting ensemble on the balanced training data
model_vot.fit(x_smote_resampled, y_smote_resampled)
# Predict on the new data set
# Load the new records and discard the order ID column
X_new = pd.read_csv('new_abnormal_orders.csv')
X_new_drop = X_new.drop(columns=['order_id'])

# Encode the string columns with the encoder fitted on the training data
X_new_drop[convert_cols] = enc.transform(X_new_drop[convert_cols])

# Expand the date and time columns into numeric features
X_new_final = datetime_exp(X_new_drop)

# Predicted label and per-class probabilities for every new record
predict_label = model_vot.predict(X_new_final)
predict_proba = model_vot.predict_proba(X_new_final)

# Put the label column in front of the probability columns and print the table
predict_np = np.hstack((predict_label[:, np.newaxis], predict_proba))
predict_pd = pd.DataFrame(predict_np, columns=['lables', 'proba_0', 'proba_1'])
print('{:*^60}'.format('Predicted Labesls:'), '\n', predict_pd)
# Note: data file link: https://pan.baidu.com/s/1uxY-UF7wemzk9hILwEBMlA  password: 75rt