import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.externals import joblib # 将模型导出所需包
def get_cust_age_stage(birth_year):
    """Map each birth year to its generation (age-stage) label.

    Args:
        birth_year: Iterable of birth years. Each element is converted
            with ``int()``, so ints, numeric strings and numpy integers
            are all accepted. Values are visited in iteration order, so
            a pandas Series is handled correctly regardless of its index.

    Returns:
        A list of labels, one per input element and in the same order:
        "未知" for a year of 0, "60前" for years before 1960, then
        "60后", "70后", "80后", "90后" for each following decade, and
        "00后" for 2000 and later.

    Raises:
        ValueError / TypeError: propagated from ``int()`` when an element
            cannot be converted to an integer.
    """
    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound that exceeds the year decides the label.
    stage_bounds = (
        (1960, "60前"),
        (1970, "60后"),
        (1980, "70后"),
        (1990, "80后"),
        (2000, "90后"),
    )

    age_stage = []
    # Iterate over the values themselves instead of indexing by position:
    # ``series[i]`` is a label lookup in pandas and breaks on any Series
    # whose index is not 0..n-1.
    for raw_year in birth_year:
        year = int(raw_year)  # convert once per element
        if year == 0:
            # 0 is the placeholder for a missing birth year.
            age_stage.append("未知")
            continue
        for upper_bound, label in stage_bounds:
            if year < upper_bound:
                age_stage.append(label)
                break
        else:
            # No bound matched, so the year is 2000 or later.
            age_stage.append("00后")
    return age_stage
def get_top5_onehot(data):
"""对c字段排名top5的进行one hot"""
# 获取top5的值
c_top5_counts = data['c'].value_counts()[:5]
c_top5_names = list(c_top5_counts.keys())
# 进行one-hot编码,只保留top5的列
c_one_hot = pd.get_dummies(data['c'])
c_top5 = c_one_hot[