任务1:报名比赛
报名比赛
读取训练数据
import numpy as np
import pandas as pd
# Load the competition training set from disk and print its summary.
train_csv_path = "./data/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.info()
任务2:比赛数据分析
步骤1:使用pandas完成如下数据分析
训练集和测试集的行数分别是多少?
训练集中每列的类型是什么?
# Print the training frame's summary to inspect each column's type.
train_data.info()
训练集中标签是如何分布,与哪一个特征最相关?
对数据集中的类别字段进行编码,转换成数字类型,并使用 seaborn(sns)显示各字段之间的相关性
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Label-encode the categorical columns on a copy of the training frame, then
# draw the pairwise correlation matrix as an annotated heatmap.
# NOTE(review): `sns` (seaborn) is not imported anywhere in the visible
# source - confirm it is imported in an earlier cell.
col = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
# The frame loaded above is named `train_data`; `data_train` is never defined.
ded = train_data.copy()
for name in col:
    # Cast to str so a missing value becomes one ordinary category instead of
    # a NaN mixed with str/bool values (which older scikit-learn releases
    # cannot sort - assumption, verify against the installed version).
    ded[name] = LabelEncoder().fit_transform(ded[name].astype(str))
plt.figure(figsize=(10, 10))
# Correlate numeric/bool columns only: the remaining text columns cannot be
# converted to float and would make DataFrame.corr() fail on recent pandas.
sns.heatmap(ded.select_dtypes(include=['number', 'bool']).corr(), annot=True)
可知 CryoSleep 与 Transported 的相关性最大。
训练集中列缺失值如何分布的?
# Count missing values per column of the training set.
# The original iterated over `data_test` (never defined in the visible source)
# while dividing by the training-set length; both the counts and the
# denominator now come from `train_data`, which is what the table describes.
null_counts = train_data.isnull().sum()
null_train = pd.DataFrame({
    "训练集": null_counts.index,
    "缺失个数": null_counts.to_numpy(),
    "缺失率": null_counts.to_numpy() / len(train_data),
})
null_train
步骤2:使用seaborn或matplotlib完成如下可视化
HomePlanet 与 Transported 的分布关系
CryoSleep 与 Transported 的分布关系
Cabin 与 Transported 的分布关系
Destination 与 Transported 的分布关系
Age 与 Transported 的分布关系
VIP 与 Transported 的分布关系
Name 与 Transported 的分布关系
步骤3:根据上述分析结果,你找出什么规律,如什么类型的乘客更加容易被Transported?
CryoSleep 与 Transported 的相关性最大。
任务3:验证集划分与树模型
步骤1:学习sklearn中的数据划分方法
from sklearn.model_selection import train_test_split
# Separate the label from the features, then hold out 30% of the rows
# (fixed seed so the split is repeatable).
X = ded.drop(columns=['Transported'])
y = ded['Transported']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.3,
)
步骤2:导入sklearn中的树模型
树模型有随机森林、决策树,高级树模型有 XGBoost、LightGBM、CatBoost。
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
步骤3:训练集和测试集进行缺失值填充(数值列填充列均值,类别列填充众数)
# Impute missing values: numeric columns get the column mean, categorical
# columns get the column mode (most frequent value).
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Name', 'Cabin']
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is not guaranteed to write
# through to the parent frame on recent pandas. The original also repeated
# each fill twice; the second call was a no-op.
for col in num_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
for col in cat_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])

# NOTE(review): the step title also asks for the test set to be imputed, but
# no test frame is loaded in the visible source. Once it is, fill it with the
# statistics computed on the training set.
任务4:特征工程入门
步骤1:学习特征工程基础
步骤2:对类别字段分别进行onehot和labelencoder
在机器学习中,通常需要对类别变量单独做处理,这是因为模型的输入项基本都需要是数值型变量,而因为类别变量本身不带数值属性,所以需要进行一层转换。常用的方法一般有两种:label encoding和one hot encoding。
label encoding
# Label encoding: replace every category with an integer code.
from sklearn.preprocessing import LabelEncoder

col_1 = ['HomePlanet', 'Destination', 'Name', 'Cabin']
# Boolean-valued columns. Defined here because the loop below reads `col_2`,
# which the original only assigned further down the file (NameError on a
# top-to-bottom run).
col_2 = ['CryoSleep', 'VIP', 'Transported']

# Snapshot taken before the encoding mutates train_data, so the raw category
# values stay available under `train`.
train = train_data.copy()
for col in col_1:
    train_data[col] = LabelEncoder().fit_transform(train_data[col])
for col in col_2:
    # True/False -> 1/0
    train_data[col] = train_data[col].astype(int)
one hot encoding
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the text-valued categorical columns of the `train` snapshot,
# then cast the boolean columns to 0/1.
train_temp = train.copy()
train_temp.drop("PassengerId", axis=1, inplace=True)
onehot_cols = ['HomePlanet', 'Destination', 'Name', 'Cabin']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_temp[onehot_cols])
after_onehot_features = enc.get_feature_names_out(onehot_cols)
# Carry train_temp's index over so join() aligns row-for-row even when the
# frame does not have a default 0..n-1 index.
data_onehot = pd.DataFrame(
    enc.transform(train_temp[onehot_cols]).toarray(),
    columns=after_onehot_features,
    index=train_temp.index,
)
train_temp.drop(onehot_cols, axis=1, inplace=True)
train_temp = train_temp.join(data_onehot)
# Boolean-valued columns (kept under the name `col_2`, as in the original).
col_2 = ['CryoSleep', 'VIP', 'Transported']
for col in col_2:
    train_temp[col] = train_temp[col].astype(int)
train_temp.head()
步骤3:使用分类树模型和Kfold验证onehot和labelencoder在验证集的精度。
使用随机森林模型和5折进行验证。
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
def train(X, y):
    """Print the 5-fold cross-validated accuracy of a random forest.

    Args:
        X: Feature table; rows are selected positionally via ``X.iloc``.
        y: Labels aligned row-for-row with ``X`` (any array-like).

    Returns:
        None. The per-fold accuracies and their mean are printed.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=1234)
    # KFold yields positional indices. Index a plain array so labels are
    # picked by position even when ``y`` is a Series with a non-default index.
    y_values = np.asarray(y)
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is rejected
    # by recent scikit-learn releases.
    rfc = RandomForestClassifier(max_features='sqrt', oob_score=True, random_state=1, n_jobs=-1)
    scorelist = []
    for train_index, test_index in kfold.split(X, y_values):
        rfc.fit(X.iloc[train_index], y_values[train_index])
        scorelist.append(rfc.score(X.iloc[test_index], y_values[test_index]))
    print(scorelist)
    print("mean {}".format(np.mean(scorelist)))
labelencoder结果:
[0.8033352501437608, 0.7883841288096607, 0.7952846463484762, 0.7658227848101266, 0.7813578826237054]
onehot结果:
[0.7866589994249569, 0.7849338700402531, 0.8039102932719954, 0.7796317606444189, 0.7871116225546605]
(ps:感觉类别处理的不是很好,差距有点大)
任务5:特征工程进阶
步骤1:对所有类别字段进行target encoding
Target Encoding 用于类别特征。它与 One-hot、Label Encoding 一样,是一种将类别编码为数字的方法;不同之处在于 Target Encoding 还会利用目标(标签)来生成编码,因此属于有监督的特征工程方法。广义上,任何从目标中派生出数值来替换特征类别的编码方式都可称为 Target Encoding;这种目标编码有时也被称为平均编码,应用于二分类目标时又被称为 bin counting。(可能会遇到的其他名称包括:likelihood encoding、impact encoding 和 leave-one-out encoding。)
# Hold out 20% of the rows, fit the target encoder on the training part only,
# then apply it to both parts.
# NOTE(review): `TargetEncoder` is not imported in the visible source; the
# cols/handle_unknown/handle_missing arguments look like the
# category_encoders API - confirm the import exists in an earlier cell.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
encoder = TargetEncoder(
    cols=cat_cols,
    handle_unknown='value',
    handle_missing='value',
)
encoder = encoder.fit(X_train, y_train)
encoder_train, encoder_valid = (encoder.transform(part) for part in (X_train, X_valid))
步骤2:使用树模型的feature importance筛选top10特征。
import matplotlib.pyplot as plt
# Rank the encoded features by the model's importance scores, highest first.
# NOTE(review): in the visible source `rfc` is only assigned inside train();
# a fitted model must be bound to `rfc` at this scope for the cell to run.
importance_table = {
    'Attribute': encoder_train.columns,
    'Importance': rfc.feature_importances_,
}
importances = pd.DataFrame(data=importance_table).sort_values(by='Importance', ascending=False)

# Bar chart of the ranked importances, feature names written vertically.
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances', size=20)
plt.xticks(rotation='vertical')
plt.show()
步骤3:使用筛选后的特征重新进行训练和验证,对比模型精度。
根据步骤二选取重要的10个特征,重新进行训练和验证。
训练前:score 0.759057
训练后:score 0.764232
任务6:高阶树模型
步骤1:安装LightGBM,并学习基础的使用方法;
安装LightGBM
import lightgbm as lgb
LightGBM基本使用
# Baseline LightGBM parameters for the binary classification task.
params = {
    'num_leaves': 60,  # strong effect on the result; larger fits more, too large overfits
    'min_data_in_leaf': 30,
    'objective': 'binary',  # objective function
    'max_depth': -1,
    'learning_rate': 0.03,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.9,  # fraction of features sampled
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,  # L1 regularisation
    # 'lambda_l2': 0.001,  # L2 regularisation
    "verbosity": -1,
    "nthread": -1,  # thread count; -1 means use all threads
    # A list instead of a set: set iteration order is not stable between
    # interpreter runs, so the order in which metrics are handed to LightGBM
    # (and reported) would vary.
    'metric': ['binary_logloss', 'auc'],  # evaluation metrics
    "random_state": 2019,  # RNG seed, keeps results repeatable across runs
    # 'device': 'gpu'  # enable when a GPU build of LightGBM is installed
}
# 5-fold LightGBM training: collects out-of-fold probabilities for the
# training rows and averages the per-fold probabilities for the test rows.
# NOTE(review): `train_x`, `train_y`, `test`, `features` and `num_round` are
# not defined in the visible source - confirm an earlier cell sets them.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` are accepted by
# lgb.train only in LightGBM releases before 4.0; newer releases expect the
# equivalent callbacks instead.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_x.shape[0], ))
test_pred_prob = np.zeros((test.shape[0], ))
# KFold yields positional indices; take labels from a plain array so they are
# selected by position, exactly like the train_x.iloc[...] rows.
train_y_values = np.asarray(train_y)

## train and predict
feature_importance_df = pd.DataFrame()
# Split the feature matrix itself. The original split `train`, which by this
# point in the file has been rebound to the train() function.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y_values[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y_values[val_idx])

    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)

    # Keep this fold's feature importances, tagged with the fold number.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Each fold contributes 1/n_splits of the final test probability.
    test_pred_prob += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

threshold = 0.5
# 0/1 label for every test row. The original loop overwrote `result` on each
# iteration, so only the last row's label survived.
result = (test_pred_prob > threshold).astype(int)
步骤2:将训练集20%划分为验证集,使用LightGBM完成训练,精度是否有提高?
将训练集20%划分为验证集
# Separate the 0/1 label from the features, then hold out 20% of the rows for
# validation with a fixed seed.
X = train_data.drop(labels=['Transported'], axis=1)
y = train_data["Transported"].astype(int)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=22,
    test_size=0.2,
)
使用LightGBM训练
# Train LightGBM on target-encoded features with a held-out validation set.
#
# Re-fit the existing target encoder on the split made in this step. The
# encoder_train/encoder_valid frames computed earlier came from a different
# (unseeded) random split, so their rows do not line up with this step's
# y_train/y_valid.
encoder_train = encoder.fit(X_train, y_train).transform(X_train)
encoder_valid = encoder.transform(X_valid)

lgb_train = lgb.Dataset(encoder_train, y_train, silent=True)
lgb_eval = lgb.Dataset(encoder_valid, y_valid, reference=lgb_train, silent=True)
# categorical_feature is deliberately not passed: after target encoding the
# cat_cols columns hold floats, and LightGBM rounds floating-point values in
# categorical features towards 0, which would discard the encoding.
gbm = lgb.train(params, lgb_train, num_boost_round=400, valid_sets=[lgb_train, lgb_eval], verbose_eval=100,
                early_stopping_rounds=200)
# Predict on the test set.
# NOTE(review): `encoder_test` is not defined in the visible source - it
# should be encoder.transform(<test features>) from the encoder fitted above.
test_pre = gbm.predict(encoder_test, num_iteration=gbm.best_iteration)
训练结果
步骤3:将步骤2预测的结果文件提交到比赛,截图分数
步骤4:尝试调节搜索LightGBM的参数;
根据网上教程进行LightGBM的参数搜索
import itertools

# Greedy, stage-wise grid search over LightGBM parameters. Every candidate is
# scored by the best mean AUC of a 2-fold lgb.cv run; each stage keeps the
# winners of the previous stages fixed.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1
}


def _best_cv_auc(trial_params):
    """Return the highest mean AUC over boosting rounds of a 2-fold lgb.cv run."""
    cv_results = lgb.cv(
        trial_params,
        lgb_train,
        seed=1,
        nfold=2,
        metrics=['auc'],
        early_stopping_rounds=10,
        verbose_eval=True
    )
    return max(cv_results['auc-mean'])


def _search_stage(base_params, grid, best_auc):
    """Score every combination in ``grid`` on top of ``base_params``.

    Args:
        base_params: Parameters held fixed during this stage (not modified).
        grid: Mapping of parameter name -> candidate values. Combinations are
            visited with the first name varying slowest, like nested loops.
        best_auc: Best AUC seen so far; a combination is accepted when its
            AUC is greater than or equal to it.

    Returns:
        ``(best_auc, winner)`` where ``winner`` holds the last accepted
        combination, or is empty when nothing reached ``best_auc``.
    """
    names = list(grid)
    winner = {}
    for values in itertools.product(*(grid[name] for name in names)):
        candidate = dict(zip(names, values))
        mean_auc = _best_cv_auc({**base_params, **candidate})
        if mean_auc >= best_auc:
            best_auc = mean_auc
            winner = candidate
    return best_auc, winner


### Cross-validation (parameter tuning)
print('交叉验证')
max_auc = 0.0
best_params = {}

# NOTE(review): max_bin (and possibly min_data_in_leaf) affect how the Dataset
# is built; re-using the already constructed `lgb_train` with different values
# may be rejected by LightGBM - confirm.
stages = [
    # Accuracy: num_leaves (default 31, should stay below 2^max_depth) and
    # max_depth (default -1 = unlimited; a sensible cap limits overfitting).
    ("调参1:提高准确率", {
        'num_leaves': range(5, 100, 5),
        'max_depth': range(3, 8, 1),
    }),
    # Overfitting: number of histogram bins and minimum samples per leaf.
    ("调参2:降低过拟合", {
        'max_bin': range(5, 256, 10),
        'min_data_in_leaf': range(1, 102, 10),
    }),
    # Overfitting: fraction of features / rows sampled per iteration
    # (both default to 1) and how often rows are re-sampled.
    ("调参3:降低过拟合", {
        'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
        'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
        'bagging_freq': range(0, 50, 5),
    }),
    # Overfitting: L1 / L2 regularisation strength.
    ("调参4:降低过拟合", {
        'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
        'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.4, 0.6, 0.7, 0.9, 1.0],
    }),
    # Overfitting: minimum loss reduction required to split a leaf
    # (default 0; larger values make the model more conservative).
    ("调参5:降低过拟合2", {
        'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    }),
]

for title, grid in stages:
    print(title)
    max_auc, winner = _search_stage(params, grid, max_auc)
    # Only accepted values are written back. The original left the last-tried
    # grid values in `params` whenever a stage found no improvement, and its
    # `'a' and 'b' in keys` guard tested only the final key.
    params.update(winner)
    best_params.update(winner)

print(best_params)
步骤5:将步骤4调参之后的模型重新训练,将最新预测的结果文件提交到比赛,截图分数;
根据上一步骤搜索到的参数调整 LightGBM,再次进行训练。
结果如下:
比赛截图分数:
任务7:多折训练与集成
步骤1:使用KFold完成数据划分;
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import f1_score
def culatescore(predict, real):
    """Record the macro-averaged F1 of ``predict`` against ``real``.

    The score is appended to the module-level ``scores`` list, and that same
    list (all scores recorded so far) is returned.
    """
    scores.append(f1_score(real, predict, average='macro'))
    return scores
# LightGBM parameters for the K-fold run that follows.
# NOTE(review): 'feature_fraction' and 'colsample_bytree' look like two names
# for the same LightGBM setting - confirm and keep only one.
# NOTE(review): 'num_boost_round' is set here and also passed as an argument
# to lgb.train further down - confirm which value takes effect.
params = {
'bagging_freq': 5,
'lambda_l1': 0.5,
'lambda_l2': 0.001,
'min_split_gain': 0.0,
'feature_fraction': 0.8,
'objective': 'binary',
'metric': ['auc', 'binary_logloss'],
'num_leaves': 2 ** 5,
'max_bin': 225,
'max_depth': 7,
'num_boost_round': 5000,
"learning_rate": 0.05,
"colsample_bytree": 0.8, # fraction of features randomly selected in each iteration
"bagging_fraction": 0.8, # fraction of the data used in each iteration
'min_child_samples': 25,
'n_jobs': -1,
'silent': True, # original note: setting this to 1 suppresses log output
'seed': 1000,
} # parameter settings
# State that is reset every time this cell runs.
results = []           # 0/1 validation predictions of the most recent fold
bigtestresults = []    # one list of 0/1 test predictions per fold
smalltestresults = []  # 0/1 test predictions of the most recent fold
scores = []            # macro-F1 of every fold, filled by culatescore()
# cat = ["CryoSleep", "county_name"]  # categorical features, for categorical_feature=cat

kf = KFold(n_splits=5, shuffle=True, random_state=123)
threshold = 0.5  # probability cut-off for the positive class
# NOTE(review): `encoder_test` is not defined in the visible source - confirm.
for fold_no, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
    print("第", fold_no, "次")
    # Select this fold's rows.
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    lgb_train = lgb.Dataset(x_train, y_train, silent=True)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train, silent=True)
    # verbose_eval: how often (in rounds) metrics are printed.
    # early_stopping_rounds: stop after this many rounds without improvement.
    # categorical_feature: tells LightGBM which columns are nominal, so they
    #   do not have to be expanded to one-hot first.
    # bagging_fraction (in params): used together with bagging_freq.
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=400,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=100,
        early_stopping_rounds=200,
        categorical_feature=cat_cols,
    )

    best_round = gbm.best_iteration
    vaild_preds = gbm.predict(x_valid, num_iteration=best_round)
    # Predict on the test set with the same model.
    test_pre = gbm.predict(encoder_test, num_iteration=best_round)

    # Binarise this fold's test predictions and store them for the vote.
    smalltestresults = [1 if w > threshold else 0 for w in test_pre]
    bigtestresults.append(smalltestresults)

    # Binarise this fold's validation predictions and score them with F1.
    results = [1 if pred > threshold else 0 for pred in vaild_preds]
    c = culatescore(results, y_valid)
    print("score {}".format(c))

print('---N折交叉验证分数---')
print(np.average(c))

# One row per fold, one column per test sample; the most frequent label in
# each column is used as the final prediction.
finalpres = pd.DataFrame(bigtestresults)
finaltask = []
lss = [finalpres.iloc[:, column].value_counts().index[0] for column in finalpres.columns]

sample = pd.read_csv('./data/sample_submission.csv')
mp_dict = {1: True, 0: False}
sample['Transported'] = lss
sample['Transported'] = sample['Transported'].map(mp_dict)
sample.to_csv('submission6.csv', index=False)
步骤2:使用StratifiedKFold完成数据划分;
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import f1_score
def culatescore(predict, real):
    """Compute macro F1 for one fold and accumulate it.

    Appends the score of ``predict`` versus ``real`` to the module-level
    ``scores`` list and returns that list.
    """
    fold_f1 = f1_score(real, predict, average='macro')
    scores.append(fold_f1)
    return scores
# LightGBM parameters for the StratifiedKFold run that follows.
# NOTE(review): 'feature_fraction' and 'colsample_bytree' look like two names
# for the same LightGBM setting - confirm and keep only one.
# NOTE(review): 'num_boost_round' is set here and also passed as an argument
# to lgb.train further down - confirm which value takes effect.
params = {
'bagging_freq': 5,
'lambda_l1': 0.5,
'lambda_l2': 0.001,
'min_split_gain': 0.0,
'feature_fraction': 0.8,
'objective': 'binary',
'metric': ['auc', 'binary_logloss'],
'num_leaves': 2 ** 5,
'max_bin': 225,
'max_depth': 7,
'num_boost_round': 5000,
"learning_rate": 0.05,
"colsample_bytree": 0.8, # fraction of features randomly selected in each iteration
"bagging_fraction": 0.8, # fraction of the data used in each iteration
'min_child_samples': 25,
'n_jobs': -1,
'silent': True, # original note: setting this to 1 suppresses log output
'seed': 1000,
} # parameter settings
# Containers re-initialised on every execution of this cell.
results = []           # binarised validation predictions (latest fold)
bigtestresults = []    # binarised test predictions, one entry per fold
smalltestresults = []  # binarised test predictions (latest fold)
scores = []            # per-fold macro-F1, appended to by culatescore()
# cat = ["CryoSleep", "county_name"]  # categorical features, for categorical_feature=cat

threshold = 0.5  # cut-off applied to predicted probabilities
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
# NOTE(review): `encoder_test` is not defined in the visible source - confirm.
for i, (train_index, valid_index) in enumerate(kf.split(X, y)):
    print("第", i + 1, "次")
    # Rows belonging to this fold.
    x_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    x_valid = X.iloc[valid_index]
    y_valid = y.iloc[valid_index]

    lgb_train = lgb.Dataset(x_train, y_train, silent=True)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train, silent=True)
    # verbose_eval prints metrics every N rounds; early_stopping_rounds stops
    # once the score has not improved for that many rounds.
    # categorical_feature names the nominal columns so LightGBM can split on
    # them directly, without a one-hot expansion.
    # bagging_fraction (in params) is used together with bagging_freq.
    train_kwargs = dict(
        num_boost_round=400,
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=100,
        early_stopping_rounds=200,
        categorical_feature=cat_cols,
    )
    gbm = lgb.train(params, lgb_train, **train_kwargs)

    vaild_preds = gbm.predict(x_valid, num_iteration=gbm.best_iteration)
    # Test-set probabilities from this fold's model.
    test_pre = gbm.predict(encoder_test, num_iteration=gbm.best_iteration)

    # Turn this fold's test probabilities into 0/1 and keep them for voting.
    smalltestresults = []
    for probability in test_pre:
        smalltestresults.append(int(probability > threshold))
    bigtestresults.append(smalltestresults)

    # Turn the validation probabilities into 0/1 and evaluate F1.
    results = [int(probability > threshold) for probability in vaild_preds]
    c = culatescore(results, y_valid)
    print("score {}".format(c))

print('---N折交叉验证分数---')
print(np.average(c))

# Stack the per-fold test predictions (rows = folds, columns = samples) and
# take the most frequent label of every column as the final answer.
finalpres = pd.DataFrame(bigtestresults)
finaltask = []
lss = []  # final 0/1 prediction per test sample
for column in finalpres.columns:
    majority_label = finalpres.iloc[:, column].value_counts().index[0]
    lss.append(majority_label)

sample = pd.read_csv('./data/sample_submission.csv')
mp_dict = {1: True, 0: False}
sample['Transported'] = lss
sample['Transported'] = sample['Transported'].map(mp_dict)
# NOTE(review): same output file name as the KFold section - confirm that
# overwriting that submission is intended.
sample.to_csv('submission6.csv', index=False)
步骤3:使用StratifiedKFold配合LightGBM完成模型的训练和预测
步骤4:在步骤3训练得到了多少个模型,对测试集多次预测,将最新预测的结果文件提交到比赛,截图分数;
使用5折训练,得到了5个模型,对测试集进行了5次预测。比赛得分如下:
步骤5:使用交叉验证训练5个机器学习模型(svm、lr等),使用stacking完成集成,将最新预测的结果文件提交到比赛,截图分数;
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
# Five heterogeneous base learners combined with scikit-learn's
# StackingClassifier; a logistic regression is the final estimator.
clf1 = LogisticRegression(random_state=2022, tol=1e-6, max_iter=10000)  # logistic regression
clf2 = DecisionTreeClassifier(random_state=2022)  # decision tree
clf3 = SVC(probability=True, random_state=2022, tol=1e-6)  # SVM
clf4 = RandomForestClassifier(n_estimators=100, random_state=2022)  # random forest
clf5 = GradientBoostingClassifier(random_state=2022)  # GBDT

# The earlier StackingClassifier(classifiers=..., meta_classifier=...) call
# was removed: sklearn.ensemble.StackingClassifier takes `estimators` /
# `final_estimator`, so those keywords raise TypeError, and its result was
# overwritten by the call below anyway.
sclf = StackingClassifier(
    estimators=[('lr', clf1), ('dvtree', clf2), ('svm', clf3), ('random', clf4), ('gdbt', clf5)],
    final_estimator=LogisticRegression()
)
# NOTE(review): y_train is rebound inside the cross-validation loops earlier
# in the file - confirm it still matches encoder_train row-for-row here.
sclf.fit(encoder_train, y_train)