1.进入服务器
2. 进入有数据集的机器学习文件夹
cd /public/home/xx/xx/xx/xx
3.进入机器学习环境
conda activate xx
注意:这时需要先进入 python 交互环境,才能 import 各种包
python
4.导入各种包(根据自己的调整。这里代码仅供展示)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
import joblib
5.读取测试集和训练集的数据
# Load the training and test datasets from CSV files in the current directory.
train_data, test_data = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))
如果报错 FileNotFoundError: [Errno 2] No such file or directory: 'train.csv',说明路径不对。试着换成绝对路径或相对路径,我改成了这样:
# Fallback: the CSVs live one directory above the working directory,
# so load them via relative parent paths instead.
train_data = pd.read_csv('../train.csv')
test_data = pd.read_csv('../test.csv')
如果是excel文件记得换成
# For Excel sources, use read_excel instead of read_csv.
data = pd.read_excel('xx.xlsx')
6.准备特征和标签
# Separate the training frame into features (X) and the label column (y).
label_col = 'target_column'
y = train_data[label_col]
X = train_data.drop(columns=[label_col])
如果不清楚测试集和训练集的列名可以
# Inspect the column names so the correct label column can be identified.
print(train_data.columns)
然后target_column改成对应的列名
7.划分训练集和验证集
# Hold out 20% of the rows as a validation set; fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
8.初始化 XGBoost 分类器
# Instantiate the XGBoost classifier with default hyperparameters.
model = XGBClassifier()
9.使用五折交叉验证进行模型评估
# Estimate generalization with 5-fold cross-validation on the full training
# data, scored by ROC AUC.
# NOTE: this raises ValueError if X still contains object-dtype columns
# (e.g. a raw 'mutations' string column) unless enable_categorical=True
# is set on the model — see the troubleshooting note below.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
如果报错ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`. Invalid columns:mutations: object
是因为在XGBoost中使用含有分类(object 类型)数据的列时,需要将 enable_categorical 参数设置为 True。此外,如果要将 DataFrame 中的列转换为分类类型,则需要确保相应的列已经被正确转换。
也就意味着你选错了特征列需要重新选择特征列和标签
# Re-select only numeric / one-hot encoded feature columns; XGBoost cannot
# ingest raw object-dtype columns (such as the original 'mutations' string
# column), which is what triggered the ValueError above.
# NOTE: the stray '>>>' REPL prompt from the pasted session has been removed —
# with it, this snippet is a SyntaxError when copied into a script.
selected_features = ['OneHot1', 'OneHot2', 'OneHot3', 'OneHot4', 'OneHot5', 'OneHot6',
                     'OneHot7', 'OneHot8', 'OneHot9', 'OneHot10', 'OneHot11', 'OneHot12',
                     'OneHot13', 'OneHot14', 'OneHot15', 'OneHot16', 'OneHot17', 'OneHot18',
                     'OneHot19', 'OneHot20', 'OneHot21', 'OneHot22', 'OneHot23', 'OneHot24',
                     'OneHot25', 'OneHot26', 'OneHot27', 'OneHot28', 'OneHot29', 'OneHot30',
                     'OneHot31', 'OneHot32', 'OneHot33', 'OneHot34', 'OneHot35', 'OneHot36',
                     'OneHot37', 'OneHot38', 'OneHot39', 'OneHot40', 'gMVP', 'ESM1b',
                     'AlphaMissence', 'AlphaScore', 'DEOGEN2', 'REVEL', 'CADD', 'Solv_acc',
                     'B_factor']
然后再次划分训练集和验证集然后再评估
# Rebuild X/y from the curated feature list and the 'Label' column.
X = train_data[selected_features]
y = train_data['Label']
# Split into training and validation sets again (80/20, fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# Re-initialize the XGBoost classifier with default hyperparameters.
model = XGBClassifier()
# Re-run 5-fold cross-validation on the corrected feature matrix (ROC AUC).
cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
10.训练模型
# Fit the classifier on the 80% training split.
model.fit(X_train, y_train)
11.验证模型
# Predict class labels for the held-out validation features.
y_pred = model.predict(X_valid)
12.评估模型性能
# Evaluate the model on the held-out validation split.
# FIX: `metrics` was never imported anywhere in this walkthrough (only
# train_test_split / cross_val_score / XGBClassifier / joblib were), so the
# original lines raised NameError. Import sklearn.metrics here.
from sklearn import metrics

accuracy = metrics.accuracy_score(y_valid, y_pred)
precision = metrics.precision_score(y_valid, y_pred)
recall = metrics.recall_score(y_valid, y_pred)
f1_score = metrics.f1_score(y_valid, y_pred)
mcc = metrics.matthews_corrcoef(y_valid, y_pred)
13.输出模型指标
# Report the cross-validation and hold-out evaluation metrics.
for _label, _value in (
    ("Cross-validation AUC scores:", cv_scores),
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1_score),
    ("MCC:", mcc),
):
    print(_label, _value)
14. 如果模型性能好,则保存模型
# Persist the trained model to disk (joblib serialization) for later reuse.
joblib.dump(model, 'xgboost_model.pkl')