接触CTG数据很久了,今天突然想起来之前刚入门的时候几乎所有的机器学习模型用的数据集都是iris、horse,现在就用CTG的数据来跑一下吧,就当是帮助一些医学类机器学习算法的人提前入门。
CTG的数据直接点击下载,下载后看表的说明,把需要的那一部分(不知道是哪一部分的可以看我代码里面的feature和标签'NSP')截下来存成一个csv文件,然后手动把含有空白格的那些行删掉,或者使用pandas的dropna去掉含有空值的行。
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Console display settings: full (untruncated) DataFrame output and
# 5-decimal precision for numpy arrays.
np.set_printoptions(precision=5)
for _opt, _val in (
        ('display.max_columns', None),  # no limit on displayed columns
        ('display.max_rows', None),     # no limit on displayed rows
        ('display.width', 20000),       # up to 20000 chars per line
):
    pd.set_option(_opt, _val)
# Feature column names taken from the CTG data sheet.
feature = ['LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'Width', 'Min', 'Max',
           'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'Tendency']
# Read the CSV. NSP label: 1 = normal, 2 = suspect, 3 = pathologic.
# Renamed `set` -> `data` (the original shadowed the built-in `set`), and the
# file path is passed directly so pandas closes the file itself (the original
# `open(...)` handle was never closed).
data = pd.read_csv(
    r'D:\Desktop\Desktop\实验室\Python\LGB\CTG.csv', encoding='gbk')
# Train/test split, 80/20 (test_size=0.2); change test_size as needed.
X_train, X_test, y_train, y_test = train_test_split(
    data[feature],
    data['NSP'],
    test_size=0.2,
    random_state=1,
    stratify=data['NSP']  # keep the class distribution of y identical in both splits
)
# Grid-search space; comment this section out if no tuning is needed.
# NOTE(review): range(300, 400, 100) yields the single candidate 300 —
# widen the bounds or shrink the step if a real sweep over n_estimators
# was intended.
parameter_space = {
    'n_estimators': range(300, 400, 100),
    # 'validation_fraction': [i/10.0 for i in range(1, 10)],
    'max_features': range(5, 20, 3),   # 5, 8, 11, 14, 17
    'max_leaf_nodes': range(2, 7, 1),  # 2, 3, 4, 5, 6
}
# Manually configured model (fit path 1).
# Only non-default hyperparameters are passed. The original also passed
# presort='auto', min_impurity_split=None and loss='deviance', which were
# removed from scikit-learn in 0.24, 1.0 and 1.3 respectively and now raise
# errors; every other original argument was the library default. Omitting
# them preserves behavior on old versions and works on current ones.
gbc = GradientBoostingClassifier(learning_rate=0.1,
                                 max_depth=16,
                                 min_samples_leaf=0.05,
                                 min_samples_split=35,
                                 n_estimators=350,
                                 random_state=44)
# Search for the best hyperparameters; comment this section out if no tuning is needed.
grid = GridSearchCV(gbc, parameter_space, n_jobs=-1, cv=5, verbose=5)
# Cast labels to int so every fit in this script sees the same label dtype.
grid.fit(X_train, y_train.astype('int'))
print('最好的参数设置为:')
print(grid.best_params_)
bclt = grid.best_estimator_
# Fit path 2: to skip tuning, uncomment the next line (and comment the grid search above).
#bclt = gbc
# NOTE: best_estimator_ is already refit on the full training set by
# GridSearchCV (refit=True is the default); this fit only matters when the
# untuned `gbc` fallback above is used. Labels are int-cast for consistency
# with the grid-search fit (the original refit used the raw labels).
bclt = bclt.fit(X_train, y_train.astype('int'))
# Int-cast ground-truth labels for the evaluation step.
y_true = y_test.astype('int')
print('Start predicting...')
# Predicted class labels and per-class probabilities.
y_pred = bclt.predict(X_test)
y_pred_pro = bclt.predict_proba(X_test)
# Probability column for class label 1 (the "normal" NSP class).
y_score = pd.DataFrame(y_pred_pro, columns=bclt.classes_.tolist())[1].values
# Export predicted labels, one per line.
# (The original also looped computing int(np.argmax(pred)) per prediction,
# but predict() returns scalar labels, so argmax was always 0 and the
# result was never used — dead code, removed.)
with open('Result.txt', 'w+', encoding='utf-8') as f:
    for y in y_pred:
        f.write(str(y) + '\n')
# Evaluate against y_true (the int-cast labels defined above). The original
# passed raw y_test here, leaving y_true computed but unused and the label
# dtype inconsistent with the int-cast labels used for fitting.
m = confusion_matrix(y_true, y_pred)
print('Confusion Matrix', m, sep='\n')
r = classification_report(y_true, y_pred)
print('The Classification Report', r, sep='\n')
print('Finish!')
就这样了。