import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer #预处理函数 处理缺失值
from sklearn.model_selection import train_test_split #自动分成训练集和测试集
from sklearn.metrics import classification_report #预测结果评估函数
from sklearn.neighbors import KNeighborsClassifier #K近邻分类起
from sklearn.tree import DecisionTreeClassifier #决策树分类起
from sklearn.naive_bayes import GaussianNB #高斯朴树贝叶斯分类起
def load_dataset(feature_path, label_path, n_features=41):
    """Read feature files and label files and merge each group into one array.

    Args:
        feature_path: Iterable of paths to comma-separated feature files.
            The files have no header row and use ``?`` for missing values.
        label_path: Iterable of paths to label files without a header row.
        n_features: Number of columns every feature file must contain.
            Defaults to 41.

    Returns:
        A ``(feature, label)`` tuple. ``feature`` is a 2-D float array of
        shape ``(total_rows, n_features)``; ``label`` is the 1-D array of
        all labels. Rows appear in the order the files were given.

    Raises:
        ValueError: If a feature file does not have ``n_features`` columns
            or a label file does not have exactly one column (raised by
            ``np.concatenate``).
    """
    # Seed each list with an empty array of the expected width so that an
    # empty path list still yields correctly shaped output and a file with
    # the wrong column count is rejected when the parts are joined.
    feature_parts = [np.empty((0, n_features))]
    for file in feature_path:
        # Comma-separated, no header row, '?' is parsed as NaN.
        df = pd.read_csv(file, delimiter=',', na_values='?', header=None)
        # Fill every missing cell with the mean of its column in this file.
        # A column with no observed values at all has no mean and stays NaN.
        df = df.fillna(df.mean())
        feature_parts.append(np.asarray(df, dtype=float))

    label_parts = [np.empty((0, 1))]
    for file in label_path:
        # Label files have no header row.
        df = pd.read_table(file, header=None)
        label_parts.append(np.asarray(df))

    # Join once at the end instead of re-copying the growing array per file.
    feature = np.concatenate(feature_parts)
    label = np.ravel(np.concatenate(label_parts))  # flatten to 1-D
    return feature, label
if __name__ == '__main__':
    # Train three classifiers (k-nearest neighbours, decision tree, Gaussian
    # naive Bayes) on files A-D and report how each performs on file E.
    feature_paths = ['A.feature', 'B.feature', 'C.feature', 'D.feature', 'E.feature']
    label_paths = ['A.label', 'B.label', 'C.label', 'D.label', 'E.label']

    # The first four files form the training set.
    x_train, y_train = load_dataset(feature_paths[:4], label_paths[:4])
    # The remaining file is the test set.
    x_test, y_test = load_dataset(feature_paths[4:], label_paths[4:])

    # Shuffle the training rows. An explicit permutation is used because
    # train_test_split rejects test_size=0.0 in current scikit-learn.
    order = np.random.permutation(len(x_train))
    x_train, y_train = x_train[order], y_train[order]

    classifiers = [
        ('knn', KNeighborsClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('Bayes', GaussianNB()),
    ]

    # Fit every model and keep its predictions; the reports are printed
    # afterwards so that all progress messages come first.
    predictions = []
    for name, clf in classifiers:
        print('Start training ' + name)
        clf.fit(x_train, y_train)
        print('Training done')
        predictions.append((name, clf.predict(x_test)))
        print('Prediction done')

    # classification_report(true labels, predicted labels) prints
    # precision, recall, f1-score and support for each class.
    for name, answer in predictions:
        print('\n\nThe classification report for ' + name + ':')
        print(classification_report(y_test, answer))
# 利用 KNN、决策树、高斯朴素贝叶斯对行为进行预测
# 最新推荐文章于 2022-12-16 10:16:51 发布