PNN (Probabilistic Neural Network) — Python implementation
(Source: original blog link placeholder)
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 12 13:21:13 2018
@author: lj
"""
import numpy as np
import math
import copy
def load_data(file_name):
    '''Load a tab-separated sample file.

    Each line holds the feature values followed by an integer label,
    all separated by tabs.

    input:  file_name(string): path of the data file
    output: feature_data(mat): feature matrix, one row per sample
            label(list): integer label of each sample

    NOTE: the original docstring also promised an ``n_class`` output,
    but the function never computed or returned it; the docstring is
    corrected here instead of changing the interface.
    '''
    feature_data = []
    label = []
    # 'with' guarantees the file is closed even if a line fails to parse
    with open(file_name) as f:
        for line in f:
            lines = line.strip().split("\t")
            # every field except the last one is a feature
            feature_data.append([float(x) for x in lines[:-1]])
            label.append(int(lines[-1]))
    return np.mat(feature_data), label
def Normalization(data):
    '''Scale every sample (row) of the feature matrix to unit L2 norm.

    input:  data(mat): sample feature matrix, one sample per row
    output: Nor_feature(mat): matrix whose rows are the input rows
            divided by their Euclidean length
    '''
    num_samples, num_features = np.shape(data)
    normalized = copy.deepcopy(data)
    # Euclidean length of every row, shape (num_samples, 1)
    row_norms = np.sqrt(np.sum(np.square(data), axis=1))
    # divide column by column so the copy is updated in place
    for col in range(num_features):
        normalized[:, col] = normalized[:, col] / row_norms
    return normalized
def distance(X, Y):
    '''Squared Euclidean distance between two row samples.

    Returns a (1, 1) matrix: the per-row sum of squared differences.
    '''
    diff = X - Y
    return np.sum(np.multiply(diff, diff), axis=1)
def distance_mat(Nor_trainX, Nor_testX):
    '''Pairwise squared Euclidean distances between test and train samples.

    input:  Nor_trainX(mat): normalized training samples, one per row
            Nor_testX(mat): normalized test samples, one per row
    output: Euclidean_D(mat): (n_test x n_train) matrix of squared
            distances (note: squared, no square root is taken)
    '''
    n_train = np.shape(Nor_trainX)[0]
    n_test = np.shape(Nor_testX)[0]
    dist = np.mat(np.zeros((n_test, n_train)))
    for row in range(n_test):
        for col in range(n_train):
            # sum of squared differences of the two rows
            diff = Nor_testX[row, :] - Nor_trainX[col, :]
            dist[row, col] = np.sum(np.square(diff), axis=1)[0, 0]
    return dist
def Gauss(Euclidean_D,sigma):
'''测试样本与训练样本的距离矩阵对应的Gauss矩阵
input:Euclidean_D(mat):测试样本与训练样本的距离矩阵
sigma(float):Gauss函数的标准差
output:Gauss(mat):Gauss矩阵
'''
m,n = np.shape(Euclidean_D)
Gauss = np.mat(np.zeros((m,n)))
for i in range(m):
for j in range(n):
Gauss[i,j] = math.exp(- Euclidean_D[i,j] / (2 * (sigma ** 2)))
return Gauss
def Prob_mat(Gauss_mat, labelX):
    '''Per-class probability matrix for every test sample.

    For each test sample the kernel values of all training samples of
    the same class are summed, then every row is normalised to sum to 1.

    input:  Gauss_mat(mat): Gaussian kernel matrix (n_test x n_train)
            labelX(list): labels of the training samples
    output: Prob_mat(mat): (n_test x n_class) row-normalised scores
            label_class(list): distinct labels, in first-appearance order
    '''
    # distinct labels, keeping the order in which they first appear
    label_class = []
    for lab in labelX:
        if lab not in label_class:
            label_class.append(lab)
    # label -> column index lookup avoids the inner class scan
    class_index = {lab: idx for idx, lab in enumerate(label_class)}
    n_test, n_train = np.shape(Gauss_mat)
    Prob = np.mat(np.zeros((n_test, len(label_class))))
    # accumulate the kernel mass of every class
    for i in range(n_test):
        for j in range(n_train):
            Prob[i, class_index[labelX[j]]] += Gauss_mat[i, j]
    # normalise each row so the class scores sum to one
    return Prob / np.sum(Prob, axis=1), label_class
def calss_results(Prob, label_class):
    '''Pick the most probable class for every test sample.

    (The misspelled name is kept: it is the public interface.)

    input:  Prob(mat): per-class probability matrix, one row per sample
            label_class(list): class labels indexed by column
    output: results(list): predicted label of each sample
    '''
    winners = np.argmax(Prob, axis=1)  # (n, 1) matrix of column indices
    return [label_class[winners[k, 0]] for k in range(len(winners))]
if __name__ == '__main__':
    # 1. load the data set
    print ("--------- 1.load data ------------")
    trainX, labelX = load_data("data.txt")
    # 2. normalise every sample to unit length
    Nor_trainX = Normalization(trainX)
    Nor_testX = Normalization(trainX[100:300, :])
    # 3. squared-distance matrix -> Gaussian kernel -> class probabilities
    Euclidean_D = distance_mat(Nor_trainX, Nor_testX)
    Gauss_mat = Gauss(Euclidean_D, 0.1)
    Prob, label_class = Prob_mat(Gauss_mat, labelX)
    # 4. predicted class of every test sample
    predict_results = calss_results(Prob, label_class)
SVM (binary classification) — TensorFlow 1.x linear soft-margin SVM on the iris data set
(Source: original blog link placeholder)
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from sklearn import datasets

sess = tf.compat.v1.Session()
# sess = tf.Session()

# Load the data
# iris.data = [(Sepal Length, Sepal Width, Petal Length, Petal Width)]
iris = datasets.load_iris()
x_vals = np.array([[row[0], row[3]] for row in iris.data])
y_vals = np.array([1 if target == 0 else -1 for target in iris.target])

# Split into training and test sets (80 / 20)
train_indices = np.random.choice(len(x_vals),
                                 round(len(x_vals) * 0.8),
                                 replace=False)
test_indices = np.array(list(set(range(len(x_vals))) - set(train_indices)))
x_vals_train = x_vals[train_indices]
x_vals_test = x_vals[test_indices]
y_vals_train = y_vals[train_indices]
y_vals_test = y_vals[test_indices]

batch_size = 100

# Placeholders for the mini-batch feed
x_data = tf.placeholder(shape=[None, 2], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# Trainable parameters of the separating line
A = tf.Variable(tf.random_normal(shape=[2, 1]))
b = tf.Variable(tf.random_normal(shape=[1, 1]))

# Linear model: output = x*A - b
model_output = tf.subtract(tf.matmul(x_data, A), b)

# Squared L2 norm of the weight vector
l2_norm = tf.reduce_sum(tf.square(A))

# Soft-margin loss: max(0, 1 - pred*actual) + alpha * L2_norm(A)^2
alpha = tf.constant([0.01])
classification_term = tf.reduce_mean(tf.maximum(0., tf.subtract(1., tf.multiply(model_output, y_target))))
loss = tf.add(classification_term, tf.multiply(alpha, l2_norm))

my_opt = tf.train.GradientDescentOptimizer(0.01)
train_step = my_opt.minimize(loss)
init = tf.global_variables_initializer()
sess.run(init)

# Training loop
loss_vec = []
train_accuracy = []
test_accuracy = []
for step in range(20000):
    rand_index = np.random.choice(len(x_vals_train), size=batch_size)
    rand_x = x_vals_train[rand_index]
    rand_y = np.transpose([y_vals_train[rand_index]])
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})

# Extract the fitted coefficients.
# NOTE: `b` is rebound here from the tf.Variable to its scalar value,
# which matches the original script's behaviour.
[[a1], [a2]] = sess.run(A)
[[b]] = sess.run(b)
slope = -a2 / a1
y_intercept = b / a1

# Evaluate the separating line at every petal-width value
x1_vals = [point[1] for point in x_vals]
best_fit = [slope * x1 + y_intercept for x1 in x1_vals]

# Separate I. setosa
setosa_x = [point[1] for idx, point in enumerate(x_vals) if y_vals[idx] == 1]
setosa_y = [point[0] for idx, point in enumerate(x_vals) if y_vals[idx] == 1]
not_setosa_x = [point[1] for idx, point in enumerate(x_vals) if y_vals[idx] == -1]
not_setosa_y = [point[0] for idx, point in enumerate(x_vals) if y_vals[idx] == -1]

plt.plot(setosa_x, setosa_y, 'o', label='I. setosa')
plt.plot(not_setosa_x, not_setosa_y, 'x', label='Non-setosa')
plt.plot(x1_vals, best_fit, 'r-', label='Linear Separator', linewidth=3)
plt.ylim([0, 10])
plt.legend(loc='lower right')
plt.title('Sepal Length vs Pedal Width')
plt.xlabel('Pedal Width')
plt.ylabel('Sepal Length')
plt.show()
SVM — MATLAB support-vector regression on a CSV data set
% Input and output data
tic
data=csvread('1NewTrainTest.csv');
[M,N]=size(data);
input=data(:,1:N-1);
output=data(:,N);
% Train on all but the last 5 rows; test on the last 5
% (the variable name TestNum is kept for compatibility, but it is
% actually the number of TRAINING rows)
TestNum=M-5;
input_train=input(1:TestNum,:)';
output_train=output(1:TestNum,:)';
input_test=input(TestNum+1:M,:)';
output_test=output(TestNum+1:M,:)';
% Normalise the training data to [-1, 1]
[inputn,inputps]=mapminmax(input_train);
[outputn,outputps]=mapminmax(output_train);
% Train the SVM regression model
Mdl=fitrsvm(inputn',outputn');
% Normalise the test inputs with the training-set mapping
inputn_test=mapminmax('apply',input_test,inputps);
% Predict and de-normalise the outputs
an=predict(Mdl,inputn_test');
SVMoutput=mapminmax('reverse',an,outputps);
% Prediction error metrics.
% BUG FIX: rmse/mape were commented out yet referenced by the final
% fprintf (undefined-variable error), the format string passed two
% values to a single %f, and tic had no matching toc.
% plot(SVMoutput,'r');
% hold on;
% plot(output_test,'b')
b=SVMoutput;
d=output_test';
rmse=sqrt(sum((b-d).^2)/length(b));
mape=sum(abs(b-d)./d)/length(b);
mae=sum(abs(b-d))/length(b);
toc
fprintf('RMSE = %f, MAPE = %f\n\n',rmse,mape);
SVM — MATLAB one-vs-rest multiclass classification on synthetic 2-D data
clc;
clear;
close all;
tic
fprintf('-----已开始请等待-----\n\n');
%% Synthetic data generation (can be skipped)
% 20 base points with 2 features, 4 points per cluster
data = [0.4,0.3;-0.5,0.1;-0.2,-0.3;0.5,-0.3;
2.1,1.9;1.8,2.2;1.7,2.5;2.3,1.6;
-2.2,1.6;-1.9,2.1;-1.7,2.6;-2.3,2.5;
-3.1,-1.9;-2.8,-2.1;-1.9,-2.5;-2.3,-3.2;
3.9,-3.5;2.8,-2.2;1.7,-3.1;2.5,-3.4];
% Three noisy copies of the base points
data1 = data + 2.5*rand(20,2);
data2 = data + 2.5*rand(20,2);
% BUG FIX: removed the dead expression statement "data1(17:20,:);"
% that followed this assignment (it evaluated a slice and discarded it)
data3 = data + 2.5*rand(20,2);
% Training data: 12 samples per class, rows grouped by class
train_data = [data1(1:4,:);data2(1:4,:);data3(1:4,:);
data1(5:8,:);data2(5:8,:);data3(5:8,:);
data1(9:12,:);data2(9:12,:);data3(9:12,:);
data1(13:16,:);data2(13:16,:);data3(13:16,:);
data1(17:20,:);data2(17:20,:);data3(17:20,:)];
% Scatter plot of the training samples
figure;
% gscatter draws grouped discrete points;
% group_train holds the class of every training row
group_train = [1;1;1;1;1;1;1;1;1;1;1;1;
2;2;2;2;2;2;2;2;2;2;2;2;
3;3;3;3;3;3;3;3;3;3;3;3;
4;4;4;4;4;4;4;4;4;4;4;4;
5;5;5;5;5;5;5;5;5;5;5;5];
gscatter(train_data(:,1),train_data(:,2),group_train);
title('训练数据样本分布');
xlabel('样本特征1');
ylabel('样本特征2');
legend('Location','Northwest');
grid on;
%%
% Test data: the same base points with a different random offset
test_data = data + 3.0*rand(20,2);
test_features = test_data;
% Ground-truth labels of the test data (4 samples per class)
test_labels = [1;1;1;1;2;2;2;2;3;3;3;3;4;4;4;4;5;5;5;5];
%%
% The training data falls into 5 classes (one-vs-rest scheme):
% positives for class i = all 12 samples of class i; negatives =
% the same number of samples drawn at random from the other classes
% Class 1
class1_p = train_data(1:12,:);
% randperm(n,k) returns k indices picked at random from 1..n
index1 = randperm(48,12);
% Pick the negative samples from the remaining classes
train_data_c = train_data;
train_data_c(1:12,:) = [];
class1_n = train_data_c(index1,:);
train_features1 = [class1_p;class1_n];
% Positive class is labelled 1, negative class -1
train_labels1 = [ones(12,1);-1*ones(12,1)];
% Class 2
class2_p = train_data(13:24,:);
% randperm(n,k) returns k indices picked at random from 1..n
index1 = randperm(48,12);
% Pick the negative samples from the remaining classes
train_data_c = train_data;
train_data_c(13:24,:) = [];
class2_n = train_data_c(index1,:);
train_features2 = [class2_p;class2_n];
% Positive class is labelled 1, negative class -1
train_labels2 = [ones(12,1);-1*ones(12,1)];
% Class 3
class3_p = train_data(25:36,:);
% randperm(n,k) returns k indices picked at random from 1..n
index1 = randperm(48,12);
% Pick the negative samples from the remaining classes
train_data_c = train_data;
train_data_c(25:36,:) = [];
class3_n = train_data_c(index1,:);
train_features3 = [class3_p;class3_n];
% Positive class is labelled 1, negative class -1
train_labels3 = [ones(12,1);-1*ones(12,1)];
% Class 4
class4_p = train_data(37:48,:);
% randperm(n,k) returns k indices picked at random from 1..n
index1 = randperm(48,12);
% Pick the negative samples from the remaining classes
train_data_c = train_data;
train_data_c(37:48,:) = [];
class4_n = train_data_c(index1,:);
train_features4 = [class4_p;class4_n];
% Positive class is labelled 1, negative class -1
train_labels4 = [ones(12,1);-1*ones(12,1)];
% Class 5
class5_p = train_data(49:60,:);
% randperm(n,k) returns k indices picked at random from 1..n
index1 = randperm(48,12);
% Pick the negative samples from the remaining classes
train_data_c = train_data;
train_data_c(49:60,:) = [];
class5_n = train_data_c(index1,:);
train_features5 = [class5_p;class5_n];
% Positive class is labelled 1, negative class -1
train_labels5 = [ones(12,1);-1*ones(12,1)];
%%
% Train one one-vs-rest SVM per class.
% BUG FIX: the label vectors are numeric (+1/-1), so 'ClassNames' must
% be numeric too; the original cell array {'-1','1'} makes fitcsvm
% error because ClassNames must match the type of the label argument.
model1 = fitcsvm(train_features1,train_labels1,'ClassNames',[-1,1]);
model2 = fitcsvm(train_features2,train_labels2,'ClassNames',[-1,1]);
model3 = fitcsvm(train_features3,train_labels3,'ClassNames',[-1,1]);
model4 = fitcsvm(train_features4,train_labels4,'ClassNames',[-1,1]);
model5 = fitcsvm(train_features5,train_labels5,'ClassNames',[-1,1]);
fprintf('-----模型训练完毕-----\n\n');
%%
% label is an n*1 vector holding the predicted label of each test sample;
% score is an n*2 matrix: column 1 is the "negative" score and column 2
% the "positive" score of each sample.
% Run every test sample through the 5 trained one-vs-rest SVMs
[label1,score1] = predict(model1,test_features);
[label2,score2] = predict(model2,test_features);
[label3,score3] = predict(model3,test_features);
[label4,score4] = predict(model4,test_features);
[label5,score5] = predict(model5,test_features);
% The model with the highest "positive" score wins: its index becomes
% the final predicted label of the sample
score = [score1(:,2),score2(:,2),score3(:,2),score4(:,2),score5(:,2)];
% Final labels: k*1 vector, k = number of test samples
final_labels = zeros(20,1);
for i = 1:size(final_labels,1)
% Maximum of each row and its position
[m,p] = max(score(i,:));
% The position is the class label
final_labels(i,:) = p;
end
fprintf('-----样本预测完毕-----\n\n');
% Classification evaluation metrics: per-class precision, recall, F-measure
group = test_labels; % true labels
grouphat = final_labels; % predicted labels
[C,order] = confusionmat(group,grouphat,'Order',[1;2;3;4;5]); % 'Order' fixes the class ordering of the confusion matrix
% Precision = diagonal / column sum; recall = diagonal / row sum.
% NOTE(review): if a class is never predicted the column sum is 0 and
% the precision becomes NaN — accepted here, no guard is added.
c1_p = C(1,1) / sum(C(:,1));
c1_r = C(1,1) / sum(C(1,:));
c1_F = 2*c1_p*c1_r / (c1_p + c1_r);
fprintf('c1类的查准率为%f,查全率为%f,F测度为%f\n\n',c1_p,c1_r,c1_F);
c2_p = C(2,2) / sum(C(:,2));
c2_r = C(2,2) / sum(C(2,:));
c2_F = 2*c2_p*c2_r / (c2_p + c2_r);
fprintf('c2类的查准率为%f,查全率为%f,F测度为%f\n\n',c2_p,c2_r,c2_F);
c3_p = C(3,3) / sum(C(:,3));
c3_r = C(3,3) / sum(C(3,:));
c3_F = 2*c3_p*c3_r / (c3_p + c3_r);
fprintf('c3类的查准率为%f,查全率为%f,F测度为%f\n\n',c3_p,c3_r,c3_F);
c4_p = C(4,4) / sum(C(:,4));
c4_r = C(4,4) / sum(C(4,:));
c4_F = 2*c4_p*c4_r / (c4_p + c4_r);
fprintf('c4类的查准率为%f,查全率为%f,F测度为%f\n\n',c4_p,c4_r,c4_F);
c5_p = C(5,5) / sum(C(:,5));
c5_r = C(5,5) / sum(C(5,:));
c5_F = 2*c5_p*c5_r / (c5_p + c5_r);
fprintf('c5类的查准率为%f,查全率为%f,F测度为%f\n\n',c5_p,c5_r,c5_F);
% Side-by-side scatter plots: true vs predicted class assignments
figure;
subplot(121);
% gscatter draws grouped discrete points;
% the grouping vector holds the class of every coordinate
group_test = test_labels;
gscatter(test_data(:,1),test_data(:,2),group_test);
title('测试数据样本真实分布');
xlabel('样本特征1');
ylabel('样本特征2');
legend('Location','Northwest');
grid on;
subplot(122);
% Same plot, but coloured by the predicted labels
group_test = final_labels;
gscatter(test_data(:,1),test_data(:,2),group_test);
title('测试数据样本预测分布');
xlabel('样本特征1');
ylabel('样本特征2');
legend('Location','Northwest');
grid on;
KELM (Kernel Extreme Learning Machine)
(Source: original blog link placeholder)