python机器学习。
代写+q: one three five one zero nine seven four three one
线性判别 (LDA)
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# --- Load data -------------------------------------------------------------
# Load directly as float instead of the original str -> float64 round trip.
filename = 'data_LDA.txt'
train_data = np.loadtxt(filename, delimiter=',', dtype=np.float64)

data_num = len(train_data)
feat_num = train_data.shape[1]        # number of features per sample (2 here)
class_num = 2
# Samples are assumed to be stored class-by-class in equal-sized chunks.
each_class_num = data_num // class_num

# --- Per-class means and within-class scatter matrix S_w -------------------
# m: row i holds the mean of class i (was hardcoded to shape [2, 2]).
m = np.zeros([class_num, feat_num])
m0 = np.mean(train_data, axis=0)      # overall mean of all samples
S_LDA_w = np.zeros([feat_num, feat_num])
for i in range(class_num):
    temp_data = train_data[i * each_class_num:(i + 1) * each_class_num, :]
    m[i] = np.mean(temp_data, axis=0)
    S_LDA_w = S_LDA_w + np.dot((temp_data - m[i]).T, (temp_data - m[i])) / data_num

# --- Between-class scatter matrix S_b --------------------------------------
# Every class has the same number of samples, so the class weights reduce
# to a single 1/class_num factor.
S_LDA_b = np.dot((m - m0).T, (m - m0)) / class_num

# --- Solve inv(S_w) S_b w = lambda w ---------------------------------------
eigenvalue, featurevector = np.linalg.eig(np.linalg.inv(S_LDA_w).dot(S_LDA_b))
# Projection direction = eigenvector of the largest eigenvalue.
newspace = featurevector[:, np.argmax(eigenvalue)]

# Slope of the projection line y = k * x in the original 2-D space.
k = newspace[1] / newspace[0]
new_data = train_data.dot(newspace)

# Orthogonally project every sample onto the line y = k*x:
#   x' = (x + k*y) / (1 + k^2),  y' = k * x'
new_data_map = np.zeros_like(train_data)
for i in range(data_num):
    new_data_map[i][0] = train_data[i].dot([[1], [k]]) / (k * k + 1)
    new_data_map[i][1] = new_data_map[i][0] * k

# --- Plot the original data, then the data projected onto the LDA line -----
# BUG FIX: slice bounds were hardcoded to 201/402; use each_class_num so the
# script works for any (even) number of samples.
plt.scatter(train_data[0:each_class_num, 0], train_data[0:each_class_num, 1], s=5, c='g')
plt.scatter(train_data[each_class_num:data_num, 0], train_data[each_class_num:data_num, 1], s=5, c='b')
plt.show()
plt.scatter(new_data_map[0:each_class_num, 0], new_data_map[0:each_class_num, 1], s=5, c='g')
plt.scatter(new_data_map[each_class_num:data_num, 0], new_data_map[each_class_num:data_num, 1], s=5, c='b')
plt.show()
决策树 (Decision Tree)
import numpy as np
import scipy as sp
import pydotplus
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- Read data: space-separated features, last token on each line is the label
data = []
labels = []
with open("data.txt") as ifile:
    for line in ifile:
        tokens = line.strip().split(' ')
        data.append([float(tk) for tk in tokens[:-1]])
        labels.append(tokens[-1])
x = np.array(data)
labels = np.array(labels)

# Binarize the fat/thin label: 'fat' -> 1, everything else -> 0.
y = np.zeros(labels.shape)
y[labels == 'fat'] = 1

# 80% train / 20% test split; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Fit an entropy-based decision tree.
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train, y_train)

# Accuracy on the training data (how well the tree fit the training set).
answer = clf.predict(x_train)
print("训练的样本数据:\n ", x_train)
print("训练结果: ", answer)
print("实际结果: ", y_train)
print("准确率: ", np.mean(answer == y_train))

# BUG FIX: the original only predicted on x_train although its comment said
# "test results"; the held-out test set was never evaluated.  Report real
# generalization accuracy on x_test as well.
test_answer = clf.predict(x_test)
print("测试集准确率: ", np.mean(test_answer == y_test))

print("影响比例: 身高-体重\n", clf.feature_importances_)

# Export the tree to a dot file for later rendering with graphviz.
# (The original rebound the file handle `f` to export_graphviz's return
# value, which is None when out_file is given — dropped.)
with open("tree.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)
聚类分析
def KMeans():
    '''Two K-Means demos: 2-D clustering visualization, then image compression.'''
    # -- Demo 1: show the clustering process on a small 2-D data set --------
    print(u'聚类过程展示...\n')
    mat = spio.loadmat("data.mat")
    points = mat['X']
    n_clusters = 3                                        # total number of clusters
    start_centroids = np.array([[3, 3], [6, 2], [8, 5]])  # hand-picked seed centroids
    max_iters = 10
    runKMeans(points, start_centroids, max_iters, True)   # run with plotting enabled

    # -- Demo 2: compress an image by quantizing its colors to 16 centroids -
    print(u'K-Means压缩图片\n')
    img_data = imageio.imread("bird.png")                 # raw pixel data
    img_data = img_data / 255.0                           # map pixel values into [0, 1]
    img_size = img_data.shape
    # One row per pixel, three columns (RGB).
    pixels = img_data.reshape(img_size[0] * img_size[1], 3)
    n_colors = 16
    seed_centroids = kMeansInitCentroids(pixels, n_colors)
    centroids, idx = runKMeans(pixels, seed_centroids, 5, False)
    print(u'\nK-Means运行结束\n')

    print(u'\n压缩图片...\n')
    # Replace every pixel by its nearest final centroid color.
    idx = findClosestCentroids(pixels, centroids)
    recovered = centroids[idx, :].reshape(img_size[0], img_size[1], 3)

    print(u'绘制图片...\n')
    plt.subplot(1, 2, 1)
    plt.imshow(img_data)
    plt.title(u"原先图片", fontproperties=font)
    plt.subplot(1, 2, 2)
    plt.imshow(recovered)
    plt.title(u"压缩图像", fontproperties=font)
    plt.show()
    print(u'运行结束!')
# Find, for each sample, which cluster center it is closest to.
def findClosestCentroids(X, initial_centroids):
    '''Return, for every row of X, the index of its nearest centroid.

    X                 : (m, d) data matrix, one sample per row.
    initial_centroids : (K, d) matrix of centroid positions.
    returns           : (m,) integer array; element i is the cluster index of X[i].
    '''
    # Squared Euclidean distance from every sample to every centroid,
    # computed by broadcasting instead of the original double Python loop.
    diff = X[:, np.newaxis, :] - initial_centroids[np.newaxis, :, :]  # (m, K, d)
    dis = np.sum(diff * diff, axis=2)                                 # (m, K)
    # BUG FIX: the original located minima with
    #   np.where(dis == np.min(dis, axis=1).reshape(-1, 1))
    # and truncated the result to m entries.  When a row is equidistant from
    # several centroids, np.where returns extra matches for that row and the
    # truncation shifts every subsequent sample onto the wrong cluster.
    # argmin picks exactly one (the first) minimum per row, so the result is
    # always aligned and of length m.
    return np.argmin(dis, axis=1)