import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.model_selection import train_test_split
def main():
    """Run the full k-means classification pipeline on the pollution dataset.

    Steps: load the CSV, split off the label column, train/test split,
    min-max normalize (test set scaled with the training set's min/max),
    cluster the training data into k centers, then label the centers and
    report test-set accuracy.

    Note: the original version duplicated this entire body, loading the file
    and prompting the user twice; the duplicate pass has been removed.
    """
    # Load the raw data; last column is the class label.
    data = np.loadtxt('/Users/dengdeshun/Desktop/Vacation_Task/环境污染.csv',
                      delimiter=',')
    # Labels are the final column.
    labels = data[:, data.shape[1] - 1]
    # Features are everything before the final column.
    data = data[:, :data.shape[1] - 1]
    # Fraction of samples held out for testing.
    # float() instead of eval(): eval on raw user input can execute
    # arbitrary code; a plain numeric parse is all that is needed here.
    test_size = float(input('测试集所占比重,(0 ~ 1):'))
    # Fixed random_state keeps the split reproducible across runs.
    data_train, data_test, labels_train, labels_test = \
        train_test_split(data, labels, test_size=test_size, random_state=70)
    # Number of cluster centers; int() for the same reason as above.
    k = int(input('中心个数:'))
    # Min-max normalize the training data; keep per-column min/max so the
    # test data can be scaled with the SAME statistics (no leakage).
    data_train, max_col, min_col = data_transform(data_train)
    data_test = (data_test - min_col) / (max_col - min_col)
    # Iterate k-means to convergence on the training data.
    centers, clusters = center(data_train, k)
    # Assign labels to centers via training labels; score on the test set.
    accuracy(centers, clusters, data_test, labels_train, labels_test, k)
def data_transform(data):
    """Min-max normalize *data* column-wise to the [0, 1] range.

    Input:  raw 2-D data array
    Output: tuple of (normalized data, per-column maxima, per-column minima).
            The min/max arrays are returned so a held-out set can be scaled
            with the same statistics.
    """
    col_max = np.max(data, axis=0)
    col_min = np.min(data, axis=0)
    scaled = (data - col_min) / (col_max - col_min)
    return scaled, col_max, col_min
def center(data, k):
    """Find k cluster centers in *data* via the k-means algorithm.

    Input:  normalized data, number of centers k
    Output: tuple of (k centers, per-row cluster assignment).

    Note: the original duplicated its initialization code, and seeded each
    center with a single np.random.uniform() scalar broadcast across the
    whole row — every feature of a center was identical, placing all k
    centers on the unit diagonal. Each center is now a full random vector.
    """
    num_of_cols = data.shape[1]
    center_k = np.ones((k, num_of_cols))
    # Data is min-max scaled to [0, 1], so uniform [0, 1) vectors are
    # valid starting centers anywhere in the feature space.
    for i in range(k):
        center_k[i] = np.random.uniform(size=num_of_cols)
    # Refine the random centers to convergence.
    result, clusters = cal_loop(center_k, data, k)
    return result, clusters
def cal_loop(center_k, data, k):
    """Iterate Lloyd's algorithm from the given initial centers until the
    cluster assignments stop changing.

    Input:  initial k centers, normalized data, number of centers k
    Output: tuple of (converged centers, final per-row cluster assignment).

    Note: the original contained a full unreachable copy of this body after
    the return statement; the dead code has been removed.
    """
    # Sentinel assignment that cannot match a valid argmin result on the
    # first pass (argmin yields ints; this is an array of float ones).
    clusters_before = np.ones(len(data))
    print('start')
    while True:
        # Distance of every sample to every center, shape (n_samples, k).
        data_of_distance = distance(center_k, data, k)
        # Assign each sample to its nearest center.
        clusters_after = np.argmin(data_of_distance, axis=1)
        # Move each center to the mean of its assigned samples.
        for j in range(k):
            data_of_cluster = data[clusters_after == j]
            if len(data_of_cluster) != 0:
                center_k[j] = data_of_cluster.sum(axis=0) / len(data_of_cluster)
            else:
                # Empty cluster: collapse the center to the origin
                # (preserves the original algorithm's behavior).
                center_k[j] = 0
        # Converged when no sample changed cluster this iteration.
        if np.sum(clusters_before == clusters_after) == len(data):
            break
        clusters_before = clusters_after
    return center_k, clusters_after
def distance(center_k, data, k):
    """Compute the Euclidean distance from every data row to every center.

    Input:  k centers, normalized data, number of centers k
    Output: array of shape (len(data), k); entry [i, j] is the distance
            from data row i to center j.
    """
    distances = np.ones((len(data), k))
    for row, sample in enumerate(data):
        diff = sample - center_k
        distances[row] = np.sqrt((diff ** 2).sum(axis=1))
    return distances
def accuracy(centers, clusters, data_test, labels_train, labels_test, k):
    """Classify the test set by nearest cluster center and print accuracy.

    Input:  k cluster centers, training-data cluster assignments, test data,
            training labels, test labels, number of centers k
    Output: none (prints a comparison table and the accuracy percentage).
    """
    # Give each center the majority training label of its cluster.
    center_labels = np.ones(k)
    for idx in range(k):
        center_labels[idx] = mode(labels_train[clusters == idx])[0]
    # Each test sample inherits the label of its nearest center.
    nearest = np.argmin(distance(centers, data_test, k), axis=1)
    pick_labels = np.ones(len(labels_test))
    for pos in range(len(nearest)):
        pick_labels[pos] = center_labels[nearest[pos]]
    # Side-by-side table of true vs. predicted labels.
    compare = pd.DataFrame({'原始分类': labels_test, 'k_means分类': pick_labels},
                           dtype=int)
    print(compare)
    print('分类准确率:', round((labels_test == pick_labels).sum() /
                          len(labels_test) * 100, 2), '%')
# Script entry point: run the k-means pipeline only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()