import os
import cv2 as cv
import numpy as np
import time
from sklearn.cluster import KMeans as Kmn
from sklearn.metrics.cluster import contingency_matrix
from sklearn.decomposition import PCA
from skfuzzy.cluster import cmeans as Cmn
def purity(cluster, labels, label_set, k=3):
    """Return the purity of a clustering as a single float in [0, 1].

    A cluster-by-label count table is built, the largest count of every
    cluster is taken, and the sum of those maxima is divided by the number
    of samples.  (The previous implementation summed the whole table per
    label, which yields the label frequencies rather than the purity.)

    Args:
        cluster: Sequence of cluster indices; each is converted with
            ``int()`` and used as a row index into a table with ``k`` rows.
        labels: Sequence of true labels, same length as ``cluster``.
        label_set: Sequence (with an ``index`` method) listing the labels
            that may occur in ``labels``.
        k: Number of clusters, i.e. number of rows of the count table.

    Returns:
        The purity as a Python ``float``; ``0.0`` when there are no samples.

    Raises:
        ValueError: If ``cluster`` and ``labels`` differ in length, or a
            label is not contained in ``label_set``.
        IndexError: If a cluster index does not fit into ``k`` rows.
    """
    if len(cluster) != len(labels):
        raise ValueError('cluster and labels must have the same length')
    if len(labels) == 0:
        # Nothing was clustered; avoid dividing by zero.
        return 0.0

    counts = np.zeros((k, len(label_set)))
    for cluster_id, label in zip(cluster, labels):
        counts[int(cluster_id), label_set.index(label)] += 1

    # Majority count of every cluster, summed, relative to all samples.
    return float(counts.max(axis=1).sum() / len(labels))
def purity_score(y_true, y_pred):
    """Compute the purity of the clustering ``y_pred`` against ``y_true``.

    The contingency matrix of the two labelings is built, the largest entry
    of every column (``axis=0``) is picked, and the sum of those maxima is
    divided by the sum of all entries.

    Args:
        y_true: Ground-truth labels, passed as the first argument of
            ``contingency_matrix``.
        y_pred: Cluster assignments, passed as the second argument.

    Returns:
        The ratio described above.
    """
    table = contingency_matrix(y_true, y_pred)
    best_per_column = table.max(axis=0)
    return best_per_column.sum() / table.sum()
def labels_to_original(labels, forclusterlist):
    """Reorder ``forclusterlist`` so that items of the same cluster are adjacent.

    Items are grouped by their entry in ``labels``.  Groups are emitted in
    the order ``0, 1, ..., max(labels)`` followed by the group labelled
    ``-1``; inside a group the original order is kept.

    Args:
        labels: Sequence of integer cluster labels, each either ``-1`` or in
            ``0..max(labels)``.
        forclusterlist: Sequence of items, parallel to ``labels``.

    Returns:
        ``numpy.ndarray`` holding the items of ``forclusterlist`` in grouped
        order; an empty array when the input is empty.

    Raises:
        ValueError: If the two sequences differ in length, or a label is
            neither ``-1`` nor in ``0..max(labels)``.
    """
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if len(labels) != len(forclusterlist):
        raise ValueError('labels and forclusterlist must have the same length')
    if len(labels) == 0:
        # max() of an empty sequence would raise; there is nothing to group.
        return np.array([])

    # Dicts keep insertion order, which fixes the output order of the groups:
    # 0..max first, the -1 group last.
    groups = {}
    for cluster_id in range(0, max(labels) + 1):
        groups[cluster_id] = []
    groups[-1] = []

    for cluster_id, item in zip(labels, forclusterlist):
        if cluster_id not in groups:
            raise ValueError('{!r} is not a valid cluster label'.format(cluster_id))
        groups[cluster_id].append(item)

    return np.array([item for members in groups.values() for item in members])
def get_path(filepath):
    """Walk ``filepath`` bottom-up and list every file found beneath it.

    Args:
        filepath: Directory to traverse with ``os.walk(..., topdown=False)``.

    Returns:
        Tuple ``(names, full_paths)`` of two parallel lists: the bare file
        names and the same files joined with their containing directory.
    """
    names = []
    full_paths = []
    for folder, _subdirs, entries in os.walk(filepath, topdown=False):
        names.extend(entries)
        full_paths.extend(os.path.join(folder, entry) for entry in entries)
    return names, full_paths
def read_file(path, filelist, filedirs, n_component=1, Use_smote=False):
    """Load grayscale cell images, label them by file name and reduce with PCA.

    Every file in ``filedirs`` is read as a single-channel image, flattened
    and scaled by 1/255.  Only images with exactly 51*51 pixels are kept.
    The class is looked up from the part of the file name before the first
    ``_``.

    Args:
        path: Not used by this function; kept so existing callers that pass
            the dataset root keep working.
        filelist: File names, parallel to ``filedirs``.
        filedirs: Full paths of the image files.
        n_component: Passed to ``PCA`` as its first argument.
        Use_smote: When true, the PCA output is oversampled with SMOTE
            (requires the ``imbalanced-learn`` package).

    Returns:
        Tuple ``(features, classes)``: the PCA-transformed samples and the
        matching class ids (0, 1 or 2), optionally resampled by SMOTE.

    Raises:
        KeyError: If a 51*51 image has a file-name prefix that is not one of
            the known classes.
        ValueError: If no usable image was found.
    """
    dic = {'DCIS': 0, 'IDC': 0, 'Muc': 0, 'ILC': 0, 'MC': 0,
           'normal': 1,
           'TIL': 2}
    expected_pixels = 51 * 51

    datalist = []
    claslist = []
    # Iterate names and paths in lockstep instead of searching the path list
    # for every file, and without overwriting the ``path`` parameter.
    for name, image_path in zip(filelist, filedirs):
        img = cv.imread(image_path, 0)  # read as a single-channel grayscale image
        if img is None:
            # cv.imread signals an unreadable / non-image file with None.
            continue
        vec = img.flatten() / 255
        if len(vec) != expected_pixels:
            continue
        clas = name.split('_')[0]  # keep the text before '_', e.g. 'DCIS'
        datalist.append(vec)
        claslist.append(dic[clas])

    if not datalist:
        raise ValueError('no readable 51x51 images found in filedirs')

    claslist = np.array(claslist)
    datalist = np.array(datalist)
    datalist_n = PCA(n_component).fit_transform(datalist)

    if Use_smote:
        # SMOTE is not imported at file level; import it only when requested
        # so the default path does not need imbalanced-learn installed.
        from imblearn.over_sampling import SMOTE
        smo = SMOTE(random_state=40)
        # ``fit_sample`` was renamed to ``fit_resample`` in newer
        # imbalanced-learn releases; support both.
        resample = getattr(smo, 'fit_resample', None) or smo.fit_sample
        datalist_smo, claslist_smo = resample(datalist_n, claslist)
        return datalist_smo, claslist_smo
    return datalist_n, claslist
def clustering_Kmn(datalist, claslist):
    """Cluster the samples into three groups with K-means and score the result.

    Cluster ids are arbitrary, so they cannot be compared with class ids
    directly.  The previous implementation compared the true labels with a
    cluster-sorted copy of themselves, which measured the input order rather
    than the clustering.  Here clusters are paired one-to-one with classes
    so that the number of agreeing samples is maximal, and the accuracy is
    the share of samples covered by that pairing.  Purity is computed from
    the true labels and the actual cluster assignments.

    Args:
        datalist: Sample matrix handed to ``KMeans.fit``.
        claslist: True class id of every sample.

    Returns:
        Tuple ``(acc, pur)``: accuracy in percent and purity as returned by
        ``purity_score``.
    """
    from scipy.optimize import linear_sum_assignment

    clustering = Kmn(n_clusters=3, random_state=9).fit(datalist)
    cluster_ids = clustering.labels_

    # Count table of true classes versus clusters; the assignment picks one
    # cell per paired class/cluster with the largest total count.
    table = contingency_matrix(claslist, cluster_ids)
    class_idx, cluster_idx = linear_sum_assignment(table, maximize=True)
    acc = float(table[class_idx, cluster_idx].sum() / table.sum() * 100)

    pur = purity_score(claslist, cluster_ids)
    return acc, pur
def clustering_Cmn(datalist, claslist):
    """Cluster the samples into three groups with fuzzy C-means and score the result.

    Each sample is assigned to the cluster with the highest membership.
    Cluster ids are arbitrary, so they cannot be compared with class ids
    directly.  The previous implementation compared the true labels with a
    cluster-sorted copy of themselves, which measured the input order rather
    than the clustering.  Here clusters are paired one-to-one with classes
    so that the number of agreeing samples is maximal, and the accuracy is
    the share of samples covered by that pairing.  Purity is computed from
    the true labels and the actual cluster assignments.

    Args:
        datalist: Sample matrix; its transpose is handed to ``cmeans``.
        claslist: True class id of every sample.

    Returns:
        Tuple ``(acc, pur)``: accuracy in percent and purity as returned by
        ``purity_score``.
    """
    from scipy.optimize import linear_sum_assignment

    # Only the membership matrix (second return value) is needed.
    # NOTE(review): no seed is passed, so runs are presumably not
    # reproducible, unlike the K-means variant which fixes random_state.
    membership = Cmn(datalist.T, m=2, c=3, error=0.005, maxiter=1000)[1]

    # Hard assignment: index of the largest membership along axis 0,
    # computed once for all samples.
    label = np.argmax(membership, axis=0)

    # Count table of true classes versus clusters; the assignment picks one
    # cell per paired class/cluster with the largest total count.
    table = contingency_matrix(claslist, label)
    class_idx, cluster_idx = linear_sum_assignment(table, maximize=True)
    acc = float(table[class_idx, cluster_idx].sum() / table.sum() * 100)

    pur = purity_score(claslist, label)
    return acc, pur
# Driver: load the images, run both clusterings and report their scores.
time_start = time.time()  # wall-clock start of the whole run

path = 'D:\\STUDYFILE\\RUN\\cells\\train'  # root folder of the training images

# Forwarded through read_file to PCA; only announced when it is not below 1.
n_components = 0.3
if not n_components < 1:
    print('PCA降维至:', n_components)

# Despite its name, ``filepath`` holds the bare file names.
filepath, filedirs = get_path(path)
datalist, claslist = read_file(
    path, filepath, filedirs, n_components, Use_smote=False
)

# Fuzzy C-means first, then K-means, on the same reduced data.
acc1, pur1 = clustering_Cmn(datalist, claslist)
acc2, pur2 = clustering_Kmn(datalist, claslist)

print('Cmeans聚类准确率为:', acc1, '% ,Cmeans聚类纯度为:', pur1, ';')
print('Kmeans聚类准确率为:', acc2, '% ,Kmeans聚类纯度为:', pur2, ';')

time_end = time.time()
print('耗时:', time_end-time_start, 's')
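# Source article: "模式识别(七)聚类算法(模糊聚类Cmeans)识别细胞数据集"
# (Pattern Recognition (7): clustering algorithms (fuzzy C-means) applied to a cell dataset)
# First published 2020-04-27 23:17:57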