import re
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import numpy
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn import metrics
import numpy as np
from collections import Counter
from operator import itemgetter
def map_label(true_labels, pred_labels):
    """Map each predicted cluster id to its most frequent true label.

    Used to turn arbitrary cluster ids into label values comparable with
    ``true_labels`` (e.g. before computing accuracy).

    true_labels: sequence of ground-truth labels, aligned with pred_labels
    pred_labels: sequence of predicted cluster ids
    Returns a list the same length as pred_labels with ids replaced by
    their majority true label.
    """
    # Count co-occurrences of (predicted id, true label).
    pair_counts = Counter(zip(pred_labels, true_labels))
    mapping = {}
    # BUG FIX: the original iterated range(n_unique_true_labels) and used
    # max(..., default=0)[0], which raises TypeError ('int' object is not
    # subscriptable) whenever a label index never occurs among the
    # predictions. Iterating the predicted ids that actually occur avoids
    # both the crash and the later KeyError on unmapped predictions.
    for cluster in np.unique(pred_labels):
        candidates = [item for item in pair_counts.items() if item[0][0] == cluster]
        if candidates:  # defensive guard; always true for ids from pred_labels
            best_pair, _ = max(candidates, key=itemgetter(1))
            mapping[cluster] = best_pair[1]
    # Fall back to the raw id for any prediction without a mapping.
    return [mapping.get(p, p) for p in pred_labels]
def cluster_quality(true_labels, pred_labels, show=True):
    """Score a clustering against ground-truth labels.

    true_labels: ground-truth labels, aligned with pred_labels
    pred_labels: predicted cluster ids
    show: when True, also write a human-readable report to
          data/word2vec_result.txt (overwriting it)
    Returns a dict with homogeneity, completeness, v-measure, NMI,
    adjusted Rand score and accuracy (after majority-label mapping).
    """
    h, c, v = metrics.homogeneity_completeness_v_measure(true_labels, pred_labels)
    nmi = metrics.normalized_mutual_info_score(true_labels, pred_labels)
    rand = metrics.adjusted_rand_score(true_labels, pred_labels)
    # Cluster ids are arbitrary; map them to majority true labels first.
    pred_labels_mapped = map_label(true_labels, pred_labels)
    acc = metrics.accuracy_score(true_labels, pred_labels_mapped)
    if show:
        # FIX: the original opened the report file unconditionally (even when
        # show=False, truncating it) and never closed it. Open it only when
        # needed and let the context manager close it.
        report = [
            "Homogeneity: %0.3f" % h,
            "Completeness: %0.3f" % c,
            "V-measure: %0.3f" % v,
            "NMI: %0.3f" % nmi,
            "Rand score: %0.3f" % rand,
            "Accuracy: %0.3f" % acc,
        ]
        with open('data/word2vec_result.txt', 'w', encoding='utf-8') as r:
            r.write('\n'.join(report))
    return dict(
        homogeneity=h,
        completeness=c,
        vmeasure=v,
        nmi=nmi,
        rand=rand,
        accuracy=acc,
    )
def wordsCluster(text, vectorSize, classCount):
    """Train word2vec on *text*, cluster the word vectors with KMeans and
    report clustering quality against data/short_label.txt.

    text: local path of the input text file (one token per line — assumed;
          TODO confirm against the data file)
    vectorSize: word2vec embedding dimension
    classCount: number of KMeans clusters (k)
    """
    # NOTE(review): the original also built an unused, de-duplicated `name`
    # list from the input file; it was dead code here and has been dropped.
    # Ground-truth labels used to score the clustering; strip trailing newlines.
    with open('data/short_label.txt', 'r', encoding='utf-8') as labels:
        true_labels = [label.replace('\n', '') for label in labels.readlines()]
    # Vectorize with word2vec (gensim <4 API: `size` parameter).
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)
    # One vector per vocabulary word, in vocabulary iteration order.
    # (model.wv[key] replaces the deprecated model[key] access.)
    wordvector = [model.wv[key] for key in model.wv.vocab.keys()]
    # BUG FIX: the original read clf.labels_ without ever fitting the model,
    # which raises AttributeError. Fit and predict in one step instead.
    clf = KMeans(n_clusters=classCount)
    pred = clf.fit_predict(wordvector)
    cluster_quality(true_labels, pred)
# Run the full pipeline: 300-dimensional word2vec vectors, k=21 clusters.
wordsCluster('data/short_text.txt', 300, 21)
# If accuracy need not be computed and only the cluster assignments should be printed, use the variant below:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.cluster import KMeans
def wordsCluster(text, vectorSize, classCount):
    """Train word2vec on *text*, cluster the word vectors with KMeans and
    print the members of each cluster.

    text: local path of the input text file (one token per line — assumed;
          TODO confirm against the data file)
    vectorSize: word2vec embedding dimension
    classCount: number of KMeans clusters (k)
    """
    # Collect unique input lines, preserving first-seen order; these are the
    # display names printed per cluster. Close the file deterministically.
    name = []
    with open(text, 'r', encoding='utf-8') as data:
        for line in data.readlines():
            line = line.replace('\n', '')
            if line not in name:
                name.append(line)
    # Ground-truth labels (read for parity with the scoring variant; unused here).
    true_labels = []
    with open('data/short_label.txt', 'r', encoding='utf-8') as labels:
        for label in labels.readlines():
            true_labels.append(label.replace('\n', ''))
    # Vectorize with word2vec (gensim <4 API: `size` parameter).
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)
    # One vector per vocabulary word, in vocabulary iteration order.
    # (model.wv[key] replaces the deprecated model[key] access.)
    wordvector = [model.wv[key] for key in model.wv.vocab.keys()]
    # Cluster the word vectors.
    clf = KMeans(n_clusters=classCount)
    s = clf.fit_predict(wordvector)
    # BUG FIX: the original looped range(0, 21), ignoring the classCount
    # parameter; use classCount so k is honored.
    # NOTE(review): name[j] assumes len(name) == vocabulary size (true when
    # each line is a single unique token and min_count=1) — verify for other inputs.
    for i in range(classCount):
        label_i = [name[j] for j in range(len(s)) if s[j] == i]
        print('label_' + str(i) + ':' + str(label_i))
# Run the print-only pipeline: 300-dimensional word2vec vectors, k=21 clusters.
wordsCluster('data/short_text.txt', 300, 21)