基于word2vec和k-means的词聚类

import re
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import numpy
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn import metrics
import numpy as np
from collections import Counter
from operator import itemgetter

def map_label(true_labels, pred_labels):
    """Map each predicted cluster id to the majority true label of its members.

    Parameters:
        true_labels: sequence of ground-truth labels.
        pred_labels: sequence of predicted cluster ids, same length.

    Returns:
        list: pred_labels rewritten in terms of true labels, so that
        accuracy_score(true_labels, result) is meaningful.
    """
    pair_counts = Counter(zip(pred_labels, true_labels))
    mapping = dict()

    # Iterate over the cluster ids that actually occur rather than
    # range(len(np.unique(true_labels))): the original crashed with a
    # TypeError on empty clusters (max(..., default=0)[0] -> 0[0]) and
    # with a KeyError when a predicted id fell outside that range.
    for cluster_id in set(pred_labels):
        candidates = [item for item in pair_counts.items() if item[0][0] == cluster_id]
        # Most frequent (cluster_id, true_label) pair wins the vote.
        majority_pair = max(candidates, key=itemgetter(1))[0]
        mapping[majority_pair[0]] = majority_pair[1]

    return [mapping[p] for p in pred_labels]

def cluster_quality(true_labels, pred_labels, show=True):
    """Compute clustering quality metrics against ground-truth labels.

    Parameters:
        true_labels: sequence of ground-truth labels.
        pred_labels: sequence of predicted cluster ids, same length.
        show: when True, also write the metrics to data/word2vec_result.txt.

    Returns:
        dict with keys homogeneity, completeness, vmeasure, nmi, rand, accuracy.
    """
    h, c, v = metrics.homogeneity_completeness_v_measure(true_labels, pred_labels)
    nmi = metrics.normalized_mutual_info_score(true_labels, pred_labels)
    rand = metrics.adjusted_rand_score(true_labels, pred_labels)
    # Accuracy is only meaningful after mapping cluster ids to true labels.
    pred_labels_mapped = map_label(true_labels, pred_labels)
    acc = metrics.accuracy_score(true_labels, pred_labels_mapped)
    if show:
        # 'with' guarantees the handle is closed (the original leaked it),
        # and the file is only created when output is actually requested.
        with open('data/word2vec_result.txt', 'w', encoding='utf-8') as r:
            r.write("Homogeneity: %0.3f" % h)
            r.write('\n')
            r.write("Completeness: %0.3f" % c)
            r.write('\n')
            r.write("V-measure: %0.3f" % v)
            r.write('\n')
            r.write("NMI: %0.3f" % nmi)
            r.write('\n')
            r.write("Rand score: %0.3f" % rand)
            r.write('\n')
            r.write("Accuracy: %0.3f" % acc)
    return dict(
        homogeneity=h,
        completeness=c,
        vmeasure=v,
        nmi=nmi,
        rand=rand,
        accuracy=acc,
    )

def wordsCluster(text, vectorSize, classCount):
    '''
    Train word2vec on *text*, cluster the word vectors with k-means and
    report clustering quality against data/short_label.txt.

    text: local path of the input text (whitespace-tokenised lines)
    vectorSize: dimensionality of the word vectors
    classCount: k, the number of clusters
    '''
    # Ground-truth labels, one per line; 'with' closes the handle
    # (the original leaked it). The original also built a de-duplicated
    # 'name' list from the input file that was never used - removed.
    true_labels = []
    with open('data/short_label.txt', 'r', encoding='utf-8') as labels:
        for label in labels:
            true_labels.append(label.replace('\n', ''))

    # word2vec vectorization (gensim 3.x API: 'size' keyword).
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)
    # All vocabulary words known to the model.
    keys = model.wv.vocab.keys()

    # One vector per word; model.wv[key] avoids the deprecated model[key].
    wordvector = [model.wv[key] for key in keys]

    # Cluster. The original read clf.labels_ without ever fitting the
    # estimator, which raises AttributeError; fit_predict trains the model
    # and returns the cluster assignment in one call.
    clf = KMeans(n_clusters=classCount)
    pred = clf.fit_predict(wordvector)
    # NOTE(review): this assumes len(true_labels) equals the vocabulary size
    # and that vocabulary order matches the label file - verify with the data.
    cluster_quality(true_labels, pred)


# Entry point: train on data/short_text.txt with 300-dim vectors, k=21.
# NOTE(review): runs at import time; consider guarding with
# `if __name__ == '__main__':`.
wordsCluster('data/short_text.txt', 300, 21)

若不计算准确率,只输出聚类结果,如下所示

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from sklearn.cluster import KMeans

def wordsCluster(text, vectorSize, classCount):
    '''
    Train word2vec on *text* and print the words grouped by k-means cluster
    (no accuracy computation, so no label file is needed).

    text: local path of the input text (whitespace-tokenised lines)
    vectorSize: dimensionality of the word vectors
    classCount: k, the number of clusters
    '''
    # The original also read data/short_label.txt into true_labels and built
    # a de-duplicated 'name' list; the labels were never used, and indexing
    # predictions into 'name' was wrong because its order/length need not
    # match the model vocabulary. Both removed.

    # word2vec vectorization (gensim 3.x API: 'size' keyword).
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)

    # Keep the vocabulary in a list so prediction j refers to words[j].
    words = list(model.wv.vocab.keys())
    # One vector per word; model.wv[w] avoids the deprecated model[w].
    wordvector = [model.wv[w] for w in words]

    # Cluster and group the words by cluster id in a single pass.
    clf = KMeans(n_clusters=classCount)
    s = clf.fit_predict(wordvector)
    groups = [[] for _ in range(classCount)]
    for word, label in zip(words, s):
        groups[label].append(word)

    # Use classCount instead of the original hard-coded range(0, 21) so the
    # function works for any k.
    for i in range(classCount):
        print('label_' + str(i) + ':' + str(groups[i]))

# Entry point: train on data/short_text.txt with 300-dim vectors, k=21.
# NOTE(review): runs at import time; consider guarding with
# `if __name__ == '__main__':`.
wordsCluster('data/short_text.txt', 300, 21)

 

评论 16
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值