KMeans 聚类非常好用，强烈推荐

最新推荐文章于 2021-10-28 23:49:15 发布

辽宁大学

最新推荐文章于 2021-10-28 23:49:15 发布

阅读量823

点赞数

分类专栏： nlp 文章标签： nlp

本文链接：https://blog.csdn.net/zhuiyunzhugang/article/details/103916817

版权

nlp 专栏收录该内容

84 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-

import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans

class KmeansClustering:
    """Cluster a one-document-per-line text corpus with TF-IDF and KMeans.

    Each line of the corpus is segmented with ``jieba``, stripped of stop
    words, vectorized with ``CountVectorizer`` + ``TfidfTransformer`` and
    finally clustered with ``sklearn.cluster.KMeans``.
    """

    def __init__(self, stopwords_path=None):
        """Load the stop-word list and create the (unfitted) vectorizers.

        :param stopwords_path: path to a UTF-8 file with one stop word per
            line; ``None`` (or any falsy value) means "no stop words".
        """
        self.stopwords = self.load_stopwords(stopwords_path)
        # NOTE(review): CountVectorizer is used with its default settings;
        # the default token pattern presumably ignores single-character
        # tokens, which may matter for Chinese text -- confirm.
        self.vectorizer = CountVectorizer()
        self.transformer = TfidfTransformer()

    def load_stopwords(self, stopwords=None):
        """Read stop words from a file.

        :param stopwords: path to a UTF-8 text file, one stop word per line.
        :return: list of stop words with surrounding whitespace removed and
            blank lines skipped; an empty list when no path is given.
        """
        if not stopwords:
            return []
        with open(stopwords, 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            # Blank lines would otherwise end up as '' entries.
            return [word for word in stripped if word]

    def preprocess_data(self, corpus_path):
        """Segment every line of the corpus and drop stop words.

        :param corpus_path: path to a UTF-8 file, one document per line.
        :return: list with one string per input line; each string is the
            remaining tokens joined by single spaces. Blank lines yield an
            empty string so that list indices keep matching line numbers.
        """
        # Build the set once: O(1) membership per token instead of scanning
        # the stop-word list for every token.
        stopword_set = set(self.stopwords)
        corpus = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = jieba.lcut(line.strip())
                corpus.append(
                    ' '.join(tok for tok in tokens if tok not in stopword_set)
                )
        return corpus

    def get_text_tfidf_matrix(self, corpus):
        """Fit the vectorizers on ``corpus`` and return its TF-IDF weights.

        :param corpus: iterable of space-separated token strings.
        :return: dense array of TF-IDF weights, one row per document
            (the sparse result is materialized with ``toarray()``).
        """
        counts = self.vectorizer.fit_transform(corpus)
        tfidf = self.transformer.fit_transform(counts)
        return tfidf.toarray()

    def kmeans(self, corpus_path, n_clusters=5, random_state=None):
        """Cluster the documents of a corpus file with KMeans.

        :param corpus_path: corpus path (one document per line); document
            ids are the zero-based line indices.
        :param n_clusters: number of clusters to form.
        :param random_state: optional seed forwarded to ``KMeans`` to make
            the clustering reproducible; ``None`` keeps the previous
            unseeded behaviour.
        :return: ``{cluster_id: [text_id, ...]}`` with plain ``int`` keys.
        """
        corpus = self.preprocess_data(corpus_path)
        weights = self.get_text_tfidf_matrix(corpus)

        clf = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = clf.fit_predict(weights)
        return self._group_by_label(labels)

    @staticmethod
    def _group_by_label(labels):
        """Group sample indices by the cluster label assigned to each one."""
        result = {}
        for text_idx, label in enumerate(labels):
            # int() turns numpy integer scalars into ordinary Python ints so
            # the mapping prints and serializes cleanly.
            result.setdefault(int(label), []).append(text_idx)
        return result

if __name__ == '__main__':
    # Demo run: cluster the keyword corpus into 50 groups and print the
    # resulting {cluster_id: [text_id, ...]} mapping.
    clusterer = KmeansClustering(stopwords_path='../data/stop_words.txt')
    print(clusterer.kmeans('../data/keyword.txt', n_clusters=50))

辽宁大学

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
kmeans聚类非常好用，强烈推荐强烈推荐强烈推荐

# -*- coding: utf-8 -*- import jieba from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.cluster import KMeans clas...
复制链接

扫一扫

专栏目录