1. Straight to the code
# -*- coding: utf-8 -*-
import jieba
import matplotlib
matplotlib.use('TkAgg')  # select the backend before pyplot is imported
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA, KernelPCA
from sklearn.utils import shuffle


class DbscanClustering:
    def __init__(self, stopwords_path=None):
        self.stopwords = self.load_stopwords(stopwords_path)
        self.vectorizer = CountVectorizer()
        self.transformer = TfidfTransformer()
    def load_stopwords(self, stopwords=None):
        """
        Load the stopword list.
        :param stopwords: path to the stopword file, one word per line
        :return: list of stopwords (empty if no path is given)
        """
        if stopwords:
            with open(stopwords, 'r', encoding='utf-8') as f:
                return [line.strip() for line in f]
        else:
            return []
    def preprocess_data(self, corpus_path):
        """
        Preprocess the corpus: one document per line, tokenized with jieba
        and filtered against the stopword list.
        :param corpus_path: path to the corpus file
        :return: list of space-joined token strings
        """
        corpus = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                corpus.append(' '.join([word for word in jieba.lcut(line.strip()) if word not in self.stopwords]))
        # sklearn.utils.shuffle returns a shuffled copy rather than shuffling
        # in place, so the result has to be reassigned.
        corpus = shuffle(corpus)
        return corpus
    def get_text_tfidf_matrix(self, corpus):
        """
        Build the TF-IDF matrix for the corpus.
        :param corpus: list of tokenized documents
        :return: dense TF-IDF weight matrix
        """
        tfidf = self.transformer.fit_transform(self.vectorizer.fit_transform(corpus))
        # All terms in the vocabulary:
        # words = self.vectorizer.get_feature_names_out()
        # TF-IDF weights as a dense array
        weights = tfidf.toarray()
        return weights
    def pca(self, weights, n_components=2):
        """
        Reduce the dimensionality of the TF-IDF matrix with PCA.
        :param weights: TF-IDF weight matrix
        :param n_components: number of components to keep
        :return: reduced matrix
        """
        pca = PCA(n_components=n_components)
        # pca = KernelPCA(kernel="rbf", n_components=n_components)
        return pca.fit_transform(weights)
    def dbscan(self, corpus_path, eps=0.1, min_samples=3, fig=True):
        """
        DBSCAN: density-based text clustering.
        :param corpus_path: corpus path, one document per line
        :param eps: DBSCAN neighborhood radius
        :param min_samples: minimum number of samples within radius eps
        :param fig: whether to plot the samples after dimensionality reduction
        :return: dict mapping each cluster label to the indices of its documents
        """
        corpus = self.preprocess_data(corpus_path)
        weights = self.get_text_tfidf_matrix(corpus)
        pca_weights = self.pca(weights, 2)
        clf = DBSCAN(eps=eps, min_samples=min_samples)
        y = clf.fit_predict(pca_weights)
        if fig:
            plt.scatter(pca_weights[:, 0], pca_weights[:, 1], c=y)
            plt.show()
            '''
            plt.ion()
            plt.pause(10)
            plt.close()
            '''
        # Note: DBSCAN has no cluster_centers_ or inertia_ attributes;
        # those belong to KMeans and the lines below are kept only as a reminder.
        # centers = clf.cluster_centers_
        # score = clf.inertia_
        # Group document indices by the cluster label they were assigned to
        # (label -1 means the sample was treated as noise).
        result = {}
        for text_idx, label_idx in enumerate(y):
            if label_idx not in result:
                result[label_idx] = [text_idx]
            else:
                result[label_idx].append(text_idx)
        return result


if __name__ == '__main__':
    dbscan = DbscanClustering(stopwords_path='../data/stop_words.txt')
    result = dbscan.dbscan('../data/test_data.txt', eps=0.05, min_samples=10, fig=True)
    print(result)
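
The eps and min_samples values above (0.05 and 10) are specific to the author's test data; on another corpus they usually need retuning. A common heuristic is a k-distance plot: sort every sample's distance to its k-th nearest neighbor and look for the "elbow" as a starting eps. The sketch below is not part of the original script; it assumes you already have the PCA-reduced matrix pca_weights produced by DbscanClustering, and the helper name k_distance_plot is hypothetical.

# Minimal sketch (assumption: pca_weights comes from DbscanClustering.pca above).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

def k_distance_plot(points, k=10):
    """Plot the sorted distance to the k-th nearest point for each sample.
    The elbow of the curve is a reasonable starting value for eps."""
    nbrs = NearestNeighbors(n_neighbors=k).fit(points)
    distances, _ = nbrs.kneighbors(points)  # shape (n_samples, k); column 0 is the point itself
    k_dist = np.sort(distances[:, -1])      # distance to the k-th nearest point, sorted ascending
    plt.plot(k_dist)
    plt.xlabel('samples (sorted)')
    plt.ylabel('%d-NN distance' % k)
    plt.show()

# Example: k_distance_plot(pca_weights, k=10), then pass the elbow value as eps.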