# -*- coding: utf-8 -*-
"""
K-means-Single-Test
"""
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from tools.preprocess import *
from tools.visualizer import plot_result
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from tools.labelText import LabelText
import settings
import time
import pandas as pd
from sklearn import metrics
"""
loading source
载入资源
"""
# Load the corpus from goods_data.csv under settings.SOURCE_DATA using the
# project's loading_source helper (star-imported from tools.preprocess).
# `sentences` is presumably an iterable of document strings, since it is fed
# straight into the TF-IDF vectorizer further down -- confirm against the helper.
print('------Loading Source...')
ori_path = settings.SOURCE_DATA + 'goods_data.csv'
sentences = loading_source(file_name=ori_path)
# NOTE: an earlier variant of this step (previously left here as commented-out
# code) read the raw lines into `content_lines`, ran
# cut_source(content_lines, sentences, write=True) under a timer, and could
# alternatively load 'cut_data.csv' from the same directory. It is not used by
# this script; restore it from version control if the cutting step is needed.
"""
Vectorization
向量化
"""
# Turn the raw documents into a dense TF-IDF matrix and report its shape and
# how long the step took.
print('------Vertorizer...')
start = time.time()
# TfidfVectorizer counts terms AND applies TF-IDF weighting in one step, so
# `freq_words_matrix` already holds TF-IDF weights rather than raw counts.
# sublinear_tf=True replaces tf with 1 + log(tf); max_df=0.46 drops terms that
# appear in more than 46% of the documents.
vertorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
transformer = TfidfTransformer()
# Learn the vocabulary and build the sparse document-term matrix.
freq_words_matrix = vertorizer.fit_transform(sentences)
# Vocabulary terms, in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; prefer its replacement when present and
# fall back only on older releases.
if hasattr(vertorizer, 'get_feature_names_out'):
    words = vertorizer.get_feature_names_out().tolist()
else:
    words = vertorizer.get_feature_names()
# NOTE(review): this re-weights a matrix that is already TF-IDF weighted, and
# `tfidf` is not read again in the visible part of the script. Kept unchanged
# so any later consumer of `tfidf` still sees the same value; consider removing.
tfidf = transformer.fit_transform(freq_words_matrix)
# Dense copy of the sparse matrix; this is what the dimension-reduction step
# consumes. Memory grows as documents x vocabulary size.
weight = freq_words_matrix.toarray()
end = time.time()
print("Shape: Documents(Class) / Words")
print(weight.shape)
print('------ vectorizer cost', end-start)
"""
Dimension Reduction
降维
"""
# Project the dense document-term matrix `weight` onto its leading principal
# components; the reduced matrix is the input to the clustering step.
PCA_COMPONENTS = 10
pca = PCA(n_components=PCA_COMPONENTS)
trainingData = pca.fit_transform(weight)
# Alternative kept for reference (not executed): the same reduction with
# TruncatedSVD(n_components=10, n_iter=10, random_state=42).fit_transform(weight).
"""
Compute K-Means
"""
# Sweep K-Means over a range of cluster counts, record the sum of squared
# errors (inertia) for each, and save the SSE-vs-k curve to sse.png -- the
# usual input for choosing k with the elbow heuristic.
numOfClass: int = 10  # not referenced in the visible part of this script
SSE = []  # inertia (within-cluster sum of squared distances) for each k
# Candidate cluster counts 20, 40, ..., 280; defined once so the loop and the
# plot's x-axis cannot drift apart.
X = range(20, 300, 20)
for i in X:
    start = time.time()
    clf = KMeans(n_clusters=i, max_iter=10000, init="k-means++", tol=1e-6)
    # fit() returns the fitted estimator itself, so `result` aliases `clf`.
    result = clf.fit(trainingData)
    end = time.time()
    print("n_cluster is ",i," time is",end-start)
    SSE.append(clf.inertia_)
# NOTE(review): no random_state is set, so the curve varies slightly from run
# to run; pass one to KMeans if reproducible output is required.
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X, SSE, 'o-')
plt.savefig('sse.png')
# 结果如下 (the results are as follows):