# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 21:00:19 2018
@author: wenyun.wxw
"""
# Feature extraction
# - TF-IDF
# Term-frequency matrix: element a[i][j] is the frequency of word j in document i
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
vectorizer = CountVectorizer()
# compute the tf-idf weight of each word
transformer = TfidfTransformer()
# corpus must be an iterable of whitespace-tokenized document strings
# corpus = ratecontent[0]  # alternative source
corpus = wordcount
freq_word_matrix = vectorizer.fit_transform(corpus)
# get all words in the bag-of-words vocabulary
w = vectorizer.get_feature_names()  # renamed to get_feature_names_out() in scikit-learn >= 1.0
tfidf = transformer.fit_transform(freq_word_matrix)
# element weight[i][j] is the tf-idf weight of word j in document i
weight = tfidf.toarray()
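# As a quick sanity check, one can print the top-weighted terms of the first
# document (a minimal sketch; top_n is an arbitrary choice, and `w`/`weight`
# come from the steps above)
import numpy as np
top_n = 10
for j in np.argsort(weight[0])[::-1][:top_n]:
    print(w[j], weight[0][j])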
# Doc2Vec (paragraph vectors, a document-level extension of word2vec)
# Train and save the model
import gensim
# Alternative for an in-memory list of tokenized docs:
# sentences = [gensim.models.doc2vec.TaggedDocument(doc.split(), [i]) for i, doc in enumerate(ratecontent)]
# TaggedLineDocument expects a path to a file with one tokenized document per line
sentences = gensim.models.doc2vec.TaggedLineDocument(ratecontent)
model = gensim.models.Doc2Vec(sentences, size=100, window=2, min_count=3)  # `size` is `vector_size` in newer gensim
# the constructor already trains once; train() here runs additional epochs
model.train(sentences, total_examples=model.corpus_count, epochs=1000)
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = gensim.models.Doc2Vec.load(fname)
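# A loaded model can embed unseen text with infer_vector(), which takes a list
# of tokens (sketch; the token list below is a hypothetical placeholder for a
# segmented review)
new_doc_tokens = ["token1", "token2", "token3"]
print(model.infer_vector(new_doc_tokens)[:5])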
# Clustering algorithms
# K-means clustering
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=20)
# KMeans needs a plain array; in gensim 3.x the document vectors live in
# model.docvecs.vectors_docs (model.dv.vectors in gensim >= 4.0)
X = model.docvecs.vectors_docs
s = clf.fit(X)
print(s)
# the 20 cluster centers
print(clf.cluster_centers_)
# the cluster assignment of each sample
print(clf.labels_)
for i, label in enumerate(clf.labels_, start=1):
    print(i, label)
# inertia_ (sum of squared distances to cluster centers) helps judge whether the
# number of clusters is appropriate: smaller means tighter clusters; pick the k
# at the elbow point
print(clf.inertia_)
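# The elbow comment above suggests sweeping k and watching inertia fall; a
# minimal sketch (the k range here is an arbitrary choice):
for k in range(2, 31, 2):
    km = KMeans(n_clusters=k).fit(X)
    print(k, km.inertia_)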
# DBSCAN density-based clustering
from sklearn.cluster import DBSCAN
# Compute DBSCAN
# eps and min_samples are data-dependent and typically need tuning
db = DBSCAN(eps=0.005, min_samples=10).fit(weight)
# indices of core samples
print(db.core_sample_indices_)
# cluster label for each sample (-1 marks noise)
print(db.labels_)
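# A quick tally of cluster sizes helps judge whether eps/min_samples are
# reasonable (sketch using the standard library)
from collections import Counter
print(Counter(db.labels_))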