Overview:
1. doc2vec to train sentence vectors
2. word2vec to train word vectors
3. kmeans/dbscan to cluster sentences
4. tfidf to find key information
5. LDA for topic-based text clustering
Used in combination:
doc2vec + tfidf + kmeans (a sketch of this combination follows the training code below)
Let's start with doc2vec:
1. Tokenize the text with jieba/hanlp
2. Pair each token list with an index id as its tag
## words_list = ['aaa','bbb']
## tags = [0]
document = TaggedDocument(words=words_list, tags=[i])
The training input is then a sequence of pairs such as (['aaa','bbb'], [0]), (['ccc','ddd'], [1]).
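As a minimal sketch of steps 1 and 2 (the two sentences are placeholders, not data from this post):

import jieba
from gensim.models.doc2vec import TaggedDocument

# Step 1: tokenize; jieba.lcut returns the tokens of a sentence as a list.
sentences = ["今天天气很好", "明天可能下雨"]  # placeholder sentences
tokenized = [jieba.lcut(s) for s in sentences]

# Step 2: pair each token list with its index id as the tag.
tagged = [TaggedDocument(words=words, tags=[i]) for i, words in enumerate(tokenized)]
print(tagged[0].tags)  # [0]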
The training code can look like the following:
#coding:utf-8
# Train doc2vec to judge document similarity
import os

import jieba.posseg as pseg
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

vector_size = 300
window_size = 5
min_count = 1
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 10
dm = 0  # 0 = dbow; 1 = dmpv
worker_count = 1  # number of parallel processes
# output model
saved_path = "model.bin"

def a_sub_b(a, b):
    # Return the elements of a that do not appear in b (used to drop stopwords).
    return [el for el in a if el not in b]

stop = [line.strip() for line in open('stopword.txt', encoding='utf-8')]

# Read the files
raw_documents = []
walk = os.walk(os.path.realpath("E:\\E_all\\BaiduNetdiskDownload\\doc2vec-1-master\\data"))
for root, dirs, files in walk:
    for name in files:
        with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
            raw = os.path.join(root, name) + " " + f.read()
        raw_documents.append(raw)

# Build the corpus
corpora_documents = []
doc = []  # token lists before TaggedDocument wrapping, kept for printing/inspection
for i, item_text in enumerate(raw_documents):
    words_list = [j.word for j in pseg.cut(item_text)]
    # words_list = a_sub_b(words_list, stop)  # optional stopword removal
    document = TaggedDocument(words=words_list, tags=[i])
    corpora_documents.append(document)
    doc.append(words_list)

# Train the model (gensim >= 4.0 parameter names; older releases used size= and iter=)
model = Doc2Vec(corpora_documents, vector_size=vector_size, window=window_size,
                min_count=min_count, sample=sampling_threshold, workers=worker_count,
                hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1,
                epochs=train_epoch)
print('#########', model.vector_size)
model.save(saved_path)
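To complete the doc2vec + tfidf + kmeans combination from the overview, the following sketch can be appended after the training code above: it clusters the document vectors with scikit-learn's KMeans, then surfaces each cluster's highest-tfidf terms. It reuses model, corpora_documents, and doc from the script above; n_clusters = 5 and the top-10 cutoff are arbitrary assumptions, not values from the original post.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

n_clusters = 5  # assumption: pick a cluster count that fits your corpus
doc_vectors = np.array([model.dv[i] for i in range(len(corpora_documents))])

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(doc_vectors)

# Join each cluster's token lists into one string so tfidf scores terms per cluster.
cluster_texts = [" ".join(" ".join(words) for words, lab in zip(doc, labels) if lab == k)
                 for k in range(n_clusters)]

# token_pattern keeps single-character tokens, which jieba often produces for Chinese.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf = vectorizer.fit_transform(cluster_texts)
terms = vectorizer.get_feature_names_out()
for k in range(n_clusters):
    scores = tfidf[k].toarray().ravel()
    top = scores.argsort()[::-1][:10]  # ten highest-tfidf terms for this cluster
    print(k, [terms[t] for t in top])

For a new, unseen document, model.infer_vector(token_list) produces a vector in the same space, and kmeans.predict([vector]) assigns it to one of the clusters.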