1. Use word2vec to turn every word in a txt corpus into a word vector.
2. Use PCA to reduce the 300-dimensional word vectors to 2 dimensions.
3. Feed the 2-dimensional data into k-means clustering.
text.txt: the training corpus (English works best; a Chinese corpus must be segmented into words first, e.g. with the jieba library, as shown in the sketch below).
word_model.txt: create an empty text file (the trained word vectors are saved here).
data.csv: create an empty csv file (the word vectors are written here in CSV form).
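word2vec expects tokens separated by whitespace, so a Chinese corpus has to be segmented before training. A minimal sketch using jieba (text_cn.txt is a hypothetical name for the raw Chinese file):

import jieba

# Segment the raw Chinese text; jieba.cut returns a generator of words.
with open('text_cn.txt', 'r', encoding='utf-8') as f:   # hypothetical input file
    raw = f.read()
with open('text.txt', 'w', encoding='utf-8') as w:
    w.write(' '.join(jieba.cut(raw)))   # space-separated tokens, ready for LineSentence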
# 1. Replace punctuation in the text with spaces
punctuation = [',', '?', '.', '?', '!', '*', '(', ')', '“', '”', ':', '"', '`', '\'']  # punctuation marks to replace
with open('text.txt', 'r') as f:   # text.txt is the training corpus (an English novel)
    result = f.read()
for p in punctuation:
    result = result.replace(p, ' ')
with open('text.txt', 'w') as w:
    w.write(result)
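The same cleanup fits in one pass with a regular expression (a sketch; [^\w\s] matches any character that is not a letter, digit, underscore, or whitespace, so it is slightly more aggressive than the list above):

import re

with open('text.txt', 'r', encoding='utf-8') as f:
    raw = f.read()
cleaned = re.sub(r'[^\w\s]', ' ', raw)   # replace every punctuation mark with a space
with open('text.txt', 'w', encoding='utf-8') as w:
    w.write(cleaned)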
## 2. Get word vectors with word2vec
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def wordsCluster(text, vectorSize):
    # text: local path of the input corpus; vectorSize: dimensionality of the word vectors
    # gensim >= 4.0 API: vector_size (formerly size) and wv.key_to_index (formerly wv.vocab)
    model = Word2Vec(LineSentence(text), vector_size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)   # save the word vectors to word_model.txt
    return model.wv.key_to_index.keys()   # all words in the model's vocabulary

wordsCluster('text.txt', 300)
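A quick sanity check on the trained vectors: load word_model.txt back with gensim's KeyedVectors and look at a word's nearest neighbours ('night' is only an example; use any word that occurs in your corpus):

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('word_model.txt', binary=False)
print(wv.most_similar('night', topn=5))   # 5 closest words by cosine similarity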
## 3. Convert the vector txt file to a csv file
# The first line of word_model.txt is a header ("<vocab size> <dimensions>"),
# so blank it out, then split each remaining line into a CSV row.
with open('word_model.txt', 'r') as f:
    new = f.readlines()
new[0] = '\n'
with open('word_model.txt', 'w') as f:
    f.writelines(new)

import csv
with open('data.csv', 'w', newline='') as csvfile:   # data.csv stores the word vectors
    writer = csv.writer(csvfile)
    with open('word_model.txt') as data:
        for each_line in data:
            writer.writerow(each_line.split())
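The same conversion also fits in a few lines with pandas, assuming pandas is installed and the gensim header line is still in place (skiprows=1 drops it):

import pandas as pd

# Each row of word_model.txt is: word v1 v2 ... v300, separated by single spaces.
df = pd.read_csv('word_model.txt', sep=' ', skiprows=1, header=None)
df.to_csv('data.csv', index=False, header=False)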
## 4. Reduce the 300-dimensional data to 2 dimensions with PCA
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

l = []       # vector components of each row
words = []   # the word in column 0 of each row
with open('data.csv', 'r') as fd:
    for line in fd:
        line = line.strip()
        if line == "":           # skip the blanked-out header row
            continue
        word = line.split(",")
        words.append(word[0])
        l.append(word[1:])

X = np.array(l, dtype=float)   # data matrix, one 300-dimensional row per word
pca = PCA(n_components=2)      # reduce to 2 dimensions
newX = pca.fit_transform(X)    # fit PCA and store the projected data in newX
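Two components keep only part of the variance of 300-dimensional embeddings, so it is worth checking how much information survives the projection:

# Fraction of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)
print('total variance retained:', pca.explained_variance_ratio_.sum())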
## 5. Build a word-to-vector dictionary, train k-means, and report the clusters
word_vecs = {}   # word -> 2-D vector (avoid shadowing the built-in dict)
for i in range(len(words)):
    word_vecs[words[i]] = newX[i]
for j in range(len(words)):
    print(words[j] + ':', end='')
    print(word_vecs[words[j]])
from sklearn.cluster import KMeans

X = np.array(newX)
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)
print("Coordinates of the five cluster centers:")
print(kmeans.cluster_centers_)
clusters = [[] for _ in range(5)]   # one word list per cluster
for j in range(len(words)):
    clusters[kmeans.labels_[j]].append(words[j])
for cluster in clusters:
    print("Words related to the keyword " + cluster[0] + ": ", end='')
    print(cluster)
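k=5 is an arbitrary choice here; a common way to pick k is to plot the k-means inertia (within-cluster sum of squared distances) over a range of k and look for an elbow (a generic sketch, reusing X from above):

from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

inertias = []
ks = range(2, 11)
for k in ks:
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    inertias.append(km.inertia_)   # total within-cluster squared distance
plt.plot(list(ks), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()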
## Visualize the data as a scatter plot
plt.scatter(newX[:, 0], newX[:, 1], c='blue', s=6)   # newX is an ndarray, so column slicing works
plt.show()
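Coloring each point by its k-means label makes the five clusters visible (a small variation on the plot above, reusing newX and kmeans):

plt.scatter(newX[:, 0], newX[:, 1], c=kmeans.labels_, s=6, cmap='viridis')
plt.show()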
Test results