# @Time : 2021/3/27 14:45
# @Author : chao
#代码参考自:https://blog.csdn.net/weixin_45314989/article/details/104390725?utm_medium=distribute.pc_relevant.none-task-blog-baidujs_title-0&spm=1001.2101.3001.4242
#采用word2vec对分词后的文件进行训练,将每个词语映射到词向量空间
import logging
import multiprocessing
import os
import sys
from collections import Counter
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
#word2vec获取词向量
from pandas import np
from sklearn.metrics import silhouette_score
#word2vec
def wordsCluster():
    """Train a skip-gram Word2Vec model on the pre-segmented corpus and save
    both the binary model and the plain-text word vectors.

    Side effects: writes the model file and the text-format vector file.
    """
    program = os.path.basename(sys.argv[0])  # name of the running script
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # inp: input corpus; out_model: saved model; out_vector: text-format vectors
    inp = r"C:\Users\代码\去除停用词并分词\去除停用词并分词结果\zong_fengci_tingyongci2.txt"
    out_model = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.model'
    # BUG FIX: the path said "C:\User\..." (missing 's'), so the vector file
    # was written where changeTxtToCsv() would never find it (it reads from
    # C:\Users\...).
    out_vector = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.txt'
    # Train a skip-gram model (sg=1): 100-dim vectors, window 3, 10 epochs,
    # dropping words seen fewer than 5 times.
    # NOTE(review): size=/iter= are gensim<4 parameter names; gensim 4.x
    # renamed them to vector_size=/epochs= — confirm the installed version.
    model = Word2Vec(LineSentence(inp), size=100, window=3, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10, sg=1)
    # Persist the full model (re-trainable) ...
    model.save(out_model)
    # ... and the vectors alone in the text word2vec format; its first line
    # is a "vocab_count vector_dim" header.
    model.wv.save_word2vec_format(out_vector, binary=False)
    print("word2vec成功!!")
#将词向量的txt文本转换为csv文本
def changeTxtToCsv(out_vector=r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.txt',
                   csv_path=r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.csv'):
    """Convert the word2vec text-format vector file into a CSV file.

    The first line of the vector file (the "count dim" header) is blanked
    in place, then each remaining line is split on whitespace and written
    as one CSV row: word, component_1, ..., component_n.

    :param out_vector: path of the word2vec text-format vector file
                       (default: the file written by wordsCluster())
    :param csv_path: destination CSV file
    """
    import csv
    # Blank the header line in place so downstream readers can skip it.
    # (Original leaked the handle and crashed with IndexError on an empty
    # file; `with` + a guard fixes both.)
    with open(out_vector, "r", encoding='utf-8') as f:
        lines = f.readlines()
    if lines:
        lines[0] = '\n'
    with open(out_vector, "w", encoding='utf-8') as f:
        f.writelines(lines)
    # Re-read, skip the (now empty) header, emit one CSV row per word.
    with open(csv_path, 'w', newline='') as csvfile, \
            open(out_vector, encoding='utf-8') as data:
        writer = csv.writer(csvfile)
        next(data)  # skip the blanked header line
        for each_line in data:
            writer.writerow(each_line.split())
    print("转换为csv文件成功!")
#用pca将100维的数据降维至2维
def jiangwei(csv_path=r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.csv',
             mingci_path=r'C:\Users\代码\词性标注\名词提取\名词提取数据\高频名词(过滤后).txt',
             mingci_encoding='ANSI'):
    """Reduce the word vectors to 2-D with PCA and collect the 2-D points
    of the high-frequency nouns.

    :param csv_path: CSV written by changeTxtToCsv() (word, components...)
    :param mingci_path: text file with one high-frequency noun per line
    :param mingci_encoding: encoding of the noun file ('ANSI' is the
        Windows locale codec; pass e.g. 'utf-8' on other platforms)
    :return: [list of 2-D numpy points for the nouns, list of the nouns]
    :raises KeyError: if a noun is missing from the vector vocabulary
    """
    import numpy as np
    from sklearn.decomposition import PCA
    vectors = []
    words = []
    with open(csv_path, 'r') as fd:
        # BUG FIX: the original `if line == "": continue` inside the
        # readline() loop never advanced the file position, so a blank line
        # anywhere in the file spun forever; iterate and skip blanks safely.
        for line in fd:
            line = line.strip()
            if not line:
                continue
            parts = line.split(",")
            words.append(parts[0])
            vectors.append(parts[1:])
    X = np.array(vectors)  # one 100-dim row per word (string dtype; PCA converts)
    pca = PCA(n_components=2)  # reduce to 2 dimensions
    # fit_transform already fits; the original's extra pca.fit(X) was redundant.
    newX = pca.fit_transform(X)
    # Map each word to its 2-D coordinates (renamed: `dict` shadowed the builtin).
    word2point = {word: newX[i] for i, word in enumerate(words)}
    # Pull out the coordinates of the high-frequency nouns, in file order.
    mingci_list = []
    with open(mingci_path, 'r', encoding=mingci_encoding) as mf:
        for raw in mf:
            mingci_list.append(raw.strip('\n'))
    mingci_jiangwei_list = [word2point[w] for w in mingci_list]
    print("降维成功!!")
    return [mingci_jiangwei_list, mingci_list]
#构建词向量字典并用k-means训练,得出分类情况
def k_means(mingci_jiangwei_list, mingci_list, num=4):
    """Cluster the 2-D noun vectors with K-Means, print the members of each
    cluster, and show a scatter plot of all points.

    :param mingci_jiangwei_list: 2-D points (one per noun), from jiangwei()
    :param mingci_list: the nouns, aligned with mingci_jiangwei_list
    :param num: number of clusters (previously hard-coded to 4)
    """
    from sklearn.cluster import KMeans
    import numpy as np
    from matplotlib import pyplot as plt
    X = np.array(mingci_jiangwei_list)
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)
    print(str(num) + "个中心词的坐标:")
    print(kmeans.cluster_centers_)
    # Group nouns by assigned label (replaces the original seven parallel
    # lists and the if/elif chain, which also silently dropped labels >= 7).
    groups = [[] for _ in range(num)]
    for label, word in zip(kmeans.labels_, mingci_list):
        groups[label].append(word)
    for group in groups:
        # Guard: the original indexed listN[0] unconditionally and raised
        # IndexError whenever a cluster came out empty.
        if not group:
            continue
        print("与关键词" + group[0] + "相关的词有:", end='')
        print(group)
    # Scatter-plot all 2-D points.
    f1 = [point[0] for point in mingci_jiangwei_list]
    f2 = [point[1] for point in mingci_jiangwei_list]
    plt.scatter(f1, f2, c='blue', s=6)
    plt.show()
#肘部法则
def sse(mingci_jiangwei_list):
    """Elbow method: plot the mean nearest-centroid distance (distortion)
    for k = 1..9 to help choose the K-Means cluster count.

    :param mingci_jiangwei_list: 2-D points from jiangwei()
    """
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np
    import matplotlib.pyplot as plt
    X = np.array(mingci_jiangwei_list)
    # BUG FIX: the original took the first two *points* as x1/x2, so the
    # preview scatter showed only two (mismatched) markers; plot the actual
    # coordinate columns instead.
    x1 = X[:, 0]
    x2 = X[:, 1]
    plt.plot()
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    plt.title('Dataset')
    plt.scatter(x1, x2)
    plt.show()
    # create new plot and data
    plt.plot()
    # Mean distance to the nearest centroid for each candidate k.
    distortions = []
    K = range(1, 10)
    for k in K:
        # BUG FIX: the original fitted every model twice (KMeans(...).fit(X)
        # followed by another kmeanModel.fit(X)); fit once.
        kmeanModel = KMeans(n_clusters=k).fit(X)
        distortions.append(
            sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    # Plot the elbow curve; the bend marks the best k.
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()
#轮廓系数
def cs(mingci_jiangwei_list):
    """Silhouette analysis: run K-Means for several cluster counts, plot the
    labelled points for each, then chart the silhouette score per k.

    :param mingci_jiangwei_list: 2-D points from jiangwei()
    """
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    import numpy as np
    X = np.array(mingci_jiangwei_list)
    # BUG FIX: the original set x1/x2 to the first two *points*, so x1[i]
    # raised IndexError for any dataset with more than two points; use the
    # coordinate columns instead.
    x1 = X[:, 0]
    x2 = X[:, 1]
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    plt.title('Instances')
    plt.scatter(x1, x2)
    # One color/marker pair per cluster label (supports up to 8 clusters).
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
    markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']
    clusters = [2, 3, 4, 5, 8]
    subplot_counter = 1
    sc_scores = []
    for t in clusters:
        subplot_counter += 1
        plt.subplot(3, 2, subplot_counter)
        kmeans_model = KMeans(n_clusters=t).fit(X)
        for i, l in enumerate(kmeans_model.labels_):
            plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
        plt.xlim([0, 10])
        plt.ylim([0, 10])
        # BUG FIX: the keyword is metric=, not markers= — the original call
        # raised TypeError inside silhouette_score.
        sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
        sc_scores.append(sc_score)
        plt.title('K=%s,silhouette coefficient=%0.03f' % (t, sc_score))
    # Summary chart: silhouette score vs cluster count (larger is better).
    plt.figure()
    plt.plot(clusters, sc_scores, '*-')
    plt.xlabel('Numbers of clusters')
    plt.ylabel('Silhouette Coefficient score')
    plt.show()
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets, metrics
def km_sse_cs(mingci_jiangwei_list):
    """Evaluate K-Means cluster counts (k = 2..14) with two criteria.

    1. Within-cluster SSE (elbow method): plot SSE against k; the point
       where the decrease suddenly levels off (the elbow) is the best k.
       The fitted model exposes the SSE via its ``inertia_`` attribute.
    2. Silhouette coefficient: combines cohesion and separation; ranges
       over [-1, 1], larger is better, and negative values suggest
       mis-assigned points.

    :param mingci_jiangwei_list: 2-D points from jiangwei()
    """
    # BUG FIX: the module relied on `from pandas import np` (L13), an alias
    # removed in pandas >= 1.0; import numpy directly here instead.
    import numpy as np
    data = np.array(mingci_jiangwei_list)
    sse_list = []     # within-cluster SSE for each candidate cluster count
    silhouettes = []  # silhouette score for each candidate cluster count
    for i in range(2, 15):
        model = KMeans(n_clusters=i)
        model.fit(data)
        sse_list.append(model.inertia_)  # inertia_ = within-cluster SSE
        silhouette = metrics.silhouette_score(data, model.labels_, metric='euclidean')
        silhouettes.append(silhouette)
    # Elbow (SSE) curve
    plt.subplot(211)
    plt.title('KMeans 簇内误方差')
    plt.plot(range(2, 15), sse_list, marker='*')
    plt.xlabel('簇数量')
    plt.ylabel('簇内误方差(SSE)')
    # Silhouette-score curve
    plt.subplot(212)
    plt.title('KMeans 轮廓系数')
    plt.plot(range(2, 15), silhouettes, marker='o')
    plt.xlabel('簇数量')
    plt.ylabel('轮廓系数')
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    import matplotlib as mpl
    import numpy as np
    import pandas as pd
    # Pipeline: train word vectors, convert them to CSV, then PCA-reduce
    # and pick out the high-frequency nouns.
    wordsCluster()
    changeTxtToCsv()
    mingci_jiangwei_list, mingci_list = jiangwei()
# (3) Text mining with Word2vec — trailing metadata from the source blog page
# (original article last published 2023-01-21 15:25:15)