import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
#nltk.download()
#from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from gensim.models.word2vec import Word2Vec
import pandas as pd
# Materialise the corpus from `t1`, which is defined outside this section
# (presumably a Spark RDD of token lists, given `.collect()` -- TODO confirm).
listd = pd.Series(t1.collect())
l_list = listd.tolist()

# Word-vector training parameters.
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # words seen fewer times than this are ignored
num_workers = 4       # number of parallel training threads
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

# The file name records the main hyper-parameters used for this run.
model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

print('Training model...')
model = Word2Vec(
    l_list,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# No further training is planned, so trade the ability to keep training for
# a much smaller memory footprint.
model.init_sims(replace=True)

# Persist the model under its descriptive name; it can be reloaded later
# with Word2Vec.load().
model_path = os.path.join('/home/user_image/ssd_user_image/qingyu/2019/model', model_name)
model.save(model_path)
# Plot the word associations
def plot_with_labels(low_dim_embs, labels, filename):
    """Scatter-plot 2-D embeddings, annotate each point, and save the figure.

    Relies on the module-level name ``plt`` (``matplotlib.pyplot``), which
    this file imports further down, before the function is called.

    Args:
        low_dim_embs: 2-D array-like exposing ``.shape`` and ``[i, :]``
            indexing; row ``i`` holds the ``(x, y)`` position of
            ``labels[i]``.
        labels: Sequence of text labels, at most one per embedding row.
        filename: Destination handed to ``savefig``.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    fig = plt.figure(figsize=(18, 18))  # in inches
    try:
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(
                label,
                xy=(x, y),
                xytext=(5, 2),
                textcoords='offset points',
                ha='right',
                va='bottom',
            )
        plt.savefig(filename)
    finally:
        # pyplot holds a reference to every figure until it is closed, so
        # release this one even when drawing or saving fails.
        plt.close(fig)
# Project the first `plot_only` word vectors to 2-D with t-SNE and save a
# labelled scatter plot next to the model. The plotting dependencies are
# optional: if they cannot be imported, a hint is printed instead.
try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_only = 500
    # `word_vectors` was previously never defined, which raised a NameError
    # that the ImportError handler below does not catch. Take the embedding
    # matrix from the trained model; its rows are expected to follow the
    # same order as `model.wv.index2word` (see gensim KeyedVectors docs).
    word_vectors = model.wv.vectors
    low_dim_embs = tsne.fit_transform(word_vectors[:plot_only])
    labels = model.wv.index2word[:plot_only]
    plot_with_labels(low_dim_embs, labels, os.path.join("/home/user_image/ssd_user_image/qingyu/2019/model", 'tsne.png'))
except ImportError as ex:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
    print(ex)