NLP Word2Vec

最新推荐文章于 2022-12-08 22:44:09 发布

qingyu24

最新推荐文章于 2022-12-08 22:44:09 发布

阅读量214

点赞数

分类专栏： NLP 文章标签： NLP

本文链接：https://blog.csdn.net/weixin_40244593/article/details/91858934

版权

NLP 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
#nltk.download()
#from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from gensim.models.word2vec import Word2Vec 
import pandas as pd

# Pull the whole corpus into local memory and hand it to gensim as a plain list.
# NOTE(review): `t1` is not defined in the visible part of this file; presumably
# it is a distributed collection whose elements are tokenised sentences —
# confirm against the code that builds it.
listd = pd.Series(t1.collect())
l_list = listd.tolist()

# Hyper-parameters for word-vector training.
num_features = 300    # dimensionality of each word vector
min_word_count = 40   # minimum word count passed as `min_count`
num_workers = 4       # number of parallel training threads
context = 10          # size of the context window
downsampling = 1e-3   # down-sampling setting for frequent words

# The file name records the settings that produced the model.
model_name = '{}features_{}minwords_{}context.model'.format(
    num_features, min_word_count, context)

print('Training model...')
word2vec_params = {
    'workers': num_workers,
    'size': num_features,
    'min_count': min_word_count,
    'window': context,
    'sample': downsampling,
}
model = Word2Vec(l_list, **word2vec_params)

# No further training is planned, so let init_sims make the model much more
# memory-efficient.
model.init_sims(replace=True)

# Persist the model under a descriptive name; it can be reloaded later with
# Word2Vec.load().
model_dir = '/home/user_image/ssd_user_image/qingyu/2019/model'
model.save(os.path.join(model_dir, model_name))

用 t-SNE 将词向量降到二维后画图，展示词与词之间的关联

def plot_with_labels(low_dim_embs, labels, filename):
    """Scatter-plot 2-D points, annotate each with its label, and save the figure.

    Relies on the module-level name ``plt`` (``matplotlib.pyplot``) being
    bound by the time the function is called.

    Args:
        low_dim_embs: Array exposing ``.shape`` and ``[i, :]`` indexing; row
            ``i`` must unpack into an ``(x, y)`` pair.
        labels: Sequence of annotation texts; ``labels[i]`` is drawn next to
            row ``i`` of ``low_dim_embs``.
        filename: Destination passed straight to ``plt.savefig``.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    assert len(labels) <= low_dim_embs.shape[0], 'More labels than embeddings'

    plt.figure(figsize=(18, 18))  # in inches

    # Every annotation uses the same placement relative to its point.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )

    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)

    plt.savefig(filename)


# Project the first `plot_only` word vectors to 2-D with t-SNE and save a
# labelled scatter plot.  The plotting stack is optional: a missing package is
# reported instead of aborting the script.
try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(
        perplexity=30,
        n_components=2,
        init='pca',
        n_iter=5000,
        method='exact',
    )
    plot_only = 500
    # NOTE(review): `word_vectors` is not defined in the visible part of this
    # file; its rows must line up with `model.wv.index2word` for the labels
    # below to match the points — confirm where it is built.
    low_dim_embs = tsne.fit_transform(word_vectors[:plot_only])
    labels = model.wv.index2word[:plot_only]
    tsne_png = os.path.join(
        "/home/user_image/ssd_user_image/qingyu/2019/model", 'tsne.png')
    plot_with_labels(low_dim_embs, labels, tsne_png)

except ImportError as err:
    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
    print(err)