# -*- coding: utf-8 -*-
"""Train word2vec on the Text8 corpus, then embed each keyword line as the
sum of its word vectors (the "accumulation" method). Sentences work the
same way, since a sentence vector is just the sum over its tokens."""
import gensim.models.word2vec as word2vec
import gensim.models.doc2vec as doc2vec
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize
import gensim
import logging
import numpy as np
from nltk.corpus import stopwords

# Hard-coded input file of keywords, one keyword/phrase per line.
# NOTE(review): path is machine-specific — consider making it a parameter.
KEYWORD_PATH = 'C:\\Users\\Administrator\\Desktop\\过度用\\keyword.txt'

# Embedding dimensionality; must match the `size=` used at training time.
VECTOR_SIZE = 200


def train_save_model():
    """Train a word2vec model on the local 'text8' corpus and save it.

    Side effects: reads ./text8, writes ./text2020.model.
    """
    sentences = word2vec.Text8Corpus('text8')
    model = word2vec.Word2Vec(sentences, size=VECTOR_SIZE)
    model.save('text2020.model')


def load_model():
    """Load the trained model and print one vector per keyword line.

    Each line of KEYWORD_PATH is lower-cased, tokenized, and its (non-stopword,
    in-vocabulary) word vectors are summed into a single 200-d vector.
    """
    model = word2vec.Word2Vec.load('text2020.model')
    # Hoisted out of the loop and made a set for O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    with open(KEYWORD_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().lower()
            if not line:
                continue
            # BUG FIX: the original iterated over the *characters* of each
            # line and tokenized them one char at a time, and never reset
            # the accumulator between lines. Tokenize the whole line and
            # start from a fresh zero vector per keyword.
            vec = np.zeros((VECTOR_SIZE,))
            for token in word_tokenize(line):
                if token in stop_words:
                    continue
                # Skip out-of-vocabulary tokens instead of raising KeyError.
                if token in model.wv:
                    vec = vec + model.wv[token]
            print(vec)


if __name__ == '__main__':
    # train_save_model()  # run once to produce text2020.model
    load_model()
word2vec如何将关键词转换为向量:对关键词的各个词向量逐一相加即可(累加法),句子同样可以用这种方法得到向量表示。
最新推荐文章于 2022-06-06 14:11:15 发布