# Preprocess first: word segmentation, etc.
import jieba
from gensim.models import Word2Vec
# Load stop words from 'password.txt', one entry per line (UTF-8).
# Use a context manager so the file handle is closed deterministically
# (the original left the file object open).
with open('password.txt', 'r', encoding='utf-8') as f:
    stopword = [line.strip() for line in f]
def seg_sentence(sentence):
    """Segment *sentence* with jieba and return the tokens as a list.

    Whitespace tokens (' ' and '\\t') are filtered out.

    Fixes vs. the original implementation:
    - malformed docstring quoting (four leading quotes) repaired;
    - the tokens are collected directly into a list instead of being
      joined into a space-separated string and re-split (which was
      quadratic and produced a spurious trailing empty-string token).

    :param sentence: raw sentence text (leading/trailing whitespace is stripped)
    :return: list of non-whitespace tokens
    """
    # jieba.cut yields tokens lazily; strip the input first.
    tokens = jieba.cut(sentence.strip())
    # Minimal stop list (space and tab, matching the original checks).
    # A fuller stop-word file can be merged in here if needed.
    stopwords = {' ', '\t'}
    return [word for word in tokens if word not in stopwords]
# Generate word vectors
def vec_produce(sentence, word, size):
    """Train a small Word2Vec model on *sentence* and return the vector for *word*.

    :param sentence: raw sentence text; segmented via ``seg_sentence`` first
    :param word: token whose embedding vector is returned
    :param size: dimensionality of the embedding vectors
    :return: the embedding vector for ``word``
    :raises KeyError: if ``word`` is not among the segmented tokens
    """
    # Segment the sentence into tokens; the model is trained on this
    # single-sentence corpus.
    tokens = seg_sentence(sentence)
    w2v = Word2Vec(
        sentences=[tokens],
        vector_size=size,
        window=5,
        min_count=1,  # keep every token, even one-off occurrences
        workers=4,
    )
    # Look the token up in the trained keyed vectors.
    return w2v.wv[word]