# 首先通过正则表达式去除无关字符;然后进行 jieba 分词并去除停用词,构建词向量;最后提取小说的所有人名并画图展示出来。
import jieba
import re
from gensim.models import Word2Vec
# Read the novel as a list of lines. The context manager closes the file even
# if reading fails.
with open(r'笑傲江湖.txt', encoding='utf-8') as file:
    text = file.readlines()

# Skip the first line (it holds the novel's author information), then strip
# ideographic spaces (U+3000), ASCII spaces and newlines from every line.
text1 = text[1:]
text2 = [re.sub('\u3000| |\n', '', i) for i in text1]

# Load the stop-word file line by line and strip spaces/newlines from each
# entry so the entries compare equal to jieba tokens.
with open(r'停用词.txt', 'r', encoding='utf-8') as f:
    stop_words = f.readlines()
stop_words = [re.sub(' |\n', '', i) for i in stop_words]
# Set copy used only for O(1) membership tests; `stop_words` stays a list.
stop_word_set = set(stop_words)

# Tokenize every cleaned line with jieba, then drop the stop words.
text_cut = [jieba.lcut(i) for i in text2]
text_ = [[i for i in word if i not in stop_word_set] for word in text_cut]

# Train the word vectors: 200 dimensions, ignore tokens seen fewer than
# 5 times, context window of 2, 100 training passes.
# NOTE: `size` and `iter` are the gensim 3.x keyword names; gensim 4.x renamed
# them to `vector_size` and `epochs` -- adjust if running on gensim >= 4.
my_wv = Word2Vec(text_, size=200, min_count=5, window=2, iter=100)

# Print the learned vector for one token. Access goes through `.wv`, which
# works on both gensim 3.x and 4.x (direct `model[word]` indexing was
# deprecated and later removed). Raises KeyError if the token is not in the
# trained vocabulary.
name = '令狐冲'
print(my_wv.wv[name])