Word2Vec word vectors with gensim

Principle

[Figure: neural network language model architecture: input layer, projection layer, hidden layer, output layer]

Model simplifications
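
Word2Vec keeps the projection layer of this architecture but drops the costly nonlinear hidden layer, and replaces the full softmax output with one of two cheaper schemes: hierarchical softmax or negative sampling. CBOW predicts the center word from its context; skip-gram does the reverse. For reference, the standard skip-gram objective from the original Word2Vec papers (added here for context, not specific to this post) maximizes the average log probability

$$\frac{1}{T}\sum_{t=1}^{T}\sum_{\substack{-c\le j\le c \\ j\ne 0}}\log p(w_{t+j}\mid w_t),\qquad p(w_O\mid w_I)=\frac{\exp\left({v'_{w_O}}^{\top}v_{w_I}\right)}{\sum_{w=1}^{W}\exp\left({v'_w}^{\top}v_{w_I}\right)}$$

where $c$ is the window size and $W$ is the vocabulary size.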

Code implementation

1. Create a random dataset

The data format is a list of lists of tokens:
[[word1, word2, …],
 [word1, word2, …],
 …
 [word1, word2, …]]
from random import choice
ls_of_ls = [['cat', 'dog', 'fish'], ['car', 'plane', 'tank']]
ls_of_words = []  # list of token lists (as if each inner list came from jieba.lcut)
for i in range(1500):
    ls = choice(ls_of_ls)
    ls_of_words.append([choice(ls) for _ in range(9, 15)])
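
A quick sanity check on the fabricated corpus (note that range(9, 15) iterates 6 times, so every sentence has exactly 6 tokens):

print(len(ls_of_words))     # 1500 sentences
print(len(ls_of_words[0]))  # 6 tokens per sentence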

2. Build and train the model

from gensim.models import Word2Vec
model = Word2Vec(ls_of_words)
Parameter    | Description                                                                | Default
-------------|----------------------------------------------------------------------------|--------
sentences    | list of lists of tokens                                                    | None
size         | dimensionality of the word vectors                                         | 100
window       | maximum distance between the current and predicted word within a sentence  | 5
min_count    | ignore all words with total frequency lower than this                      | 5
workers      | number of worker threads                                                   | 3
sg           | 0: CBOW; 1: skip-gram                                                      | 0
hs           | 0: negative sampling; 1: hierarchical softmax                              | 0
negative     | number of negative samples                                                 | 5
ns_exponent  | the exponent used to shape the negative sampling distribution              | 0.75
cbow_mean    | 0: use the sum of the context word vectors; 1: use the mean                | 1
alpha        | initial learning rate                                                      | 0.025
min_alpha    | minimum learning rate                                                      | 0.0001
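
These are the gensim 3.x names used throughout this post. gensim 4.0 renamed several of them (a documented API change, not specific to this post): size became vector_size, iter became epochs, and wv.index2word became wv.index_to_key. The equivalent call on 4.x:

from gensim.models import Word2Vec
model = Word2Vec(ls_of_words, vector_size=100, window=5, min_count=5)  # gensim 4.x parameter names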

3. Similarity between words

print(model.similar_by_word('car'))
print(model.similarity('car', 'tank'))

Output:

[('plane', 0.9965242743492126),
 ('tank', 0.996138334274292),
 ('fish', 0.6187092661857605),
 ('cat', 0.6155150532722473),
 ('dog', 0.5961228609085083)]
0.9961384
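
similarity() is simply the cosine of the angle between the two word vectors; a minimal check by hand, reusing the model trained above:

import numpy as np
v1, v2 = model.wv['car'], model.wv['tank']
# cosine similarity: dot product divided by the product of the norms
print(v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2)))  # matches model.similarity('car', 'tank')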

4. Predicting words

print(model.predict_output_word(['car']))
total = sum(i[1] for i in model.predict_output_word(['car']))
print('Sum of probabilities: %.2f' % total)

Output:

[('car', 0.2208322),
 ('tank', 0.20190619),
 ('plane', 0.19147001),
 ('cat', 0.1310298),
 ('dog', 0.1299157),
 ('fish', 0.124846116)]
Sum of probabilities: 1.00
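
predict_output_word runs a CBOW-style forward pass and applies a softmax over the whole (min_count-filtered) vocabulary, which is why the probabilities sum to 1. It only works for models trained with negative sampling (the default), and it also accepts multiple context words plus a topn argument:

# the 3 most probable center words given two context words
print(model.predict_output_word(['car', 'plane'], topn=3))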

5. Word IDs and the word vector matrix

from gensim.models import Word2Vec
# NOTE: these are plain strings, so gensim iterates them character by character;
# the vocabulary ends up being the four characters 'A', 'a', 'B', 'b'
ls_of_words = ['AaAaAa', 'BbBbBb'] * 9999
model = Word2Vec(ls_of_words, size=6)
print('\033[033m%s\033[0m' % ' wv.index2word '.center(60, '-'))
print(model.wv.index2word)
print('\033[033m%s\033[0m' % ' wv.vectors '.center(60, '-'))
vectors = model.wv.vectors
print(vectors)
print(vectors.shape)
print('\033[033m%s\033[0m' % ' similar_by_vector '.center(60, '-'))
for i in range(4):
    print(model.similar_by_vector(vectors[i]))
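
The rows of wv.vectors are ordered exactly as wv.index2word; in gensim 3.x the reverse mapping (word to row ID) lives in wv.vocab:

for word in model.wv.index2word:
    print(word, model.wv.vocab[word].index)  # this word's row index in wv.vectors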

6. Visualizing the word vector matrix

from gensim.models import Word2Vec
from seaborn import heatmap
from matplotlib import pyplot
import numpy as np
np.random.seed(0)
n = 8
sentences = [
    ''.join(chr(i) for i in range(30, 40)) * n,
    ''.join(chr(i) for i in range(40, 50)) * n,
    ''.join(chr(i) for i in range(50, 60)) * n,
    ''.join(chr(i) for i in range(60, 70)) * n,
    ''.join(chr(i) for i in range(70, 80)) * n,
    ''.join(chr(i) for i in range(80, 90)) * n,
    ''.join(chr(i) for i in range(90, 100)) * n,
    ''.join(chr(i) for i in range(100, 110)) * n,
    ''.join(chr(i) for i in range(110, 120)) * n,
    ''.join(chr(i) for i in range(120, 130)) * n,
] * n * 10000
model = Word2Vec(sentences, size=10, window=15, sg=1, hs=1, sorted_vocab=0, cbow_mean=0)
vectors = model.wv.vectors
heatmap(vectors, center=np.max(vectors))
pyplot.show()
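
In the heatmap, each row is one vocabulary item (in wv.index2word order) and each column is one of the 10 vector dimensions. To reuse a trained matrix without retraining, gensim's standard persistence methods work; the file names below are made up for the example:

model.save('char2vec.model')                   # full model, training can be resumed later
model = Word2Vec.load('char2vec.model')
model.wv.save_word2vec_format('char2vec.txt')  # vectors only, in word2vec text format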

Word vector clustering + 3D visualization

# Fabricate data
from random import choice
ls_of_ls = [['芝士', '酸奶', '蛋糕', '巧克力', '做', '吃'],
            ['文本', '数据', '挖掘', '分析', '做', '玩'],
            ['佛山', '广州', '南海', '天河', '吃', '玩']]
ls_of_words = []  # list of token lists (as if produced by jieba.lcut)
for i in range(2500):
    ls = choice(ls_of_ls)
    ls_of_words.append([choice(ls) for _ in range(9, 15)])

# Build and train the model
from gensim.models import Word2Vec
model = Word2Vec(ls_of_words, size=3, window=7)

# Cluster the word vectors (density-based)
from sklearn.cluster import DBSCAN
vectors = [model.wv[word] for word in model.wv.index2word]  # model[word] is deprecated; use model.wv[word]
labels = DBSCAN(eps=0.24, min_samples=3).fit(vectors).labels_
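
# A quick look at the cluster assignments (added sketch; eps and min_samples above
# were hand-tuned for this toy corpus). DBSCAN labels noise points as -1.
for word, label in zip(model.wv.index2word, labels):
    print(label, word)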

# Visualize the word vectors
import matplotlib
from mpl_toolkits import mplot3d
import matplotlib.pyplot as mp
mp.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese characters
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
fig = mp.figure()
ax = mplot3d.Axes3D(fig)  # create 3D axes
colors = ['red', 'blue', 'green', 'black']
for word, vector, label in zip(model.wv.index2word, vectors, labels):
    # DBSCAN noise points are labelled -1, which indexes the last color ('black')
    ax.scatter(vector[0], vector[1], vector[2], c=colors[label], s=500, alpha=0.4)
    ax.text(vector[0], vector[1], vector[2], word, ha='center', va='center')
mp.show()

A poem-writing program

import warnings
warnings.filterwarnings('ignore')  # suppress warnings
from gensim.models import Word2Vec
from random import choice

"""Configuration"""
path = '春节.txt'
window = 12
min_count = 30
size = 110
topn = 13

"""Read the data"""
with open(path, encoding='utf-8') as f:
    ls_of_ls_of_c = [list(line.strip()) for line in f]  # one character list per line

"""Build and train the model"""
model = Word2Vec(ls_of_ls_of_c, size=size, window=window, min_count=min_count)
chr_dict = model.wv.index2word  # characters that survived the min_count filter

"""文本序列生成"""
def poem_generator(title, form):
    filt = lambda lst: [t[0] for t in lst if t[0] not in [',', '。']]
    # 标题补全
    if len(title) < 4:
        if not title:
            title += choice(chr_dict)
        for _ in range(4 - len(title)):
            similar_chr = filt(model.similar_by_word(title[-1], topn // 2))
            char = choice([c for c in similar_chr if c not in title])
            title += char
    # 文本生成
    poem = list(title)
    for i in range(form[0]):
        for _ in range(form[1]):
            predict_chr = model.predict_output_word(poem[-window:], max(topn, len(poem) + 1))
            predict_chr = filt(predict_chr)
            char = choice([c for c in predict_chr if c not in poem[len(title):]])
            poem.append(char)
        poem.append(',' if i % 2 == 0 else '。')
    length = form[0] * (form[1] + 1)
    return '《%s》' % ''.join(poem[:-length]) + '\n' + ''.join(poem[-length:])

"""诗歌生成"""
literary_form = {'五言绝句': (4, 5), '对联': (2, 7)}
while True:
    title = input('输入标题:').strip()
    poem5 = poem_generator(title, literary_form['五言绝句'])
    print('\033[033m', poem5, '\033[0m', sep='')
    poem7 = poem_generator(title, literary_form['对联'])
    print('\033[036m', poem7, '\033[0m', sep='')
    print()
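
How the generator works: each step feeds the last window characters into predict_output_word, filters out punctuation, and randomly picks a candidate character that has not yet appeared in the poem body; a Chinese comma or full stop closes each line. Because candidates are drawn with random.choice, the same title yields a different poem on every run.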


Corpus download: https://blog.csdn.net/Yellow_python/article/details/86726619

Appendix

Glossary:

CBOW: Continuous Bag-of-Words
token: a unit of text; also a coin or voucher
hierarchical: arranged in layers
exponent: (math) an exponent; one who expounds
proportion: ratio
exactly: precisely
toolkit: a set of tools
alpha: the first letter of the Greek alphabet, α
appendix: supplementary material at the end of a document