Training a word2vec Model

import numpy as np
from collections import defaultdict


class word2vec():

    def __init__(self):
        # Hyperparameters are read from the module-level `settings` dict defined below
        self.n = settings['n']                    # embedding dimension
        self.lr = settings['learning_rate']       # learning rate
        self.epochs = settings['epochs']          # number of training epochs
        self.window = settings['window_size']     # context window size

    def generate_training_data(self, settings, corpus):
        """
        Build the training data: (target one-hot, context one-hots) pairs.
        Note: the settings argument is not used here; the hyperparameters
        were already read in __init__.
        """

        # defaultdict(int): a dict that returns a default value of 0 for missing keys
        word_counts = defaultdict(int)

        # Iterate over the corpus
        for row in corpus:
            for word in row:
                # Count the occurrences of each word
                word_counts[word] += 1

        # Size of the vocabulary
        self.v_count = len(word_counts.keys())
        # List of the words in the vocabulary
        self.words_list = list(word_counts.keys())
        # Dict mapping each vocabulary word to its index
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        # Dict mapping each index to its vocabulary word
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

        training_data = []

        for sentence in corpus:
            sent_len = len(sentence)

            for i, word in enumerate(sentence):

                # One-hot vector of the target word
                w_target = self.word2onehot(sentence[i])

                w_context = []

                # Collect the one-hot vectors of the words inside the window around position i
                for j in range(i - self.window, i + self.window + 1):
                    if j != i and 0 <= j <= sent_len - 1:
                        w_context.append(self.word2onehot(sentence[j]))

                training_data.append([w_target, w_context])

        # dtype=object is required because the context lists vary in length;
        # a plain np.array() call on this ragged structure raises in recent NumPy
        return np.array(training_data, dtype=object)
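    # For example, with window_size=2 the tokenized sentence
    # ['natural', 'language', 'processing', 'and', 'machine', ...] yields pairs such as:
    #   target 'natural'    -> contexts ['language', 'processing']
    #   target 'processing' -> contexts ['natural', 'language', 'and', 'machine']
    # (words near a sentence boundary simply get fewer context words)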

    def word2onehot(self, word):

        # Encode the word as a one-hot vector

        word_vec = [0 for i in range(0, self.v_count)]

        word_index = self.word_index[word]

        word_vec[word_index] = 1

        return word_vec

    def train(self, training_data):

        # Randomly initialize the parameter matrices w1 (v_count x n) and w2 (n x v_count)
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))

        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        for i in range(self.epochs):

            self.loss = 0

            # w_t is the one-hot vector of the target word
            # w_t -> w_target, w_c -> w_context
            for w_t, w_c in training_data:
                # Forward pass
                y_pred, h, u = self.forward(w_t)

                # Error: sum of (prediction - context one-hot) over all context words
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # Backpropagate the error and update the parameters
                self.backprop(EI, h, w_t)

                # Accumulate the total loss
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
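                # The line above implements the skip-gram loss for one (target, contexts) pair:
                #   E = -sum_{c=1..C} u_{j_c*} + C * log(sum_j exp(u_j))
                # where j_c* is the vocabulary index of the c-th context word and C = len(w_c)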

            print('Epoch:', i, "Loss:", self.loss)

    def forward(self, x):
        """
        Forward pass
        """

        # Hidden layer: h = w1^T x, shape (n,)
        h = np.dot(self.w1.T, x)

        # Output scores: u = w2^T h, shape (v_count,)
        u = np.dot(self.w2.T, h)

        y_c = self.softmax(u)

        return y_c, h, u

    def softmax(self, x):
        """
        Numerically stable softmax: subtracting max(x) before exponentiating
        avoids overflow without changing the result
        """
        e_x = np.exp(x - np.max(x))

        return e_x / np.sum(e_x)

    def backprop(self, e, h, x):

        # Gradients of the loss w.r.t. the weights: dL/dw2 = outer(h, e), dL/dw1 = outer(x, w2·e)
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # Gradient-descent update
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def word_vec(self, word):

        """
        获取词向量
        通过获取词的索引直接在权重向量中找
        """

        w_index = self.word_index[word]
        v_w = self.w1[w_index]

        return v_w

    def vec_sim(self, word, top_n):
        """
        找相似的词
        """

        v_w1 = self.word_vec(word)
        word_sim = {}

        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)

            # np.linalg.norm computes the 2-norm by default, i.e. the square root of the sum of squares
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den

            # Use a new name here so the `word` parameter is not shadowed
            other_word = self.index_word[i]
            word_sim[other_word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for sim_word, sim in words_sorted[:top_n]:
            print(sim_word, sim)

    def get_w(self):
        # Return the learned embedding matrix w1
        w1 = self.w1
        return w1


# Hyperparameters
settings = {
    'window_size': 2,  # context window size m
    # Dimension of the word embeddings; this is also the size of the hidden layer
    'n': 10,
    'epochs': 50,  # number of passes over the whole training set; each epoch loops once through all samples
    'learning_rate': 0.01  # learning rate
}

# Data preparation
text = "natural language processing and machine learning is fun and exciting"
# Tokenize the corpus by splitting on whitespace
corpus = [[word.lower() for word in text.split()]]
print(corpus)
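# Note: this corpus has 9 unique words ("and" appears twice), so v_count will be 9
# and the weights will have shapes w1: (9, 10), w2: (10, 9)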

# Initialize a word2vec object
w2v = word2vec()

training_data = w2v.generate_training_data(settings, corpus)

# Train the model
w2v.train(training_data)

# Get the vector of a word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# Find similar words
w2v.vec_sim("machine", 3)