PyTorch训练词向量

最新推荐文章于 2023-04-08 17:07:15 发布

别管江湖

最新推荐文章于 2023-04-08 17:07:15 发布

阅读量1.1k

点赞数

分类专栏：人工智能

本文链接：https://blog.csdn.net/qq_33961152/article/details/105620668

版权

人工智能专栏收录该内容

7 篇文章 0 订阅

订阅专栏

需要的库

import torch
import torch.nn as nn  #神经网络工具箱torch.nn 
import torch.nn.functional as F  #神经网络函数torch.nn.functional
import torch.utils.data as tud  #Pytorch读取训练集需要用到torch.utils.data类

from torch.nn.parameter import Parameter  #参数更新和优化函数

from collections import Counter #Counter 计数器
import numpy as np 
import random
import math 

import pandas as pd
import scipy #SciPy是基于NumPy开发的高级模块，它提供了许多数学算法和函数的实现
import sklearn
from sklearn.metrics.pairwise import cosine_similarity #余弦相似度函数

# True when PyTorch reports that a CUDA GPU is available.
USE_CUDA = torch.cuda.is_available()

# Pin every random number generator to the same fixed seed so that
# experiment results can be reproduced from run to run.
_SEED = 53113
random.seed(_SEED)
np.random.seed(_SEED)
torch.manual_seed(_SEED)
if USE_CUDA:
    torch.cuda.manual_seed(_SEED)

# Hyperparameters.
K = 100  # number of negative samples drawn at random
C = 3  # nearby-words threshold: the three surrounding words are predicted
NUM_EPOCHS = 2  # number of training epochs
MAX_VOCAB_SIZE = 30000  # vocabulary size
BATCH_SIZE = 128  # batch size
LEARNING_RATE = 0.2  # initial learning rate
EMBEDDING_SIZE = 100  # dimensionality of the word vectors

LOG_FILE = "word-embedding.log"

# Tokenizer: turns a piece of text into a list of individual words.
def word_tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting words.

    Leading and trailing whitespace is ignored, so an empty or
    whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

数据预处理

- 从文本文件中读取所有的文字，并通过这些文本创建一个 vocabulary。
- 由于单词数量可能太大，我们只选取最常见的 MAX_VOCAB_SIZE-1 个单词。
- 我们再添加一个 `<unk>` 单词表示所有不常见的单词，因此词汇表总大小为 MAX_VOCAB_SIZE。
- 我们需要记录单词到 index 的 mapping、index 到单词的 mapping、单词的 count、单词的 (normalized) frequency，以及单词总数。

# Read the whole training corpus into memory. The encoding is stated
# explicitly so the result does not depend on the platform's default locale.
with open("text8.train.txt", "r", encoding="utf-8") as fin:
    text = fin.read()

# Lower-case the corpus and tokenize it into a list of words.
text = list(word_tokenize(text.lower()))

# Keep the (MAX_VOCAB_SIZE - 1) most frequent words as a word -> count dict;
# the remaining slot is reserved for the "<unk>" token added below.
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))

# "<unk>" stands in for every word that was not kept. Its count is the total
# number of tokens minus the tokens covered by the kept words.
# NOTE(review): assumes the corpus does not itself contain the literal token
# "<unk>"; in that case it is appended last, so its *index* (not its count)
# is len(vocab) - 1, i.e. 29999 when the vocabulary is full.
vocab["<unk>"] = len(text) - sum(vocab.values())

# index -> word. Dict insertion order follows most_common(), so index 0 is
# the most frequent word.
idx_to_word = list(vocab)

# word -> index, the inverse of idx_to_word.
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}

# Raw occurrence count of every vocabulary word, in index order.
word_counts = np.array(list(vocab.values()), dtype=np.float32)

# Normalized frequency of every vocabulary word.
word_freqs = word_counts / np.sum(word_counts)

# Raise the frequencies to the 3/4 power, as done in the word2vec paper.
word_freqs = word_freqs ** (3. / 4.)

# Renormalize so the values sum to 1; this is the distribution used for
# negative sampling.
word_freqs = word_freqs / np.sum(word_freqs)

# Number of words in the vocabulary; equals MAX_VOCAB_SIZE when the corpus
# has at least MAX_VOCAB_SIZE - 1 distinct words.
VOCAB_SIZE = len(idx_to_word)
# Bare expression kept from the notebook: it displays the value in a notebook
# cell and has no effect when run as a script.
VOCAB_SIZE