手写 skip-gram(PyTorch 实现)

import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from collections import Counter
import random

# Upper bound on the dictionary size, counting the single '<unk>' entry.
MAX_VOCAB_SIZE = 30000

# Location of the training corpus (whitespace-separated plain text).
S = 'F:\\data'
path = S + "\\train.txt"

# Read the whole corpus into memory. The encoding is spelled out so the
# result does not depend on the platform's default locale encoding.
with open(path, "r", encoding="utf-8") as fin:
    text = fin.read()

def word_tokenize(text):
    """Split `text` into words on whitespace, e.g. "I love you" -> ["I", "love", "you"]."""
    tokens = text.split()
    return tokens

# `vocab` is the full token sequence of the corpus (one entry per word
# occurrence), not the set of distinct words.
vocab = word_tokenize(text)

# Dictionary word -> occurrence count, restricted to the MAX_VOCAB_SIZE - 1
# most frequent words; the remaining slot is reserved for '<unk>'.
vocabulary = dict(Counter(vocab).most_common(MAX_VOCAB_SIZE - 1))
# '<unk>' stands for every word that was not kept, so its count is the total
# number of tokens minus the tokens covered by the kept words. (The total is
# len(vocab), the token list, not len(vocabulary), the dictionary.)
vocabulary['<unk>'] = len(vocab) - sum(vocabulary.values())

# Lookup tables between words and their integer ids (dictionary order).
word2idx = {w: idx for idx, w in enumerate(vocabulary)}
idx2word = {idx: w for idx, w in enumerate(vocabulary)}
# Dimension of the one-hot encoding of a word.
vocabulary_size = len(vocabulary)

# Number of context words considered on each side of the centre word.
window_size = 2
# Text position of the current centre word.
center_word_pos = 0

# `points` holds one entry per processed centre word. Each entry is a list of
# samples [label, centre_word_idx, other_word_idx], where label is 1 for a
# positive (in-window context) pair and -1 for a negative (random) pair.
points = []

# Dictionary index used for any word that is missing from word2idx.
unk_idx = word2idx['<unk>']
# Number of negative samples drawn per centre word (10 to 20 is typical).
negative_point_number = 20

for center_word_pos, word in enumerate(vocab):
    # Only text positions 0..1000 are used as centre words.
    if center_word_pos > 1000:
        break
    # Dictionary index of the centre word ('<unk>' if it is not in the dictionary).
    indices = word2idx.get(word, unk_idx)
    # Collects all positive and negative samples of this centre word.
    point = []

    # Text positions [window_start, window_stop) form the context window,
    # clipped to the bounds of the text.
    window_start = max(center_word_pos - window_size, 0)
    window_stop = min(center_word_pos + window_size + 1, len(vocab))

    # Positive sampling: every in-window word except the centre word itself.
    for context_word_pos in range(window_start, window_stop):
        if context_word_pos == center_word_pos:
            continue
        context_word_idx = word2idx.get(vocab[context_word_pos], unk_idx)
        point.append([1, indices, context_word_idx])

    # Negative sampling: words drawn from random text positions outside the
    # window. Skipped when the window covers the whole text, because the
    # rejection loop below could then never terminate.
    if window_stop - window_start < len(vocab):
        for _ in range(negative_point_number):
            # randrange excludes len(vocab), so the position is always valid.
            negative_point = random.randrange(len(vocab))
            # Redraw while the position falls inside the centre word's window.
            # The window is measured in text positions, so it is compared
            # against center_word_pos rather than the dictionary index.
            while window_start <= negative_point < window_stop:
                negative_point = random.randrange(len(vocab))
            context_word_idx = word2idx.get(vocab[negative_point], unk_idx)
            point.append([-1, indices, context_word_idx])

    points.append(point)

# Convert a single word index into a one-hot vector.
def get_input_layer_one(word_idx, vocab_size=None):
    """Return the one-hot encoding of one word index.

    :param word_idx: position that is set to 1.0
    :param vocab_size: length of the returned vector; defaults to the
        module-level ``vocabulary_size`` when omitted
    :return: float tensor of shape (vocab_size,), all zeros except for a 1.0
        at ``word_idx``
    """
    if vocab_size is None:
        vocab_size = vocabulary_size
    x = torch.zeros(vocab_size).float()
    x[word_idx] = 1.0
    return x

# Convert a group of word indices into a matrix of one-hot columns.
def get_input_layer(word_idx, vocab_size=None):
    """Return the one-hot encodings of several word indices, one per column.

    :param word_idx: sequence of word indices
    :param vocab_size: number of rows of the result; defaults to the
        module-level ``vocabulary_size`` when omitted
    :return: float tensor of shape (vocab_size, len(word_idx)) in which
        column ``i`` is the one-hot vector of ``word_idx[i]``
    """
    if vocab_size is None:
        vocab_size = vocabulary_size
    x = torch.zeros(vocab_size, len(word_idx)).float()
    for col, row in enumerate(word_idx):
        x[row][col] = 1.0
    return x


# Split the data sets into consecutive batches.
def _to_array(data):
    """Convert one data set to an ndarray, tolerating ragged nested lists."""
    try:
        return np.array(data)
    except ValueError:
        # Recent NumPy versions refuse to build an array from sequences of
        # unequal length; fall back to a 1-D object array that keeps every
        # top-level item as-is.
        out = np.empty(len(data), dtype=object)
        for i, item in enumerate(data):
            out[i] = item
        return out


def batch_generator(all_data , batch_size, shuffle=False):
    """
    Cut every data set in ``all_data`` into slices of ``batch_size`` items.

    :param all_data : list of data sets; the batch boundaries are derived
        from the length of the first one
    :param batch_size: number of items per batch (the last batch may be
        shorter); must be positive
    :param shuffle: whether to apply one random permutation (the same for
        every data set) before slicing
    :return: flat list of array slices; for each batch position in turn it
        contains one slice per data set. No empty batch is emitted, so empty
        data yields an empty list.
    :raises ValueError: if ``batch_size`` is not positive
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    all_data = [_to_array(d) for d in all_data]
    data_size = all_data[0].shape[0]
    print("data_size: ", data_size)
    if shuffle:
        p = np.random.permutation(data_size)
        all_data = [d[p] for d in all_data]

    arr = []
    # range() stops before data_size, so an exact multiple of batch_size does
    # not produce a trailing empty batch.
    for start in range(0, data_size, batch_size):
        end = min(start + batch_size, data_size)
        arr.extend(d[start:end] for d in all_data)
    return arr

# Batch size: every batch holds up to 10 centre words, each with its
# positive and negative samples.
batch_size = 10

# Split `points` into batches of `batch_size` centre words.
training_data = batch_generator([points],  batch_size)

# Number of units in the hidden layer, i.e. the embedding dimension.
embedding_size = 5
# W1 has shape (embedding_size, vocabulary_size) and W2 has shape
# (vocabulary_size, embedding_size); both are trained by gradient descent.
W1 = torch.randn(embedding_size, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_size, dtype=torch.float32, requires_grad=True)
# Number of epochs.
num_epochs = 1000
# Learning rate.
learning_rate = 0.001


def _pairs_nll(samples, label):
    """Mean negative log-likelihood of the pairs in `samples` carrying `label`.

    Each sample is [label, centre_word_idx, other_word_idx]. The centre words
    are one-hot encoded, passed through W1 and W2, and the log-softmax over
    the vocabulary is scored against the paired word indices.
    """
    centers = [sample[1] for sample in samples if sample[0] == label]
    targets = [sample[2] for sample in samples if sample[0] == label]
    x = get_input_layer(centers)                      # (vocabulary_size, n)
    y = torch.from_numpy(np.asarray(targets, dtype=np.int64))  # (n,)
    hidden = torch.matmul(W1, x)                      # (embedding_size, n)
    scores = torch.matmul(W2, hidden)                 # (vocabulary_size, n)
    log_probs = F.log_softmax(scores, dim=0)
    # nll_loss expects one row per sample, hence the transpose.
    return F.nll_loss(log_probs.T, y)


# Cost: the sum of the loss values of all centre words in one batch. It is
# kept as a plain float so the autograd graphs of finished steps are released.
cost_value = 0.0
for epo in range(num_epochs):

    for batch in training_data:
        cost_value = 0.0

        for key_word in batch:
            loss_positive = _pairs_nll(key_word, 1)
            loss_negative = _pairs_nll(key_word, -1)

            # NOTE(review): subtracting the negative-sample NLL rewards making
            # the negatives arbitrarily unlikely, so this objective has no
            # lower bound and training may diverge; kept as written - confirm
            # that this is the intended loss.
            loss = loss_positive - loss_negative
            cost_value += loss.item()

            loss.backward()

            # Plain SGD step; no_grad keeps the update out of the autograd graph.
            with torch.no_grad():
                W1 -= learning_rate * W1.grad
                W2 -= learning_rate * W2.grad

            W1.grad.zero_()
            W2.grad.zero_()

    if epo % 10 == 0:
        # Reports the cost of the last batch of the epoch averaged over its
        # centre words; guarded so an empty (or missing) last batch cannot
        # cause a division by zero.
        last_batch_len = len(training_data[-1]) if len(training_data) > 0 else 0
        print(f'Loss at epo {epo}: {cost_value/max(last_batch_len, 1)}')


# Demo: pick a test word and print it together with its dictionary index.
# (The printed heading is Chinese for "find similar words".)
print('找出相似单词')
test = 'the'
print(test)
print(word2idx[test])
def vord2vec(test):
    """Run the word `test` through both weight matrices and return the result.

    The word is looked up in `word2idx`, one-hot encoded, multiplied by `W1`
    and then by `W2`; the product (one value per dictionary entry) is returned.
    """
    one_hot = Variable(get_input_layer_one(word2idx[test])).float()
    hidden = W1.matmul(one_hot)   # projection into the embedding space
    scores = W2.matmul(hidden)    # back to one score per dictionary word
    return scores
# Print the vord2vec output for the test word.
print(vord2vec(test))





在这里插入图片描述

train.txt 部分数据(示例):
anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic institutions anarchists advocate social relations based upon voluntary association of autonomous individuals mutual aid and self governance while anarchism is most easily defined by what it is against anarchists also offer positive visions of what they believe to be a truly free society however ideas about how an anarchist society might work vary considerably especially with respect to economics there is also disagreement about how a free society might be brought about origins and predecessors kropotkin and others argue that before recorded history human society was organized on anarchist principles most anthropologists follow kropotkin and engels in believing that hunter gatherer bands were egalitarian and lacked division of labour accumulated wealth or decreed law and had equal access to resources william godwin anarchists including the the anarchy organisation and rothbard find anarchist attitudes in taoism from ancient china kropotkin found similar ideas in stoic zeno of citium according to 
kropotkin zeno repudiated the omnipotence of the state its intervention and regimentation and proclaimed the sovereignty of the moral law of the individual the anabaptists of one six th century europe are sometimes considered to be religious forerunners of modern anarchism bertrand russell in his history of western philosophy writes that the anabaptists repudiated all law since they held that the good man will be guided at every moment by the holy spirit from this premise they arrive at communism the diggers or true levellers were an early communistic movement during the time of the english civil war and are considered by some as forerunners of modern anarchism in the modern era the first to use the term to mean something other than chaos was louis armand baron de lahontan in his nouveaux voyages dans l am rique septentrionale one seven zero three where he described the indigenous american society which had no state laws prisons priests or private property as being in anarchy russell means a libertarian and leader in the american indian movement has repeatedly stated that he is an anarchist and so are all his ancestors in one seven nine three in the thick of the french revolution william godwin published an enquiry concerning political justice although godwin did not use the word anarchism many later anarchists have regarded this book as the first major anarchist text and godwin as the founder of philosophical anarchism but at this point no anarchist movement yet existed and the term anarchiste was known mainly as an insult hurled by the bourgeois girondins at more radical elements in the french revolution the first self labelled anarchist pierre joseph proudhon it is commonly held that it wasn t until pierre joseph proudhon published what is property in one eight four zero that the term anarchist was adopted as a self description it is for this reason that some claim proudhon as the founder of modern anarchist theory in what is property proudhon answers with the 
famous accusation property is theft in this work he opposed the institution of decreed property propri t where owners have complete rights to use and abuse their property as they wish such as exploiting workers for profit in its place proudhon supported what he called possession individuals can have limited rights to use resources capital and goods in accordance with principles of equality and justice proudhon s vision of anarchy which he called mutualism mutuellisme involved an exchange economy where individuals and groups could trade the products of their labor using labor notes which represented the amount of working time involved in production this would ensure that no one would profit from the labor of others workers could freely join together in co operative workshops an interest free bank would be set up to provide everyone with access to the means of production proudhon s ideas were influential within french working class movements and his followers were active in the revolution of one eight four eight in france proudhon s philosophy of property is complex it was developed in a number of works over his lifetime and there are differing interpretations of some of his ideas for more detailed discussion see here max stirner s egoism in his the ego and its own stirner argued that most commonly accepted social institutions including the notion of state property as a right natural rights in general and the very notion of society were mere illusions or ghosts in the mind saying of society that the individuals are its reality he advocated egoism and a form of amoralism in which individuals would unite in associations of egoists only when it was in their self interest to do so for him property simply comes about through might whoever knows how to take to defend the thing to him belongs property and what i have in my power that is my own so long as i assert myself as holder i am the proprietor of the thing stirner never called himself an anarchist he accepted only the 
label egoist nevertheless his ideas were influential on many individualistically inclined anarchists although interpretations of his thought are diverse american

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值