Xiao Hei's fastNLP Learning Diary 2: vocab and embedding

1. Building a vocab

(1) Building a vocab directly

from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst(['小','黑','无','敌'])
vocab.add_word('小黑')
print('小:',vocab.to_index('小'))
print('小黑:',vocab.to_index('小黑'))
print('词典全貌:',list(vocab))
print('----------------------------')
# when <pad> and <unk> are not needed
vocab = Vocabulary(unknown = None,padding = None)
vocab.add_word_lst(['xiao','hei'])
print('xiao:',vocab.to_index('xiao'))
print('hei:',vocab.to_index('hei'))
# print(vocab.to_index('无敌')) would raise an error here because unknown=None

小: 2
小黑: 6
词典全貌: [('<pad>', 0), ('<unk>', 1), ('小', 2), ('黑', 3), ('无', 4), ('敌', 5), ('小黑', 6)]
----------------------------
xiao: 0
hei: 1
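
A quick sanity check (a minimal sketch): with the default <pad>/<unk> entries enabled, an index can be mapped back to its word with to_word, and looking up an out-of-vocabulary word falls back to the unk index. The OOV word below is only illustrative.

from fastNLP import Vocabulary
vocab = Vocabulary()    # default <pad>/<unk> enabled
vocab.add_word_lst(['小','黑','无','敌'])
print(vocab.to_word(2))            # index 2 maps back to '小'
print(vocab.to_index('从未见过'))    # OOV word -> unk index
print(vocab.unknown_idx)           # 1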

(2) Building a vocab from a DataSet

from fastNLP import Vocabulary
from fastNLP import DataSet
dataset = DataSet({'chars': [
                                ['今', '天', '天', '气', '很', '好', '。'],
                                ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']
                            ],
                    'target': ['neutral', 'negative']
})
vocab = Vocabulary()
vocab.from_dataset(dataset,field_name = 'chars')
vocab.index_dataset(dataset,field_name = 'chars')
target_vocab = Vocabulary(padding = None,unknown = None)
target_vocab.from_dataset(dataset,field_name = 'target')
target_vocab.index_dataset(dataset,field_name = 'target')
dataset

+------------------------------------+--------+
| chars                              | target |
+------------------------------------+--------+
| [4, 2, 2, 5, 6, 7, 3]              | 0      |
| [8, 9, 10, 11, 12, 13, 14, 15, ... | 1      |
+------------------------------------+--------+
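
The round trip can be verified by mapping the ids back with to_word (a minimal sketch continuing from the block above; it assumes the DataSet can be indexed by row as shown):

print([vocab.to_word(i) for i in dataset[0]['chars']])     # expected: ['今', '天', '天', '气', '很', '好', '。']
print(target_vocab.to_word(dataset[0]['target']))          # expected: 'neutral'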

(3) A small training trick (no_create_entry_dataset)

Words that appear only in the dev/test data can be passed in through no_create_entry_dataset: they are still added to the vocabulary, but are marked so that, when a pre-trained embedding is later built on this vocabulary, a marked word only keeps its own entry if it exists in the pre-trained table; otherwise it falls back to the unk vector. The StaticEmbedding block further below demonstrates this.

from fastNLP import Vocabulary
from fastNLP import DataSet
tr_data = DataSet({'chars': [
                                ['今', '天', '心', '情', '很', '好', '。'],
                                ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']
                            ],
                    'target': ['positive', 'negative']
})
dev_data = DataSet({'chars': [
                                ['住', '宿', '条', '件', '还', '不', '错'],
                                ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。']
                            ],
                    'target': ['positive', 'negative']
})
vocab = Vocabulary()
vocab.from_dataset(tr_data,field_name = 'chars',no_create_entry_dataset = [dev_data])
list(vocab)

[('<pad>', 0),
('<unk>', 1),
('。', 2),
('天', 3),
('今', 4),
('心', 5),
('情', 6),
('很', 7),
('好', 8),
('被', 9),
('这', 10),
('部', 11),
('电', 12),
('影', 13),
('浪', 14),
('费', 15),
('了', 16),
('两', 17),
('个', 18),
('小', 19),
('时', 20),
('住', 21),
('宿', 22),
('条', 23),
('件', 24),
('还', 25),
('不', 26),
('错', 27),
('糟', 28),
('糕', 29),
('的', 30),
('气', 31),
(',', 32),
('无', 33),
('法', 34),
('出', 35),
('行', 36)]

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word('train')
vocab.add_word('only_in_train')    # appears only in train, and is certainly absent from the pre-trained table
vocab.add_word('test',no_create_entry = True)    # this word appears only in dev or test
vocab.add_word('only_in_test',no_create_entry = True)    # this word is not in the pre-trained table either
embed = StaticEmbedding(vocab,model_dir_or_name = 'en-glove-6b-50d')
print(embed(torch.LongTensor([vocab.to_index('train')])))
print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))    # randomly initialized
print(embed(torch.LongTensor([vocab.to_index('test')]))) 
print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))    # not found, falls back to unk
print(embed(torch.LongTensor([vocab.unknown_idx])))

Found 2 out of 6 words in the pre-training embedding.
tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, -0.7208, -0.2931, -0.7468, 0.6512,
0.4730, -0.7401, 0.1877, -0.3828, -0.5590, 0.4295, -0.2698, -0.4238,
-0.3124, 1.3423, -0.7857, -0.6302, 0.9182, 0.2113, -0.5744, 1.4549,
0.7546, -1.6165, -0.0085, 0.0029, 0.5130, -0.4745, 2.5306, 0.8594,
-0.3067, 0.0578, 0.6623, 0.2080, 0.6424, -0.5246, -0.0534, 1.1404,
-0.1370, -0.1836, 0.4546, -0.5096, -0.0255, -0.0286, 0.1805, -0.4483,
0.4053, -0.3682]], grad_fn=)
tensor([[-0.0679, 0.1663, -0.0844, 0.0084, 0.1099, -0.1997, -0.0852, -0.0419,
0.1569, 0.2359, -0.0010, 0.1434, 0.2325, -0.1618, 0.1793, -0.0205,
0.0202, 0.1161, -0.1493, 0.1386, 0.0505, -0.2206, -0.0717, -0.0972,
0.0858, -0.0635, -0.2301, 0.1901, 0.1649, 0.1589, -0.2337, 0.2305,
0.1733, -0.0487, 0.2094, -0.2108, -0.2290, -0.1251, 0.0837, 0.1625,
-0.0345, -0.0014, -0.2263, 0.1861, -0.0591, 0.0972, 0.1509, -0.1980,
0.1856, 0.1953]], grad_fn=)
tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, -0.2616, 0.2357, 0.1308, -0.0118,
1.7659, 0.2078, 0.2620, -0.1643, -0.8464, 0.0201, 0.0702, 0.3978,
0.1528, -0.2021, -1.6184, -0.5433, -0.1786, 0.5389, 0.4987, -0.1017,
0.6626, -1.7051, 0.0572, -0.3241, -0.6683, 0.2665, 2.8420, 0.2684,
-0.5954, -0.5004, 1.5199, 0.0396, 1.6659, 0.9976, -0.5597, -0.7049,
-0.0309, -0.2830, -0.1356, 0.6429, 0.4149, 1.2362, 0.7659, 0.9780,
0.5851, -0.3018]], grad_fn=)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0.]], grad_fn=)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0.]], grad_fn=)
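
Consistent with the zero vectors printed above, a word that was marked no_create_entry and is absent from the pre-trained table simply points at the unk entry (a quick check continuing from the block above):

print(torch.equal(embed(torch.LongTensor([vocab.to_index('only_in_test')])),
                  embed(torch.LongTensor([vocab.unknown_idx]))))    # expected: True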

2. Using embeddings

(1) Using a randomly initialized embedding

from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())
embed = StaticEmbedding(vocab,model_dir_or_name = None,embedding_dim = 30)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

torch.Size([1, 5, 30])

(2) ELMo Embedding

from fastNLP.embeddings import ElmoEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())
embed = ElmoEmbedding(vocab,model_dir_or_name = 'en-small',requires_grad = False)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print('elmo:',embed(words).size())
# output the embeddings of layers 1 and 2 (concatenated)
embed = ElmoEmbedding(vocab,model_dir_or_name = 'en-small',requires_grad = False,layers = '1,2')
print('输出多层embedding结果:',embed(words).size())
# a learned weighted mix of all ELMo layers
embed = ElmoEmbedding(vocab,model_dir_or_name = 'en-small',requires_grad = True,layers = 'mix')
print('多层加权融合embedding:',embed(words).size())

22 out of 22 characters were found in pretrained elmo embedding.
elmo: torch.Size([1, 5, 256])
22 out of 22 characters were found in pretrained elmo embedding.
输出多层embedding结果: torch.Size([1, 5, 512])
22 out of 22 characters were found in pretrained elmo embedding.
多层加权融合embedding: torch.Size([1, 5, 256])

(3) Bert Embedding

from fastNLP.embeddings import BertEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst('this is a demo .'.split())
words = torch.LongTensor([[vocab.to_index(word) for word in 'this is a demo .'.split()]])
# use the outputs of the last two layers
embed = BertEmbedding(vocab,model_dir_or_name = 'en-base-cased',layers = '10,11',requires_grad=False)
print('bert的最后两层输出:',embed(words).size())
# keep the [CLS]/[SEP] positions
embed = BertEmbedding(vocab,model_dir_or_name = 'en-base-cased',layers = '-1',include_cls_sep = True)
print(embed(words).size())    # the sequence dimension grows by 2 ([CLS] and [SEP])
print('cls表示为:',embed(words)[:,0].size())

loading vocabulary file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\vocab.txt
Load pre-trained BERT parameters from file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\pytorch_model.bin.
Bert Model will return 2 layers (layer-0 is embedding result): [10, 11]
bert的最后两层输出: torch.Size([1, 5, 1536])
loading vocabulary file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\vocab.txt
Load pre-trained BERT parameters from file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\pytorch_model.bin.
Bert Model will return 1 layers (layer-0 is embedding result): [-1]
torch.Size([1, 7, 768])
cls表示为: torch.Size([1, 768])

(4) Using pool_method

'''
In the English BERT model, a single word may be split into several subwords; for example "fairness"
is tokenized into ["fair", "##ness"], so one word corresponds to two outputs. BertEmbedding uses a
pooling method to merge the subword representations of a word back into a single vector.
pool_method controls this pooling and supports "first" (use the representation of "fair" as the
representation of "fairness"), "last" (use the representation of "##ness"), "max" (element-wise max
over "fair" and "##ness"), and "avg" (element-wise average over "fair" and "##ness").
'''
embed = BertEmbedding(vocab,model_dir_or_name = 'en-base-cased',layers = '-1',pool_method = 'max')
print(embed(words).size())

loading vocabulary file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\vocab.txt
Load pre-trained BERT parameters from file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\pytorch_model.bin.
Bert Model will return 1 layers (layer-0 is embedding result): [-1]
torch.Size([1, 5, 768])
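
The sentence above contains no word that BERT splits into subwords, so pool_method has no visible effect there. Below is a minimal sketch comparing "first" and "last" pooling on "fairness" (the word is taken from the explanation above; the sentence and the comparison are only illustrative, and downloading 'en-base-cased' is assumed):

from fastNLP.embeddings import BertEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst('fairness matters .'.split())
words = torch.LongTensor([[vocab.to_index(w) for w in 'fairness matters .'.split()]])
embed_first = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='first')
embed_last = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='last')
# "fairness" -> ["fair", "##ness"], so the two pooling strategies should give different vectors for it
print(torch.allclose(embed_first(words)[0, 0], embed_last(words)[0, 0]))    # expected: False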

"""
根据 BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding , 
Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,
前一句话的token embedding为0, 后一句话的token embedding为1。
BertEmbedding能够自动识别句子中间的[SEP]来正确设置对应的token_type_id的。

在多个[SEP]的情况下,将会使token_type_id不断0,1循环。
比如"first sentence [SEP] second sentence [SEP] third sentence",
它们的 token_type_id将是[0, 0, 0, 1, 1, 1, 0, 0]。
但请注意[SEP]一定要大写的,不能是[sep],否则无法识别。
"""
vocab = Vocabulary()
vocab.add_word_lst("this is a demo . [SEP] another sentence .".split())
embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo . [SEP] another sentence .".split()]])
print(embed(words).size())

loading vocabulary file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\vocab.txt
Load pre-trained BERT parameters from file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\pytorch_model.bin.
Bert Model will return 1 layers (layer-0 is embedding result): [-1]
torch.Size([1, 9, 768])

(5) Using character-level embeddings

# CNN
from fastNLP.embeddings import CNNCharEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst(['this','is','a','demo'])
embed = CNNCharEmbedding(vocab,embed_size = 64,char_emb_size = 50)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print('cnn char embedding:',embed(words).size())

Start constructing character vocabulary.
In total, there are 8 distinct characters.
cnn char embedding: torch.Size([1, 5, 64])

from fastNLP.embeddings import LSTMCharEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())
embed = LSTMCharEmbedding(vocab,embed_size = 64,char_emb_size = 50)
words = torch.LongTensor([[vocab.to_index(word) for word in 'this is a demo .'.split()]])
print(embed(words).size())

Start constructing character vocabulary.
In total, there are 8 distinct characters.
torch.Size([1, 5, 64])

(6) Concatenating embeddings (StackEmbedding)

from fastNLP.embeddings import StaticEmbedding,StackEmbedding,CNNCharEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())
word_embed = StaticEmbedding(vocab,model_dir_or_name = 'en-glove-6b-50d')
char_embed = CNNCharEmbedding(vocab,embed_size = 64,char_emb_size = 50)
embed = StackEmbedding([word_embed,char_embed])
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())  # the output embedding dimension is 50+64=114

Found 5 out of 7 words in the pre-training embedding.
Start constructing character vocabulary.
In total, there are 8 distinct characters.
torch.Size([1, 5, 114])

(7) Controlling whether weights are updated

from fastNLP.embeddings import *
from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())
embed = BertEmbedding(vocab,model_dir_or_name = 'en-base-cased',requires_grad = True)    # trainable when constructed
embed.requires_grad = False    # freeze all of the embedding's parameters afterwards
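
To confirm the switch takes effect, the trainable parameter tensors can be counted with plain PyTorch (a minimal sketch continuing from the block above):

print(sum(p.requires_grad for p in embed.parameters()))    # should be 0 after the freeze above
embed.requires_grad = True
print(sum(p.requires_grad for p in embed.parameters()))    # > 0 again once unfrozen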

(8) Word casing

# how fastNLP handles word casing
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary
vocab = Vocabulary().add_word_lst("The the a A".split())
embed = StaticEmbedding(vocab,model_dir_or_name=None,embedding_dim=5)
print('--------区分大小------------------')
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))
print('--------一律转成小写------------------')
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

--------区分大小------------------
tensor([[ 0.5718, -0.1065, 0.7637, 0.2195, -0.5307]],
grad_fn=)
tensor([[-0.5218, 0.2985, 0.5048, 0.1330, -0.5379]],
grad_fn=)
--------一律转成小写------------------
StaticEmbedding will ignore model_dir_or_name, and randomly initialize embedding with dimension 5. If you want to use pre-trained embedding, set embedding_dim to 0.
All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.
tensor([[ 0.6252, -0.1459, -0.3673, 0.6399, -0.6080]],
grad_fn=)
tensor([[ 0.6252, -0.1459, -0.3673, 0.6399, -0.6080]],
grad_fn=)

# how fastNLP handles min_freq
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a".split())
#  a randomly initialized StaticEmbedding is used for the demo below; the behavior is the same with a pre-trained one
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))   # same entry as unk
print(embed(torch.LongTensor([vocab.unknown_idx])))  

StaticEmbedding will ignore model_dir_or_name, and randomly initialize embedding with dimension 5. If you want to use pre-trained embedding, set embedding_dim to 0.
1 words have frequency less than 2.
tensor([[-0.0727, -0.3303, -0.3358, -0.5523, -0.1097]],
grad_fn=)
tensor([[-0.7554, 0.7275, 0.2864, -0.4711, -0.5800]],
grad_fn=)
tensor([[-0.7554, 0.7275, 0.2864, -0.4711, -0.5800]],
grad_fn=)
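
A quick check (continuing from the block above) that the low-frequency word really shares the unk vector:

print(torch.equal(embed(torch.LongTensor([vocab.to_index('a')])),
                  embed(torch.LongTensor([vocab.unknown_idx]))))    # expected: True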

# the combined effect of min_freq and lower
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a A".split())
#  a randomly initialized StaticEmbedding is used for the demo below; the behavior is the same with a pre-trained one
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.to_index('A')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

StaticEmbedding will ignore model_dir_or_name, and randomly initialize embedding with dimension 5. If you want to use pre-trained embedding, set embedding_dim to 0.
0 words have frequency less than 2.
All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.
tensor([[ 0.0705, -0.1956, 0.1031, 0.0911, 0.5606]],
grad_fn=)
tensor([[-0.2584, -0.6060, -0.4661, 0.4403, -0.3481]],
grad_fn=)
tensor([[-0.2584, -0.6060, -0.4661, 0.4403, -0.3481]],
grad_fn=)
tensor([[-0.0139, -0.2810, -0.2592, 0.4572, -0.2768]],
grad_fn=)
