import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# The tokenizer must be created before it is used below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "I don't really like working, but I need the money!"
tokens = tokenizer.tokenize(text)
print(tokens)
dic = tokenizer(text, padding="max_length", truncation=True, max_length=39, return_tensors="pt")
# tokens ['i', 'don', "'", 't', 'really', 'like', 'working', ',', 'but', 'i', 'need', 'the', 'money', '!']
#'input_ids': [ 101, 1045, 2123, 1005, 1056, 2428, 2066, 2551, 1010, 2021, 1045, 2342,
# 1996, 2769, 999, 102, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
# This shows that the tokenizer does the word-piece splitting itself when producing input_ids. If you only want the tokens, calling .tokenize is enough.
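# A quick sanity check (illustrative, not part of the original flow): convert the ids
# back to tokens/text to see how [CLS] (101), [SEP] (102) and [PAD] (0) are added.
ids = dic["input_ids"][0].tolist()                        # [max_length]
print(tokenizer.convert_ids_to_tokens(ids))               # ['[CLS]', 'i', 'don', "'", 't', ...]
print(tokenizer.decode(ids, skip_special_tokens=True))    # drops [CLS]/[SEP]/[PAD]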
text = ['I like he', 'like he', 'he but', 'but he', 'he like she', 'like', 'she like I', 'I he she']
# [bs]
dic = tokenizer(text, padding="max_length", truncation=True, max_length=10, return_tensors="pt")
input_ids = dic['input_ids']            # [bs, max_length]; 101 ([CLS]) is prepended and 102 ([SEP]) appended
token_type_ids = dic['token_type_ids'] # [bs, max_length]
attention_mask = dic['attention_mask']  # [bs, max_length]
# Just to see what these tensors actually look like:
# attention_mask
# [[1,1,1,1,1,0,0,0,0,0],
# [1,1,1,1,0,0,0,0,0,0]
# ....]
# input_ids
# [[101,1045,1001,2006,102,0,0,0,0,0],
# [101,1001,2006,102,0,0,0,0,0,0]
# ....]
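# Illustrative check (assumes the pad id is tokenizer.pad_token_id, which is 0 for
# bert-base-uncased): the attention mask is 1 for real tokens (including [CLS]/[SEP])
# and 0 for padding, so it can be recovered from input_ids.
assert torch.equal(attention_mask, (input_ids != tokenizer.pad_token_id).long())
print(attention_mask.sum(dim=1))   # number of non-padding tokens per sentence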
class PretrainedLanguageModel(nn.Module):
    def __init__(self, pretrained_language_model_name):
        super(PretrainedLanguageModel, self).__init__()
        self.model = AutoModel.from_pretrained(pretrained_language_model_name)

    def forward(self, input_ids, token_type_ids, attention_mask):
        output = self.model(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask).last_hidden_state
        return output
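# Optional variant (an assumption, not from the original snippet): if BERT is only used
# as a frozen feature extractor, its parameters can be frozen so that only layers added
# on top of it would be trained.
class FrozenPretrainedLanguageModel(PretrainedLanguageModel):
    def __init__(self, pretrained_language_model_name):
        super().__init__(pretrained_language_model_name)
        for p in self.model.parameters():
            p.requires_grad_(False)   # exclude the backbone from gradient updates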
model = PretrainedLanguageModel('bert-base-uncased')
output = model(input_ids, token_type_ids, attention_mask)
# [bs, max_length, hidden_dim]  The output always has this shape: one vector per token, i.e. the representation of the text.
print(input_ids.shape)
print(output.shape)
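# One common way (not part of the original snippet) to turn the per-token output into a
# single vector per sentence: mask out the padding positions and mean-pool over tokens.
mask = attention_mask.unsqueeze(-1).float()                          # [bs, max_length, 1]
sentence_emb = (output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_emb.shape)                                            # [bs, hidden_dim]
# Alternatively, take the hidden state at the [CLS] position: output[:, 0]  # [bs, hidden_dim]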