  1. 安装
pip install transformers
  1. 包含的模型结构

Want to contribute a new model? We have added a detailed guide and templates to guide you in the process of adding a new model. You can find them in the templates folder of the repository. Be sure to check the contributing guidelines and contact the maintainers or open an issue to collect feedback before starting your PR.

  1. demo
    给了一个写作的demo 本部分略
  2. 快速预览


import torch
from transformers import *

# Transformers has a unified API
# for 10 transformer architectures and 30 pretrained weights.
# Each entry groups a model class, its tokenizer class and the shortcut name
# of a pretrained checkpoint; the loop over MODELS unpacks entries in this order.
#          Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          (CTRLModel,       CTRLTokenizer,       'ctrl'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]  # closing bracket was missing, which made the whole script a SyntaxError
# 上面是所包含的模型,比较常用的还是BertModel
# 如果用的是TF2.0 需要将Model的名字前加上"TF"
# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`

# 下面展示了使用预训练模型将文本转化为嵌入隐态的方法 
# Let's encode some text in a sequence of hidden-states using each model:

# `tokenizer_class`, `model_class` and `pretrained_weights` are otherwise only
# bound by the `for ... in MODELS` loop later in the file, so using them here
# raised NameError. Bind them explicitly to keep this demo self-contained.
# NOTE(review): 'bert-base-chinese' is an assumption based on the Chinese
# sample sentence encoded below — confirm which checkpoint the author used.
model_class, tokenizer_class, pretrained_weights = BertModel, BertTokenizer, 'bert-base-chinese'
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# For example purposes. Not runnable.
# Each line shows a different way of calling `from_pretrained`; every call
# rebinds `model`, so only the last assignment survives.
model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
# The configuration attribute is spelled `output_attentions` (plural); the
# singular spelling is not a config field and would not update the config.
model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
assert model.config.output_attentions
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

# Encode one sentence into token ids, asking the tokenizer to add its special tokens.
sample_text = '前三季度,中部、西部地区社零总额增速分别为10.1%和7.8%,快于东部地区7.3%的增速。'
x = tokenizer.encode(sample_text, add_special_tokens=True)
# Result reported by the author: x = [101, 1184, ..., 102]

# Feed the ids to the model with PyTorch: wrap them in an outer list (batch of
# one) and convert to a tensor first.
input_demo = torch.tensor([x])
# Result reported by the author: input_demo = tensor([[ 101, 1184, ..., 102]])
with torch.no_grad():  # forward pass only, no gradient tracking
    output_demo = model(input_demo)
# Sizes reported by the author:
#   output_demo[0] -> torch.Size([1, 47, 768])
#   output_demo[1] -> torch.Size([1, 768])
# Run the same encode-and-forward steps for every entry in MODELS.
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Build the tokenizer and the model from the same checkpoint name
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # add_special_tokens=True takes care of adding [CLS], [SEP], <s>... tokens
    # in the right way for each model.
    token_ids = tokenizer.encode("Here is some text to encode", add_special_tokens=True)
    input_ids = torch.tensor([token_ids])  # outer list makes a batch of one
    with torch.no_grad():
        outputs = model(input_ids)  # model outputs are tuples
    last_hidden_states = outputs[0]

# Every architecture comes with several classes for fine-tuning on
# down-stream tasks; these are the BERT ones.
BERT_MODEL_CLASSES = [
    BertModel,
    BertForPreTraining,
    BertForMaskedLM,
    BertForNextSentencePrediction,
    BertForSequenceClassification,
    BertForTokenClassification,
    BertForQuestionAnswering,
]

# All the classes for an architecture can be initiated from pretrained weights
# for this architecture.
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task.
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# Walk through every BERT class and show the shared loading, tracing and
# serialization API.
for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained(pretrained_weights)

    # Models can return full list of hidden-states & attentions weights at each layer.
    # The call below was left unclosed in the original; both flags are needed
    # because the last two outputs are unpacked right after.
    model = model_class.from_pretrained(pretrained_weights,
                                        output_hidden_states=True,
                                        output_attentions=True)
    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
    all_hidden_states, all_attentions = model(input_ids)[-2:]

    # Models are compatible with Torchscript
    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
    traced_model = torch.jit.trace(model, (input_ids,))

    # Simple serialization for models and tokenizers
    model.save_pretrained('./directory/to/save/')  # save
    model = model_class.from_pretrained('./directory/to/save/')  # re-load
    tokenizer.save_pretrained('./directory/to/save/')  # save
    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load

    # SOTA examples for GLUE, SQUAD, text generation...
  1. 快速预览pytorch和tf2.0的互用性
import tensorflow as tf
import tensorflow_datasets
from transformers import *

# Tokenizer, TF 2.0 sequence-classification model and the GLUE/MRPC data,
# all loaded from pretrained names.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')  # this loads the TF 2.0 model
data = tensorflow_datasets.load('glue/mrpc')

# Turn each split into GLUE features, then shuffle/batch/repeat the training
# split and batch the validation split.
train_dataset = glue_convert_examples_to_features(
    data['train'], tokenizer, max_length=128, task='mrpc'
)
valid_dataset = glue_convert_examples_to_features(
    data['validation'], tokenizer, max_length=128, task='mrpc'
)
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)

# Training setup: optimizer, loss and metric handed to the Keras `compile` call
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

# Train and evaluate with the Keras `fit` call
history = model.fit(
    train_dataset,
    epochs=2,
    steps_per_epoch=115,
    validation_data=valid_dataset,
    validation_steps=7,
)

# Load the TensorFlow model in PyTorch for inspection.
# The trained TF model has to be written to './save/' first; the original
# snippet loaded from that directory without ever saving into it.
model.save_pretrained('./save/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
# Author's note: inference and testing are faster with PyTorch

# Spot-check a few predictions. MRPC is a paraphrasing task, so see whether
# the model picked it up on two sentence pairs.
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')

# First element of the model output, reduced to a single predicted index
scores_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0]
pred_1 = scores_1.argmax().item()
scores_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0]
pred_2 = scores_2.argmax().item()

# A truthy prediction is reported as "a paraphrase"
verdict_1 = "a paraphrase" if pred_1 else "not a paraphrase"
verdict_2 = "a paraphrase" if pred_2 else "not a paraphrase"
print("sentence_1 is", verdict_1, "of sentence_0")
print("sentence_2 is", verdict_2, "of sentence_0")
  1. fine-tuning/usage scripts
  2. pipelines的使用
    Pipelines are high-level objects which automatically handle tokenization, running your data through a transformers model and outputting the result in a structured object.
    feature-extraction: Generates a tensor representation for the input sequence
    ner: Generates named entity mapping for each word in the input sequence.
    sentiment-analysis: Gives the polarity (positive / negative) of the whole input sequence.
    text-classification: Initialize a TextClassificationPipeline directly, or see sentiment-analysis for an example.
    question-answering: Provided some context and a question referring to the context, it extracts the answer to the question from the context.
    fill-mask: Takes an input sequence containing a masked token (e.g. `<mask>`) and returns a list of the most probable filled sequences, with their probabilities.

第一、fill mask



# Local paths to the downloaded weights and their configuration file
BERT_MODEL_PATH = "D:/xx/pytorch_model.bin"
BERT_CONFIG_PATH = "D:/xx/config.json"

# NOTE(review): the original call was cut off after the task name and never
# closed. The arguments below are reconstructed from the two path constants
# above and from the options the author's comments mention (framework, topk);
# confirm them against the author's setup.
model = pipeline('fill-mask',
                 model=BERT_MODEL_PATH,
                 config=BERT_CONFIG_PATH,
                 tokenizer='bert-base-chinese',  # assumed: a tokenizer must be named explicitly when `model` is a weights file — TODO confirm
                 framework='pt',  # 'pt' or 'tf', chosen according to the downloaded model
                 topk=3)  # topk is an int: how many predictions are returned

# Mask marker taken from the tokenizer of the pipeline created above
mask = model.tokenizer.mask_token

# Sentence whose blank the model should fill in: "我今天很_乐"
test_sentence = ''.join(('我今天很', mask, '乐'))

[{'sequence': '[CLS] 我 今 天 很 快 乐 [SEP]',
  'score': 0.998837411403656,
  'token': 2571},
 {'sequence': '[CLS] 我 今 天 很 欢 乐 [SEP]',
  'score': 0.0009607075480744243,
  'token': 3614},
 {'sequence': '[CLS] 我 今 天 很 喜 乐 [SEP]',
  'score': 3.116693551419303e-05,
  'token': 1599}]

# Two masked sentences collected in one list
test_sentence2 = [
    ''.join(('我今天很', mask, '乐')),
    ''.join(('但是天', mask, '很热')),
]
[[{'sequence': '[CLS] 我 今 天 很 快 乐 [SEP]',
   'score': 0.998837411403656,
   'token': 2571},
  {'sequence': '[CLS] 我 今 天 很 欢 乐 [SEP]',
   'score': 0.0009607103420421481,
   'token': 3614},
  {'sequence': '[CLS] 我 今 天 很 喜 乐 [SEP]',
   'score': 3.1167113775154576e-05,
   'token': 1599}],
 [{'sequence': '[CLS] 但 是 天 气 很 热 [SEP]',
   'score': 0.9807906150817871,
   'token': 3698},
  {'sequence': '[CLS] 但 是 天 天 很 热 [SEP]',
   'score': 0.0060727293603122234,
   'token': 1921},
  {'sequence': '[CLS] 但 是 天 也 很 热 [SEP]',
   'score': 0.0018581727053970098,
   'token': 738}]]

