


Transformer充分发挥了注意力机制的潜力，基于Transformer架构的模型（BERT、GPT、PLATO、ERNIE等）在NLP领域取得了非常巨大的成功。然而，由于Transformer需要为输入序列构建一个边长与序列长度相等的注意力矩阵，其内存占用和计算量会随着序列长度的增加呈平方增长，这使得长序列的处理代价很大。Performer使用谷歌提出的FAVOR+算法（Fast Attention Via positive Orthogonal Random features）为注意力机制提供了一种可扩展的低方差、无偏估计，同时保证空间和时间复杂度接近线性。





      -------paddlepaddle  // Paddle代码

           ----------fast_attention.py  // Performer核心代码
           ----------fast_attention_test.py  // 简单测试代码
           ----------transformer.py  // 基于Performer实现的Transformer
				----------bert  // 基于Performer实现的Bert模型
       --------tf2  // Performer源代码

!unzip -o Performers.zip

如下，调用fast_attention_test.py，我们可以看到Performer模型相对于Transformer的加速效果。其中加速程度用speed up ratio来表示，即运行完单个注意力模块所花费的时间之比（time_transformer / time_performer）。从结果可以看出，当序列长度length > 1000时，Performer可以发挥非常可观的加速效果；但在1000以下时，加速效果并不显著，特别是在序列长度本来就不长时，速度甚至慢于Transformer。因此，是否使用该模型进行加速需要根据实际需求决定；事实上，在实际应用中序列长度超过1000的情况并不多见。

from Performers.paddlepaddle import fast_attention_test
import prettytable

# Test helper taken from the bundled fast_attention_test module.
test = fast_attention_test.TransformerLayersTest()

# Sequence lengths to benchmark; one table column is produced per entry.
lengthes = [128, 256, 512, 1024, 2048, 4096, 8192]
# Speed-up ratios, formatted below with one value per sequence length.
ratio = []
for l in lengthes:
    # NOTE(review): the loop body is missing from this copy of the file, so
    # `ratio` is never filled. Presumably each iteration should measure the
    # speed-up for length `l` with `test` and append it to `ratio` -- TODO
    # restore from the original notebook.

# Render the collected ratios as a two-row table (header row = lengths).
table = prettytable.PrettyTable()
table.field_names = ['length'] + [str(l) for l in lengthes]
table.add_row(['speed up ratio'] + ['%.5f'%r for r in ratio])
|     length     |   128   |   256   |   512   |   1024  |   2048  |   4096   |   8192   |
| speed up ratio | 0.15373 | 0.35420 | 0.91381 | 3.22711 | 7.37718 | 14.36226 | 28.28287 |




# 下载诗歌数据集 (从镜像网站github.com.cnpmjs.org下载可提高下载速度)
!git clone https://github.com.cnpmjs.org/chinese-poetry/chinese-poetry
# 下载繁体转简体工具
!git clone https://github.com.cnpmjs.org/fiyen/cht2chs
fatal: destination path 'chinese-poetry' already exists and is not an empty directory.
fatal: destination path 'cht2chs' already exists and is not an empty directory.
import os
import json
import re
from cht2chs.langconv import cht_to_chs

def sentenceParse(para):
    """Strip editorial markup from a raw poem string.

    The following are removed, in this order:

    * parenthesised annotations, with full-width or ASCII parentheses,
      together with their content;
    * ``{...}`` spans and ``《...》`` titles together with their content;
    * square-bracket characters (the text between them is kept);
    * ASCII digits and hyphens;
    * doubled full stops, which are collapsed into one.

    All span patterns are greedy: on a single line they remove everything
    from the first opening mark to the last closing mark.

    Args:
        para: Raw poem text.

    Returns:
        The cleaned poem text.
    """
    # The parentheses have to be matched literally. An unescaped ``(.*)`` is
    # a capture group that matches -- and therefore deletes -- the whole line.
    result = re.sub(r"（.*）|\(.*\)", "", para)
    result = re.sub(r"\{.*\}", "", result)
    result = re.sub(r"《.*》", "", result)
    result = re.sub(r"[\]\[]", "", result)
    # Drop digits and hyphens in one pass instead of rebuilding a set and
    # concatenating strings for every character.
    result = result.translate(str.maketrans("", "", "0123456789-"))
    return re.sub("。。", "。", result)

def data_preprocess(poem_dir='./chinese-poetry/json', len_limit=120):
    """Load poems from JSON files, clean them and convert them to simplified Chinese.

    Args:
        poem_dir: Directory that holds the ``*.json`` poem files. Each file is
            expected to contain a list of objects with a ``paragraphs`` list.
        len_limit: Maximum length, in characters after cleaning, of a poem
            that is kept.

    Returns:
        A list of cleaned poem strings.
    """
    poems = []
    # Only the first directory entry is considered, exactly as before; if that
    # entry is not a JSON file the result is empty.
    for f in os.listdir(poem_dir)[:1]:
        if not f.endswith('.json'):
            continue
        # Close the file deterministically and do not depend on the platform
        # default encoding when reading Chinese text.
        with open(os.path.join(poem_dir, f), encoding='utf-8') as fp:
            json_data = json.load(fp)
        for d in json_data:
            poem = sentenceParse(''.join(d['paragraphs']))
            # Enforce the length limit and convert traditional characters to
            # simplified ones.
            # NOTE(review): the body of this branch was missing in the source;
            # it is restored from the accompanying comment and the
            # `cht_to_chs` import -- confirm against the original notebook.
            if len(poem) <= len_limit:
                poems.append(cht_to_chs(poem))
    return poems
from paddlenlp.transformers import BertTokenizer

# Tokenizer loaded from the 'bert-base-chinese' pretrained resources; it is
# passed to PoemData further down to encode the poems.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
import paddle
from paddle.io import Dataset
import numpy as np

class PoemData(Dataset):
    """Dataset that turns poems into next-token-prediction samples.

    Args:
        poems (list): Poems as raw, un-encoded strings, one poem per element.
        tokenizer: Object whose ``encode(text)`` returns a mapping with the
            keys ``'input_ids'`` and ``'token_type_ids'`` (both lists).
        max_len: Maximum accepted poem length; every returned array has
            exactly this length.
    """

    def __init__(self, poems, tokenizer, max_len=128):
        super(PoemData, self).__init__()
        self.poems = poems
        self.tokenizer = tokenizer
        self.max_len = max_len

    def _pad(self, seq):
        """Right-pad ``seq`` with zeros and cut it to ``max_len`` (int64 array)."""
        return np.array((seq + [0] * self.max_len)[:self.max_len], dtype='int64')

    def __getitem__(self, idx):
        """Encode poem ``idx``.

        Returns:
            A 5-tuple ``(input_token, input_token_type, input_pad_mask,
            label_token, input_pad_mask)``. The label is the input shifted
            left by one token. The pad mask appears twice so that it can be
            used both as a model input and as a loss argument.
        """
        # Read from the instance, not from a module-level `poems` variable.
        line = self.poems[idx]
        encoded = self.tokenizer.encode(line)
        token, token_type = encoded['input_ids'], encoded['token_type_ids']
        if len(token) > self.max_len + 1:
            # Keep the first `max_len` ids plus the final id. The final id is
            # wrapped in a list because list + int raises TypeError.
            token = token[:self.max_len] + [token[-1]]
            token_type = token_type[:self.max_len] + [token_type[-1]]
        input_token, input_token_type = token[:-1], token_type[:-1]
        label_token = self._pad(token[1:])
        # Pad the inputs.
        input_token = self._pad(input_token)
        input_token_type = self._pad(input_token_type)
        # 1.0 where the token id is non-zero, 0.0 on zero-padded positions.
        input_pad_mask = (input_token != 0).astype('float32')
        return input_token, input_token_type, input_pad_mask, label_token, input_pad_mask

    def __len__(self):
        return len(self.poems)




from paddle.nn import Layer, Linear, Softmax
from paddlenlp.transformers import BertModel as BertT
from paddlenlp.transformers import BertForTokenClassification as BertClassT

class PoetryBertModelT(Layer):
    """Poem model built on the PaddleNLP BERT implementation.

    A pretrained BERT is wrapped in a token-classification head whose size is
    the vocabulary size. The attention mask combines the padding mask with a
    lower-triangular mask, so a position cannot attend to later positions.

    Args:
        pretrained_bert_model: Name passed to ``from_pretrained``.
        input_length: Sequence length used to pre-build the default
            lower-triangular mask.
    """

    def __init__(self, pretrained_bert_model: str, input_length: int):
        super(PoetryBertModelT, self).__init__()
        bert_model = BertT.from_pretrained(pretrained_bert_model)
        # The word-embedding weight is unpacked as (vocab_size, hidden_size).
        self.vocab_size, self.hidden_size = bert_model.embeddings.word_embeddings.parameters()[0].shape
        # The vocabulary size is passed as the second constructor argument
        # (presumably the number of output classes -- confirm against the API).
        self.bert_for_class = BertClassT(bert_model, self.vocab_size)
        # Lower-triangular matrix used to hide the tokens that follow a position.
        self.sequence_length = input_length
        self.lower_triangle_mask = paddle.tril(paddle.tensor.full((input_length, input_length), 1, 'float32'))

    def forward(self, token, token_type, input_mask, input_length=None):
        """Run the classification head with a causal, padding-aware mask.

        Args:
            token: Token ids.
            token_type: Token type ids.
            input_mask: 2-D (batch, sequence) mask of valid positions.
            input_length: Optional sequence length. When given, a triangular
                mask of that size is built for this call; otherwise the mask
                created in ``__init__`` is used.

        Returns:
            The output of the wrapped token-classification model.
        """
        # Build the pairwise padding mask: (batch, seq, 1) @ (batch, 1, seq).
        mask_left = paddle.reshape(input_mask, input_mask.shape + [1])
        mask_right = paddle.reshape(input_mask, [input_mask.shape[0], 1, input_mask.shape[1]])
        # Valid positions of the input sentence.
        mask_left = paddle.cast(mask_left, 'float32')
        mask_right = paddle.cast(mask_right, 'float32')
        attention_mask = paddle.matmul(mask_left, mask_right)
        # Valid positions for the attention computation. Without the `else`
        # the freshly built mask was always overwritten, and the name was
        # unbound whenever `input_length` was None.
        if input_length is not None:
            lower_triangle_mask = paddle.tril(paddle.tensor.full((input_length, input_length), 1, 'float32'))
        else:
            lower_triangle_mask = self.lower_triangle_mask
        attention_mask = attention_mask * lower_triangle_mask
        # Invalid positions get a very large negative value; a new axis is
        # inserted at position 1.
        attention_mask = (1 - paddle.unsqueeze(attention_mask, axis=[1])) * -1e10
        attention_mask = paddle.cast(attention_mask, self.bert_for_class.parameters()[0].dtype)

        output_logits = self.bert_for_class(token, token_type_ids=token_type, attention_mask=attention_mask)
        return output_logits
from paddle.nn import Layer, Linear, Softmax
from Performers.paddlepaddle.bert.modeling import BertModel as BertP
from Performers.paddlepaddle.bert.modeling import BertForTokenClassification as BertClassP

class PoetryBertModelP(Layer):
    """Poem model built on the Performer-based BERT implementation.

    A pretrained BERT is wrapped in a token-classification head whose size is
    the vocabulary size. The attention mask combines the padding mask with a
    lower-triangular mask, so a position cannot attend to later positions.

    Args:
        pretrained_bert_model: Name passed to ``from_pretrained``.
        input_length: Sequence length used to pre-build the default
            lower-triangular mask.
    """

    def __init__(self, pretrained_bert_model: str, input_length: int):
        super(PoetryBertModelP, self).__init__()
        bert_model = BertP.from_pretrained(pretrained_bert_model)
        # The word-embedding weight is unpacked as (vocab_size, hidden_size).
        self.vocab_size, self.hidden_size = bert_model.embeddings.word_embeddings.parameters()[0].shape
        # The vocabulary size is passed as the second constructor argument
        # (presumably the number of output classes -- confirm against the API).
        self.bert_for_class = BertClassP(bert_model, self.vocab_size)
        # Lower-triangular matrix used to hide the tokens that follow a position.
        self.sequence_length = input_length
        self.lower_triangle_mask = paddle.tril(paddle.tensor.full((input_length, input_length), 1, 'float32'))

    def forward(self, token, token_type, input_mask, input_length=None):
        """Run the classification head with a causal, padding-aware mask.

        Args:
            token: Token ids.
            token_type: Token type ids.
            input_mask: 2-D (batch, sequence) mask of valid positions.
            input_length: Optional sequence length. When given, a triangular
                mask of that size is built for this call; otherwise the mask
                created in ``__init__`` is used.

        Returns:
            The output of the wrapped token-classification model.
        """
        # Build the pairwise padding mask: (batch, seq, 1) @ (batch, 1, seq).
        mask_left = paddle.reshape(input_mask, input_mask.shape + [1])
        mask_right = paddle.reshape(input_mask, [input_mask.shape[0], 1, input_mask.shape[1]])
        # Valid positions of the input sentence.
        mask_left = paddle.cast(mask_left, 'float32')
        mask_right = paddle.cast(mask_right, 'float32')
        attention_mask = paddle.matmul(mask_left, mask_right)
        # Valid positions for the attention computation. Without the `else`
        # the freshly built mask was always overwritten, and the name was
        # unbound whenever `input_length` was None.
        if input_length is not None:
            lower_triangle_mask = paddle.tril(paddle.tensor.full((input_length, input_length), 1, 'float32'))
        else:
            lower_triangle_mask = self.lower_triangle_mask
        attention_mask = attention_mask * lower_triangle_mask
        # Invalid positions get a very large negative value; a new axis is
        # inserted at position 1.
        attention_mask = (1 - paddle.unsqueeze(attention_mask, axis=[1])) * -1e10
        attention_mask = paddle.cast(attention_mask, self.bert_for_class.parameters()[0].dtype)

        output_logits = self.bert_for_class(token, token_type_ids=token_type, attention_mask=attention_mask)
        return output_logits
class PoetryBertModelLossCriterion(Layer):
    """Cross-entropy loss weighted by the input mask.

    The un-reduced cross entropy (label id 0 ignored) is multiplied by
    ``input_mask``, averaged over axis 0 and then summed to a single value.
    """

    def forward(self, pred_logits, label, input_mask):
        """Return the masked loss for ``pred_logits`` against ``label``."""
        per_position = paddle.nn.functional.cross_entropy(
            pred_logits,
            label,
            ignore_index=0,
            reduction='none',
        )
        # NOTE(review): assumes the un-reduced loss broadcasts against
        # `input_mask` as intended -- confirm the shapes.
        return paddle.sum(paddle.mean(per_position * input_mask, axis=0))



from paddle.static import InputSpec
from paddlenlp.metrics import Perplexity
from paddle.optimizer import AdamW

# Sequence length shared by both models, the input specs and the dataset.
# NOTE(review): confirm the pretrained checkpoint supports this many positions.
length = 1024

net_t = PoetryBertModelT('bert-base-chinese', length)
net_p = PoetryBertModelP('bert-base-chinese', length)

# Static input descriptions: a variable batch dimension, fixed sequence length.
token_ids = InputSpec((-1, length), 'int64', 'token')
token_type_ids = InputSpec((-1, length), 'int64', 'token_type')
input_mask = InputSpec((-1, length), 'float32', 'input_mask')
label = InputSpec((-1, length), 'int64', 'label')

inputs = [token_ids, token_type_ids, input_mask]
labels = [label, input_mask]


def _build_model(net):
    """Wrap ``net`` in a ``paddle.Model``, prepare it for training and print its summary."""
    model = paddle.Model(net, inputs, labels)
    optimizer = AdamW(learning_rate=0.0001, parameters=model.parameters())
    model.prepare(optimizer=optimizer, loss=PoetryBertModelLossCriterion(), metrics=[Perplexity()])
    model.summary(inputs, [spec.dtype for spec in inputs])
    return model


model_t = _build_model(net_t)

model_p = _build_model(net_p)

from paddle.io import DataLoader

# Load and clean the poems, keeping those no longer than the model length.
poems = data_preprocess(len_limit=length)

# Batches of two shuffled samples, used for training.
train_dataset = PoemData(poems, bert_tokenizer, length)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Train the standard-attention model for one epoch.
model_t.fit(train_data=train_loader, epochs=1, verbose=2)
The loss value printed in the log is the current step, and the metric is the average value of previous step.
Epoch 1/1
Epoch 1/1

# Train the Performer-based model for one epoch on the same data loader.
model_p.fit(train_data=train_loader, epochs=1, verbose=2)
The loss value printed in the log is the current step, and the metric is the average value of previous step.
Epoch 1/1
Epoch 1/1
