CTC decode: example Python implementation

"""
This is an example CTC decoder written in Python. The code is
intended to be a simple example and is not designed to be
especially efficient.
The algorithm is a prefix beam search for a model trained
with the CTC loss function.
For more details, check out either of these references:
  https://distill.pub/2017/ctc/#inference
  https://arxiv.org/abs/1408.2873
"""

import numpy as np
import math
import collections

NEG_INF = -float("inf")


def make_new_beam():
    fn = lambda: (NEG_INF, NEG_INF)
    return collections.defaultdict(fn)


def logsumexp(*args):
    """
    Stable log sum exp.
    """
    if all(a == NEG_INF for a in args):
        return NEG_INF
    a_max = max(args)
    lsp = math.log(sum(math.exp(a - a_max)
                       for a in args))
    return a_max + lsp


def decode(probs, beam_size=100, blank=0):
    """
    Performs inference for the given output probabilities.
    Arguments:
        probs: The output probabilities (e.g. post-softmax) for each
          time step. Should be an array of shape (time x output dim).
        beam_size (int): Size of the beam to use during inference.
        blank (int): Index of the CTC blank label.
    Returns the output label sequence and the corresponding negative
    log-likelihood estimated by the decoder.
    """
    T, S = probs.shape
    probs = np.log(probs)

    # Elements in the beam are (prefix, (p_blank, p_no_blank))
    # Initialize the beam with the empty sequence, a probability of
    # 1 for ending in blank and zero for ending in non-blank
    # (in log space).
    beam = [(tuple(), (0.0, NEG_INF))]

    for t in range(T):  # Loop over time

        # A default dictionary to store the next step candidates.
        next_beam = make_new_beam()

        for s in range(S):  # Loop over vocab
            p = probs[t, s]

            # The variables p_b and p_nb are respectively the
            # probabilities for the prefix given that it ends in a
            # blank and does not end in a blank at this time step.
            for prefix, (p_b, p_nb) in beam:  # Loop over beam

                # If we propose a blank the prefix doesn't change.
                # Only the probability of ending in blank gets updated.
                if s == blank:
                    n_p_b, n_p_nb = next_beam[prefix]
                    n_p_b = logsumexp(n_p_b, p_b + p, p_nb + p)
                    next_beam[prefix] = (n_p_b, n_p_nb)
                    continue

                # Extend the prefix by the new character s and add it to
                # the beam. Only the probability of not ending in blank
                # gets updated.
                end_t = prefix[-1] if prefix else None
                n_prefix = prefix + (s,)
                n_p_b, n_p_nb = next_beam[n_prefix]
                if s != end_t:
                    n_p_nb = logsumexp(n_p_nb, p_b + p, p_nb + p)
                else:
                    # We don't include the previous probability of not ending
                    # in blank (p_nb) if s is repeated at the end. The CTC
                    # algorithm merges characters not separated by a blank.
                    n_p_nb = logsumexp(n_p_nb, p_b + p)

                # *NB* this would be a good place to include an LM score.
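                # For example (hypothetical lm_score helper, alpha an LM weight):
                #   n_p_nb += alpha * lm_score(prefix, s)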
                next_beam[n_prefix] = (n_p_b, n_p_nb)

                # If s is repeated at the end we also update the unchanged
                # prefix. This is the merging case.
                if s == end_t:
                    n_p_b, n_p_nb = next_beam[prefix]
                    n_p_nb = logsumexp(n_p_nb, p_nb + p)
                    next_beam[prefix] = (n_p_b, n_p_nb)

        # Sort and trim the beam before moving on to the
        # next time-step.
        beam = sorted(next_beam.items(),
                      key=lambda x: logsumexp(*x[1]),
                      reverse=True)
        beam = beam[:beam_size]

    best = beam[0]
    return best[0], -logsumexp(*best[1])


if __name__ == "__main__":
    np.random.seed(3)

    time = 50
    output_dim = 20

    probs = np.random.rand(time, output_dim)
    probs = probs / np.sum(probs, axis=1, keepdims=True)

    labels, score = decode(probs)
    print("Score {:.3f}".format(score))
CTC (Connectionist Temporal Classification) and attention are two different approaches to sequence modeling, used in tasks such as speech recognition and natural language processing. Simple implementation examples follow.

1. CTC

CTC is a sequence modeling method that requires no frame-level alignment between inputs and labels; it is commonly used in tasks such as speech recognition and handwritten character recognition. The following is an example of a CTC-trained model implemented with TensorFlow:

```python
import tensorflow as tf
from tensorflow.keras import layers

# Define the model: two stacked LSTMs followed by a per-frame softmax.
def ctc_model(input_dim, output_dim, units=128):
    input = layers.Input(shape=(None, input_dim))
    lstm = layers.LSTM(units, return_sequences=True)(input)
    lstm = layers.LSTM(units, return_sequences=True)(lstm)
    output = layers.Dense(output_dim, activation='softmax')(lstm)
    model = tf.keras.Model(inputs=input, outputs=output)
    return model

# ctc_batch_cost takes (y_true, y_pred, input_length, label_length), so it
# cannot be passed to compile() directly; wrap it in a small loss function.
# For simplicity this assumes unpadded, full-length sequences.
def ctc_loss(y_true, y_pred):
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") * tf.ones((batch_len, 1), dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") * tf.ones((batch_len, 1), dtype="int64")
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

# Compile the model
model = ctc_model(input_dim=20, output_dim=10)
model.compile(loss=ctc_loss, optimizer='adam')

# Train the model (x_train, y_train, x_val, y_val are assumed to exist)
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10)
```

Here `ctc_batch_cost` is TensorFlow's built-in CTC loss function.

2. Attention

Attention is a mechanism used to strengthen the expressive power of sequence models. The following is an example of attention implemented with PyTorch:

```python
import torch
import torch.nn as nn

# A simple attention module that pools a sequence into a single context vector.
class Attention(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Attention, self).__init__()
        self.W = nn.Linear(input_dim, hidden_dim, bias=False)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, inputs):
        # inputs shape: (batch_size, seq_len, input_dim)
        e = torch.tanh(self.W(inputs))                        # (batch_size, seq_len, hidden_dim)
        a = torch.softmax(self.v(e).transpose(1, 2), dim=2)   # (batch_size, 1, seq_len)
        v = torch.bmm(a, inputs).squeeze(1)                   # (batch_size, input_dim)
        return v

class Seq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super(Seq2Seq, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(output_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, inputs, targets):
        # inputs shape: (batch_size, in_len, input_dim)
        # targets shape: (batch_size, out_len, output_dim)
        encoder_outputs, _ = self.encoder(inputs)
        decoder_outputs, _ = self.decoder(targets)
        seq_len = decoder_outputs.size(1)
        outputs = []
        for t in range(seq_len):
            # Attend over the encoder states, then combine the context with the
            # decoder state for this step before the output projection.
            context = self.attention(encoder_outputs)
            decoder_hidden = decoder_outputs[:, t, :]
            combined = torch.cat((decoder_hidden, context), dim=1)
            outputs.append(self.fc(combined))
        return torch.stack(outputs, dim=1)

# Instantiate the model
model = Seq2Seq(input_dim=20, output_dim=10, hidden_dim=128)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model (train_loader is assumed to yield one-hot targets)
for epoch in range(10):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs, targets[:, :-1, :])
        loss = criterion(outputs.reshape(-1, 10),
                         targets[:, 1:, :].argmax(dim=2).reshape(-1))
        loss.backward()
        optimizer.step()
```

Here `Attention` is a custom attention module and `Seq2Seq` is a sequence model built from LSTMs and attention. During training, the cross-entropy loss is used to compute the model's loss.
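After training, inference for the TensorFlow CTC model follows the same pattern as the standalone decoder at the top of this post. Below is a minimal sketch; the dummy input `x` and the reuse of `model` and `decode` are illustrative assumptions, not part of the original examples. Note that Keras' CTC ops treat the last class as the blank, while the standalone demo above uses index 0.

```python
import numpy as np
import tensorflow as tf

# Hypothetical input: one sequence of 50 frames with 20 features, matching
# ctc_model(input_dim=20, output_dim=10) above.
x = np.random.rand(1, 50, 20).astype("float32")
y_pred = model.predict(x)  # (1, 50, 10) per-frame softmax

# Option 1: Keras' built-in decoder (greedy here; beam search via greedy=False).
decoded, log_probs = tf.keras.backend.ctc_decode(
    y_pred, input_length=np.array([50]), greedy=True)
print(decoded[0].numpy())

# Option 2: the prefix beam search decode() from the top of this post, which
# expects a single (time x output_dim) probability matrix. blank=9 matches
# the last-class blank convention used by the Keras CTC ops.
labels, score = decode(y_pred[0], beam_size=100, blank=9)
print(labels, score)
```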
