# 【Learning Notes】CTC 原理及实现

31 篇文章 5 订阅
12 篇文章 0 订阅
8 篇文章 0 订阅

CTC（ Connectionist Temporal Classification，连接时序分类）是一种用于序列建模的工具，其核心是定义了特殊的目标函数/优化准则[1]。

jupyter notebook 版见 repo.

# 1. 算法

## 1.1 序列问题形式化。

$N_w: (\mathbb{R}^m)^T \to (\mathbb{R}^n)^T$

import numpy as np

np.random.seed(1111)

T, V = 12, 5
m, n = 6, V

x = np.random.random([T, m])  # T x m
w = np.random.random([m, n])  # weights, m x n

def softmax(logits):
max_value = np.max(logits, axis=1, keepdims=True)
exp = np.exp(logits - max_value)
exp_sum = np.sum(exp, axis=1, keepdims=True)
dist = exp / exp_sum
return dist

def toy_nw(x):
y = np.matmul(x, w)  # T x n
y = softmax(y)
return y

y = toy_nw(x)
print(y)
print(y.sum(1, keepdims=True))
[[ 0.24654511  0.18837589  0.16937668  0.16757465  0.22812766]
[ 0.25443629  0.14992236  0.22945293  0.17240658  0.19378184]
[ 0.24134404  0.17179604  0.23572466  0.12994237  0.22119288]
[ 0.27216255  0.13054313  0.2679252   0.14184499  0.18752413]
[ 0.32558002  0.13485564  0.25228604  0.09743785  0.18984045]
[ 0.23855586  0.14800386  0.23100255  0.17158135  0.21085638]
[ 0.38534786  0.11524603  0.18220093  0.14617864  0.17102655]
[ 0.21867406  0.18511892  0.21305488  0.16472572  0.21842642]
[ 0.29856607  0.13646801  0.27196606  0.11562552  0.17737434]
[ 0.242347    0.14102063  0.21716951  0.2355229   0.16393996]
[ 0.26597326  0.10009752  0.23362892  0.24560198  0.15469832]
[ 0.23337289  0.11918746  0.28540761  0.20197928  0.16005275]]
[[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]]

## 1.2 align-free 变长映射

$\mathcal{B}(aa\%bb\%\%cc) = \mathcal{B}(\%a\%b\%cc\%) = abc$（$\%$ 表示 blank）

$\mathcal{B}: L'^{T} \to L^{\le T}$

## 1.3 似然计算

$p(l|x) = \sum_{\pi \in \mathcal{B}^{-1}(l)} p(\pi|x)$

CTC 假设输出的概率是（相对于输入）条件独立的，因此有：

$p\left(\pi |x\right)=\prod {y}_{{\pi }_{t}}^{t},\mathrm{\forall }\pi \in {L}^{\mathrm{\prime }T}$

## 1.4 前向算法

$\alpha_t(s) \overset{\text{def}}{=} \sum_{\pi \in N^T:\ \mathcal{B}(\pi_{1:t}) = l'_{1:s}} \prod_{t'=1}^{t} y_{\pi_{t'}}^{t'}$

$\alpha_1(1) = y_b^1, \qquad \alpha_1(2) = y_{l_1}^1, \qquad \alpha_1(s) = 0,\ \forall s > 2$

$\alpha_t(s) = \begin{cases} \big(\alpha_{t-1}(s) + \alpha_{t-1}(s-1)\big)\, y_{l'_s}^{t}, & \text{if } l'_s = b \text{ or } l'_{s-2} = l'_s \\ \big(\alpha_{t-1}(s) + \alpha_{t-1}(s-1) + \alpha_{t-1}(s-2)\big)\, y_{l'_s}^{t}, & \text{otherwise} \end{cases}$

### 1.4.2 Case 1

def forward(y, labels):
T, V = y.shape
L = len(labels)
alpha = np.zeros([T, L])

# init
alpha[0, 0] = y[0, labels[0]]
alpha[0, 1] = y[0, labels[1]]

for t in range(1, T):
for i in range(L):
s = labels[i]

a = alpha[t - 1, i]
if i - 1 >= 0:
a += alpha[t - 1, i - 1]
if i - 2 >= 0 and s != 0 and s != labels[i - 2]:
a += alpha[t - 1, i - 2]

alpha[t, i] = a * y[t, s]

return alpha

labels = [0, 3, 0, 3, 0, 4, 0]  # 0 for blank
alpha = forward(y, labels)
print(alpha)

p = alpha[-1, labels[-1]] + alpha[-1, labels[-2]]
print(p)
6.81811271177e-06

## 1.5 后向计算

$\beta_t(s) \overset{\text{def}}{=} \sum_{\pi \in N^T:\ \mathcal{B}(\pi_{t:T}) = l'_{s:|l'|}} \prod_{t'=t}^{T} y_{\pi_{t'}}^{t'}$

$\beta_T(|l'|) = y_b^T, \qquad \beta_T(|l'|-1) = y_{l_{|l|}}^T, \qquad \beta_T(s) = 0,\ \forall s < |l'| - 1$

$\beta_t(s) = \begin{cases} \big(\beta_{t+1}(s) + \beta_{t+1}(s+1)\big)\, y_{l'_s}^{t}, & \text{if } l'_s = b \text{ or } l'_{s+2} = l'_s \\ \big(\beta_{t+1}(s) + \beta_{t+1}(s+1) + \beta_{t+1}(s+2)\big)\, y_{l'_s}^{t}, & \text{otherwise} \end{cases}$

def backward(y, labels):
T, V = y.shape
L = len(labels)
beta = np.zeros([T, L])

# init
beta[-1, -1] = y[-1, labels[-1]]
beta[-1, -2] = y[-1, labels[-2]]

for t in range(T - 2, -1, -1):
for i in range(L):
s = labels[i]

a = beta[t + 1, i]
if i + 1 < L:
a += beta[t + 1, i + 1]
if i + 2 < L and s != 0 and s != labels[i + 2]:
a += beta[t + 1, i + 2]

beta[t, i] = a * y[t, s]

return beta

beta = backward(y, labels)
print(beta)

## 1.6 梯度计算

$$\alpha_t(s)\beta_t(s) = \sum_{\pi \in \mathcal{B}^{-1}(l):\ \pi_t = l'_s} y_{l'_s}^{t} \prod_{t'=1}^{T} y_{\pi_{t'}}^{t'} = y_{l'_s}^{t} \cdot \sum_{\pi \in \mathcal{B}^{-1}(l):\ \pi_t = l'_s} \prod_{t'=1}^{T} y_{\pi_{t'}}^{t'}$$

$$\frac{\alpha_t(s)\beta_t(s)}{y_{l'_s}^{t}} = \sum_{\pi \in \mathcal{B}^{-1}(l):\ \pi_t = l'_s} \prod_{t'=1}^{T} y_{\pi_{t'}}^{t'} = \sum_{\pi \in \mathcal{B}^{-1}(l):\ \pi_t = l'_s} p(\pi|x)$$

$$p(l|x) = \sum_{s=1}^{|l'|} \sum_{\pi \in \mathcal{B}^{-1}(l):\ \pi_t = l'_s} p(\pi|x) = \sum_{s=1}^{|l'|} \frac{\alpha_t(s)\beta_t(s)}{y_{l'_s}^{t}}$$

对某一时刻 $t$，对 $y_k^t$ 求偏导（$\alpha_t(k)\beta_t(k)$ 中含有 $(y_k^t)^2$ 因子）：

$$\frac{\partial p(l|x)}{\partial y_k^t} = \frac{\alpha_t(k)\beta_t(k)}{(y_k^t)^2}$$

由于 $l$ 中可能包含多个 $k$ 字符，它们计算的梯度要进行累加，因此，最后的梯度计算结果为：

$$\frac{\partial p(l|x)}{\partial y_k^t} = \frac{1}{(y_k^t)^2} \sum_{s \in lab(l,k)} \alpha_t(s)\beta_t(s)$$

$$\frac{\partial \ln p(l|x)}{\partial y_k^t} = \frac{1}{p(l|x)} \frac{\partial p(l|x)}{\partial y_k^t}$$

$O\left(D,{N}_{w}\right)=-\sum _{\left(x,z\right)\in D}\mathrm{ln}\left(p\left(z|x\right)\right)$

T, V = y.shape
L = len(labels)

alpha = forward(y, labels)
beta = backward(y, labels)
p = alpha[-1, -1] + alpha[-1, -2]

for t in range(T):
for s in range(V):
lab = [i for i, c in enumerate(labels) if c == s]
for i in lab:
grad[t, s] += alpha[t, i] * beta[t, i]
grad[t, s] /= y[t, s] ** 2

def check_grad(y, labels, w=-1, v=-1, toleration=1e-3):

delta = 1e-10
original = y[w, v]

y[w, v] = original + delta
alpha = forward(y, labels)
log_p1 = np.log(alpha[-1, -1] + alpha[-1, -2])

y[w, v] = original - delta
alpha = forward(y, labels)
log_p2 = np.log(alpha[-1, -1] + alpha[-1, -2])

y[w, v] = original

grad_2 = (log_p1 - log_p2) / (2 * delta)

for toleration in [1e-5, 1e-6]:
print('%.e' % toleration)
for w in range(y.shape[0]):
for v in range(y.shape[1]):
1e-05
1e-06
[0, 3]：3.91e-06
[1, 0]：3.61e-06
[1, 3]：2.66e-06
[2, 0]：2.67e-06
[2, 3]：3.88e-06
[3, 0]：4.71e-06
[3, 3]：3.39e-06
[4, 0]：1.24e-06
[4, 3]：4.79e-06
[5, 0]：1.57e-06
[5, 3]：2.98e-06
[6, 0]：5.03e-06
[6, 3]：4.89e-06
[7, 0]：1.05e-06
[7, 4]：4.19e-06
[8, 4]：5.57e-06
[9, 0]：5.95e-06
[9, 3]：3.85e-06
[10, 0]：1.09e-06
[10, 3]：1.53e-06
[10, 4]：3.82e-06

## 1.7 logits 梯度

$$\frac{\partial \ln p(l|x)}{\partial y_k^t} = \frac{1}{(y_k^t)^2\, p(l|x)} \sum_{s \in lab(l,k)} \alpha_t(s)\beta_t(s)$$

对 softmax 求导（$u_k^t$ 为 logits）：

$$\frac{\partial \ln p(l|x)}{\partial u_k^t} = y_k^t \left( \frac{\partial \ln p(l|x)}{\partial y_k^t} - \sum_{j=1}^{V} \frac{\partial \ln p(l|x)}{\partial y_j^t}\, y_j^t \right)$$

$$\frac{\partial \ln p(l|x)}{\partial u_k^t} = \frac{1}{y_k^t\, p(l|x)} \sum_{s \in lab(l,k)} \alpha_t(s)\beta_t(s) - y_k^t$$

'''
'''

'''
'''
T, V = y.shape
L = len(labels)

alpha = forward(y, labels)
beta = backward(y, labels)
p = alpha[-1, -1] + alpha[-1, -2]

for t in range(T):
for s in range(V):
lab = [i for i, c in enumerate(labels) if c == s]
for i in lab:
u_grad[t, s] += alpha[t, i] * beta[t, i]
u_grad[t, s] /= y[t, s] * p

1.34961486431e-15

def check_grad_logits(x, labels, w=-1, v=-1, toleration=1e-3):

delta = 1e-10
original = x[w, v]

x[w, v] = original + delta
y = softmax(x)
alpha = forward(y, labels)
log_p1 = np.log(alpha[-1, -1] + alpha[-1, -2])

x[w, v] = original - delta
y = softmax(x)
alpha = forward(y, labels)
log_p2 = np.log(alpha[-1, -1] + alpha[-1, -2])

x[w, v] = original

grad_2 = (log_p1 - log_p2) / (2 * delta)

np.random.seed(1111)
x = np.random.random([10, 10])
for toleration in [1e-5, 1e-6]:
print('%.e' % toleration)
for w in range(x.shape[0]):
for v in range(x.shape[1]):

# 2. 数值稳定性

CTC 的训练过程面临数值下溢的风险，特别是序列较大的情况下。下面介绍两种数值上稳定的工程优化方法：1）log 域（许多 CRF 实现的常用方法）；2）scale 技巧（原始论文 [1] 使用的方法）。

## 2.1 log 域计算

log 计算涉及 logsumexp 操作。

ninf = -np.float('inf')

def _logsumexp(a, b):
'''
np.log(np.exp(a) + np.exp(b))

'''

if a < b:
a, b = b, a

if b == ninf:
return a
else:
return a + np.log(1 + np.exp(b - a))

def logsumexp(*args):
'''
from scipy.special import logsumexp
logsumexp(args)
'''
res = args[0]
for e in args[1:]:
res = _logsumexp(res, e)
return res

### 2.1.1 log 域前向算法

def forward_log(log_y, labels):
T, V = log_y.shape
L = len(labels)
log_alpha = np.ones([T, L]) * ninf

# init
log_alpha[0, 0] = log_y[0, labels[0]]
log_alpha[0, 1] = log_y[0, labels[1]]

for t in range(1, T):
for i in range(L):
s = labels[i]

a = log_alpha[t - 1, i]
if i - 1 >= 0:
a = logsumexp(a, log_alpha[t - 1, i - 1])
if i - 2 >= 0 and s != 0 and s != labels[i - 2]:
a = logsumexp(a, log_alpha[t - 1, i - 2])

log_alpha[t, i] = a + log_y[t, s]

return log_alpha

log_alpha = forward_log(np.log(y), labels)
alpha = forward(y, labels)
print(np.sum(np.abs(np.exp(log_alpha) - alpha)))
8.60881935942e-17

### 2.1.2 log 域后向算法

def backward_log(log_y, labels):
T, V = log_y.shape
L = len(labels)
log_beta = np.ones([T, L]) * ninf

# init
log_beta[-1, -1] = log_y[-1, labels[-1]]
log_beta[-1, -2] = log_y[-1, labels[-2]]

for t in range(T - 2, -1, -1):
for i in range(L):
s = labels[i]

a = log_beta[t + 1, i]
if i + 1 < L:
a = logsumexp(a, log_beta[t + 1, i + 1])
if i + 2 < L and s != 0 and s != labels[i + 2]:
a = logsumexp(a, log_beta[t + 1, i + 2])

log_beta[t, i] = a + log_y[t, s]

return log_beta

log_beta = backward_log(np.log(y), labels)
beta = backward(y, labels)
print(np.sum(np.abs(np.exp(log_beta) - beta)))
1.10399945005e-16

### 2.1.3 log 域梯度计算

T, V = log_y.shape
L = len(labels)

log_alpha = forward_log(log_y, labels)
log_beta = backward_log(log_y, labels)
log_p = logsumexp(log_alpha[-1, -1], log_alpha[-1, -2])

log_grad = np.ones([T, V]) * ninf
for t in range(T):
for s in range(V):
lab = [i for i, c in enumerate(labels) if c == s]
for i in lab:
log_grad[t, s] -= 2 * log_y[t, s]

4.97588081849e-14

## 2.2 scale

### 2.2.1 前向算法

$C_t \overset{\text{def}}{=} \sum_s \alpha_t(s)$

$\hat{\alpha}_t(s) = \dfrac{\alpha_t(s)}{C_t}$

def forward_scale(y, labels):
T, V = y.shape
L = len(labels)
alpha_scale = np.zeros([T, L])

# init
alpha_scale[0, 0] = y[0, labels[0]]
alpha_scale[0, 1] = y[0, labels[1]]
Cs = []

C = np.sum(alpha_scale[0])
alpha_scale[0] /= C
Cs.append(C)

for t in range(1, T):
for i in range(L):
s = labels[i]

a = alpha_scale[t - 1, i]
if i - 1 >= 0:
a += alpha_scale[t - 1, i - 1]
if i - 2 >= 0 and s != 0 and s != labels[i - 2]:
a += alpha_scale[t - 1, i - 2]

alpha_scale[t, i] = a * y[t, s]

C = np.sum(alpha_scale[t])
alpha_scale[t] /= C
Cs.append(C)

return alpha_scale, Cs

$$p(l|x) = \alpha_T(|l'|) + \alpha_T(|l'|-1) = \big(\hat{\alpha}_T(|l'|) + \hat{\alpha}_T(|l'|-1)\big) \cdot \prod_{t=1}^{T} C_t$$

$$\ln p(l|x) = \sum_{t=1}^{T} \ln C_t + \ln\big(\hat{\alpha}_T(|l'|) + \hat{\alpha}_T(|l'|-1)\big)$$

labels = [0, 1, 2, 0]  # 0 for blank

alpha_scale, Cs = forward_scale(y, labels)
log_p = np.sum(np.log(Cs)) + np.log(alpha_scale[-1][labels[-1]] + alpha_scale[-1][labels[-2]])

alpha = forward(y, labels)
p = alpha[-1, labels[-1]] + alpha[-1, labels[-2]]

print(np.log(p), log_p, np.log(p) - log_p)
(-13.202925982240107, -13.202925982240107, 0.0)

### 2.2.2 后向算法

$D_t \overset{\text{def}}{=} \sum_s \beta_t(s)$

$\hat{\beta}_t(s) = \dfrac{\beta_t(s)}{D_t}$

def backward_scale(y, labels):
T, V = y.shape
L = len(labels)
beta_scale = np.zeros([T, L])

# init
beta_scale[-1, -1] = y[-1, labels[-1]]
beta_scale[-1, -2] = y[-1, labels[-2]]

Ds = []

D = np.sum(beta_scale[-1,:])
beta_scale[-1] /= D
Ds.append(D)

for t in range(T - 2, -1, -1):
for i in range(L):
s = labels[i]

a = beta_scale[t + 1, i]
if i + 1 < L:
a += beta_scale[t + 1, i + 1]
if i + 2 < L and s != 0 and s != labels[i + 2]:
a += beta_scale[t + 1, i + 2]

beta_scale[t, i] = a * y[t, s]

D = np.sum(beta_scale[t])
beta_scale[t] /= D
Ds.append(D)

return beta_scale, Ds[::-1]

beta_scale, Ds = backward_scale(y, labels)
print(beta_scale)

### 2.2.3 梯度计算

$$\frac{\partial \ln p(l|x)}{\partial y_k^t} = \frac{1}{p(l|x)} \frac{\partial p(l|x)}{\partial y_k^t} = \frac{1}{p(l|x)}\, \frac{1}{(y_k^t)^2} \sum_{s \in lab(l,k)} \alpha_t(s)\beta_t(s)$$

$$p(l|x) = \sum_{s=1}^{|l'|} \frac{\alpha_t(s)\beta_t(s)}{y_{l'_s}^{t}}$$

$$\alpha_t(s) = \hat{\alpha}_t(s) \cdot \prod_{k=1}^{t} C_k, \qquad \beta_t(s) = \hat{\beta}_t(s) \cdot \prod_{k=t}^{T} D_k$$

代入后公共的缩放因子约去，得到：

$$\frac{\partial \ln p(l|x)}{\partial y_k^t} = \frac{1}{\sum_{s=1}^{|l'|} \frac{\hat{\alpha}_t(s)\hat{\beta}_t(s)}{y_{l'_s}^{t}}}\, \frac{1}{(y_k^t)^2} \sum_{s \in lab(l,k)} \hat{\alpha}_t(s)\hat{\beta}_t(s)$$

T, V = y.shape
L = len(labels)

alpha_scale, _ = forward_scale(y, labels)
beta_scale, _ = backward_scale(y, labels)

for t in range(T):
for s in range(V):
lab = [i for i, c in enumerate(labels) if c == s]
for i in lab:
grad[t, s] += alpha_scale[t, i] * beta_scale[t, i]
grad[t, s] /= y[t, s] ** 2

# normalize factor
z = 0
for i in range(L):
z += alpha_scale[t, i] * beta_scale[t, i] / y[t, labels[i]]

labels = [0, 3, 0, 3, 0, 4, 0]  # 0 for blank
6.86256607096e-15

### 2.2.4 logits 梯度

$$\frac{\partial \ln p(l|x)}{\partial u_k^t} = \frac{1}{y_k^t\, Z_t} \sum_{s \in lab(l,k)} \hat{\alpha}_t(s)\hat{\beta}_t(s) - y_k^t$$

$$Z_t \overset{\text{def}}{=} \sum_{s=1}^{|l'|} \frac{\hat{\alpha}_t(s)\hat{\beta}_t(s)}{y_{l'_s}^{t}}$$

# 3. 解码

$$h(x) = \arg\max_{l \in L^{\le T}} p(l|x)$$

$$p(\pi|x) = \prod_{k=1}^{T} p(\pi_k|x)$$

贪心（best path）解码用单条最优路径近似：

$$h(x) \approx \mathcal{B}(\pi^\star), \qquad \pi^\star = \arg\max_{\pi \in N^T} p(\pi|x)$$

$$\pi^\star = \mathop{cat}_{t=1}^{T}\big(\arg\max_{s \in L'} y_s^t\big)$$

def remove_blank(labels, blank=0):
new_labels = []

# combine duplicate
previous = None
for l in labels:
if l != previous:
new_labels.append(l)
previous = l

# remove blank
new_labels = [l for l in new_labels if l != blank]

return new_labels

def insert_blank(labels, blank=0):
new_labels = [blank]
for l in labels:
new_labels += [l, blank]
return new_labels

def greedy_decode(y, blank=0):
raw_rs = np.argmax(y, axis=1)
rs = remove_blank(raw_rs, blank)
return raw_rs, rs

np.random.seed(1111)
y = softmax(np.random.random([20, 6]))
rr, rs = greedy_decode(y)
print(rr)
print(rs)
[1 3 5 5 5 5 1 5 3 4 4 3 0 4 5 0 3 1 3 3]
[1, 3, 5, 1, 5, 3, 4, 3, 4, 5, 3, 1, 3]

def beam_decode(y, beam_size=10):
T, V = y.shape
log_y = np.log(y)

beam = [([], 0)]
for t in range(T):  # for every timestep
new_beam = []
for prefix, score in beam:
for i in range(V):  # for every state
new_prefix = prefix + [i]
new_score = score + log_y[t, i]

new_beam.append((new_prefix, new_score))

# top beam_size
new_beam.sort(key=lambda x: x[1], reverse=True)
beam = new_beam[:beam_size]

return beam

np.random.seed(1111)
y = softmax(np.random.random([20, 6]))
beam = beam_decode(y, beam_size=100)
for string, score in beam[:20]:
print(remove_blank(string), score)

from collections import defaultdict

def prefix_beam_decode(y, beam_size=10, blank=0):
T, V = y.shape
log_y = np.log(y)

beam = [(tuple(), (0, ninf))]  # blank, non-blank
for t in range(T):  # for every timestep
new_beam = defaultdict(lambda : (ninf, ninf))

for prefix, (p_b, p_nb) in beam:
for i in range(V):  # for every state
p = log_y[t, i]

if i == blank:  # propose a blank
new_p_b, new_p_nb = new_beam[prefix]
new_p_b = logsumexp(new_p_b, p_b + p, p_nb + p)
new_beam[prefix] = (new_p_b, new_p_nb)
continue
else:  # extend with non-blank
end_t = prefix[-1] if prefix else None

# exntend current prefix
new_prefix = prefix + (i,)
new_p_b, new_p_nb = new_beam[new_prefix]
if i != end_t:
new_p_nb = logsumexp(new_p_nb, p_b + p, p_nb + p)
else:
new_p_nb = logsumexp(new_p_nb, p_b + p)
new_beam[new_prefix] = (new_p_b, new_p_nb)

# keep current prefix
if i == end_t:
new_p_b, new_p_nb = new_beam[prefix]
new_p_nb = logsumexp(new_p_nb, p_nb + p)
new_beam[prefix] = (new_p_b, new_p_nb)

# top beam_size
beam = sorted(new_beam.items(), key=lambda x : logsumexp(*x[1]), reverse=True)
beam = beam[:beam_size]

return beam

np.random.seed(1111)
y = softmax(np.random.random([20, 6]))
beam = prefix_beam_decode(y, beam_size=100)
for string, score in beam[:20]:
print(remove_blank(string), score)

# 4. 工具

warp-ctc 是百度开源的基于 CPU 和 GPU 的高效并行实现。warp-ctc 自身提供 C 语言接口，对于流行的机器学习框架（Torch、PyTorch、TensorFlow、Chainer）都有相应的接口绑定。

cudnn 7 以后开始提供 CTC 支持。

Tensorflow 也原生支持 CTC loss，及 greedy 和 beam search 解码器。

# 小结

1. CTC 可以建模无对齐信息的多对多序列问题（输入长度不小于输出），如语音识别、连续字符识别 [3,4]。
2. CTC 不需要输入与输出的对齐信息，可以实现端到端的训练。
3. CTC 在 loss 的计算上，利用了整个 labels 序列的全局信息，某种意义上相对逐帧计算损失的方法，”更加区分性”。

# References

05-03 2万+
01-18 5020
01-30 7113
04-03 4462
04-16 583
01-13 2523
06-26 1万+
12-25 9324
03-28 165
02-20 305
12-26 868
12-22 1129
04-14 4559
10-07 197
12-19 5038
11-02 1122
08-08 5043
11-20 1万+
02-20 1万+

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

MoussaTintin

¥2 ¥4 ¥6 ¥10 ¥20

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、C币套餐、付费专栏及课程。