BLEU
BLEU (bilingual evaluation understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another. This metric uses a modified form of precision to compare a candidate translation against multiple reference translations.
BLEU 用来评测由机器翻译转换成另一种语言的文本的质量,用一种修改后的precision将一种候选的翻译结果和多种参考进行比较
BLEU metric ranges from 0 to 1 and only few generated responses will obtain a score of 1 unless they are exactly the same as the original gold response.
The original BLEU metric first computes the n-gram matches sentence by sentence from 1 to 4, adds the clipped n-gram counts for all the candidates sentence, and divides by the number of candidate n-grams in the original dataset (Reference) to get the modified precision score. The calculation method shows as following:
$$p_{n}=\frac{\sum_{C \in\{\text{Candidates}\}} \sum_{n\text{-gram} \in C} \text{Count}_{\text{clip}}(n\text{-gram})}{\sum_{C^{\prime} \in\{\text{Candidates}\}} \sum_{n\text{-gram}^{\prime} \in C^{\prime}} \text{Count}(n\text{-gram}^{\prime})}$$
where
$$\text{Count}_{\text{clip}}=\min(\text{Count}, \text{Max\_Ref\_Count})$$
where $\text{Max\_Ref\_Count}$ is the largest count of the n-gram observed in any single reference. $\text{Count}_{\text{clip}}$ therefore ensures that the counted frequency of each n-gram never exceeds the maximum number of times it appears in a reference.
Let $c$ be the length of the candidate translation and $r$ be the effective reference corpus length. We compute the brevity penalty BP,
$$\mathrm{BP}=\begin{cases} 1 & \text{if } c>r \\ e^{(1-r/c)} & \text{if } c \leq r \end{cases}$$
$$\mathrm{BLEU}=\mathrm{BP} \cdot \exp\left(\sum_{n=1}^{N} w_{n} \log p_{n}\right)$$
Distinct
A typical sequence-to-sequence model tends to generate generic, bland responses (e.g., "That is good") without taking the input information into account. Distinct is an auxiliary metric that evaluates the textual diversity of the generated responses by counting the number of distinct n-grams: the larger the number of distinct n-grams, the higher the diversity of the text.
See details.
$$\mathrm{Distinct}=\frac{\text{Different n-gram}}{\text{Total n-gram}}$$
F1
Measure the overlap between generated sentences and golden one.
$$\mathrm{F1} = 2 \cdot \frac{\mathrm{Precision} \cdot \mathrm{Recall}}{\mathrm{Precision} + \mathrm{Recall}}$$
where
$$\mathrm{Recall} = \frac{\text{Number of overlapping characters between generated and golden responses}}{\text{Number of characters in golden responses}}$$
$$\mathrm{Precision} = \frac{\text{Number of overlapping characters between generated and golden responses}}{\text{Number of characters in generated responses}}$$
Code
待评测文档(test.result.eval)形式,左半边generated 右半边golden sentences,用\t 分隔符隔开
调用以下eval.py的cmd语句
python eval.py test.result.eval
eval.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import math
from collections import Counter
# Require the evaluation file path as the first CLI argument;
# bail out with a usage message otherwise.
if len(sys.argv) < 2:
    print("Usage: " + sys.argv[0] + " eval_file")
    print("eval file format: pred_response \t gold_response")
    exit()
def get_dict(tokens, ngram, gdict=None):
    """Count n-gram frequencies of `tokens` into a dict.

    Each n-gram key is the concatenation of `ngram` consecutive tokens.
    If `gdict` is given, counts are accumulated into it in place;
    otherwise a fresh dict is created. Returns the dict either way.
    """
    counts = gdict if gdict is not None else {}
    for start in range(len(tokens) - ngram + 1):
        key = "".join(tokens[start:start + ngram])
        counts[key] = counts.get(key, 0) + 1
    return counts
def count(pred_tokens, gold_tokens, ngram, result):
    """Accumulate clipped n-gram statistics for one sentence pair into `result`.

    result[0] += clipped match count (each predicted n-gram is credited at
    most as many times as it occurs in the gold response);
    result[1] += total number of predicted n-grams.
    Together these form the numerator/denominator of the modified
    precision p_n used by BLEU.

    Fix: the original unpacked `result` into `cover_count, total_count`
    locals that were never used — dead code, removed.
    """
    pred_dict = get_dict(pred_tokens, ngram)
    gold_dict = get_dict(gold_tokens, ngram)
    cur_cover_count = 0
    cur_total_count = 0
    for token, freq in pred_dict.items():
        gold_freq = gold_dict.get(token)
        if gold_freq is not None:
            # Clip: never credit more matches than the gold side contains.
            cur_cover_count += min(freq, gold_freq)
        cur_total_count += freq
    result[0] += cur_cover_count
    result[1] += cur_total_count
def calc_bp(pair_list):
    """Corpus-level brevity penalty.

    Returns 1 when the candidate corpus is at least as long as the
    reference corpus, exp(1 - r/c) otherwise.
    """
    cand_len = 0.0
    ref_len = 0.0
    for pred_tokens, gold_tokens in pair_list:
        cand_len += len(pred_tokens)
        ref_len += len(gold_tokens)
    if cand_len < ref_len:
        return math.exp(1 - ref_len / cand_len)
    return 1
def calc_cover_rate(pair_list, ngram):
    """Corpus-level modified n-gram precision p_n.

    Sums clipped matches and predicted n-gram totals over every sentence
    pair, then returns their ratio.
    """
    stats = [0.0, 0.0]  # [clipped match count, total predicted n-grams]
    for pred_tokens, gold_tokens in pair_list:
        count(pred_tokens, gold_tokens, ngram, stats)
    return stats[0] / stats[1]
def calc_bleu(pair_list):
    """Corpus BLEU-1 and BLEU-2 for a list of (pred_tokens, gold_tokens) pairs.

    BLEU-n = BP * exp(mean of log p_k for k = 1..n) with uniform weights.
    Returns [bleu1, bleu2]; a score is 0 when the corresponding precision is 0.

    Fix: the original also computed cover_rate3/bleu3 and then discarded
    them — pure dead work, and calc_cover_rate could even raise
    ZeroDivisionError on a corpus with no trigrams despite the value never
    being used. Removed.
    """
    bp = calc_bp(pair_list)
    cover_rate1 = calc_cover_rate(pair_list, 1)
    cover_rate2 = calc_cover_rate(pair_list, 2)
    bleu1 = 0
    bleu2 = 0
    if cover_rate1 > 0:
        bleu1 = bp * math.exp(math.log(cover_rate1))
    if cover_rate2 > 0:
        # cover_rate2 > 0 implies cover_rate1 > 0 (every matched bigram
        # contains matched unigrams), so both logs are defined.
        bleu2 = bp * math.exp((math.log(cover_rate1) + math.log(cover_rate2)) / 2)
    return [bleu1, bleu2]
def calc_distinct_ngram(pair_list, ngram):
    """Distinct-n over all predicted responses: the number of unique
    n-grams divided by the total number of n-grams."""
    freq_table = {}
    for predict_tokens, _ in pair_list:
        get_dict(predict_tokens, ngram, freq_table)
    total_ngrams = float(sum(freq_table.values()))
    return len(freq_table) / total_ngrams
def calc_distinct(pair_list):
    """Return [distinct-1, distinct-2] for the predicted responses."""
    return [calc_distinct_ngram(pair_list, n) for n in (1, 2)]
def calc_f1(data):
    """Character-level F1 between generated and golden responses.

    Each item in `data` is (response_tokens, golden_tokens); each side is
    joined without separators and the overlap is the multiset intersection
    of characters (Counter &).

    Fix: the original raised ZeroDivisionError when the corpus was empty
    or when there was no character overlap (p + r == 0); those cases now
    return 0.0. Also removed commented-out py2 `.decode("utf8")` leftovers.
    """
    golden_char_total = 0.0
    pred_char_total = 0.0
    hit_char_total = 0.0
    for response, golden_response in data:
        golden_response = "".join(golden_response)
        response = "".join(response)
        common = Counter(response) & Counter(golden_response)
        hit_char_total += sum(common.values())
        golden_char_total += len(golden_response)
        pred_char_total += len(response)
    if hit_char_total == 0 or pred_char_total == 0 or golden_char_total == 0:
        return 0.0
    p = hit_char_total / pred_char_total
    r = hit_char_total / golden_char_total
    return 2 * p * r / (p + r)
# Read the evaluation file given on the command line: one pair per line,
# "pred_response<TAB>gold_response", each side tokenized on single spaces.
#
# Fix: `eval_file` was the hard-coded placeholder string
# "your generated and golden response" instead of sys.argv[1], so the
# script could never open its input despite the argv guard and the
# documented CLI usage `python eval.py test.result.eval`.
eval_file = sys.argv[1]
sents = []
with open(eval_file) as fin:
    for line in fin:
        tk = line.strip().split("\t")
        if len(tk) < 2:
            # Skip malformed lines that lack both columns.
            continue
        pred_tokens = tk[0].strip().split(" ")
        gold_tokens = tk[1].strip().split(" ")
        sents.append([pred_tokens, gold_tokens])
# calc f1
f1 = calc_f1(sents)
# calc bleu
bleu1, bleu2 = calc_bleu(sents)
# calc distinct
distinct1, distinct2 = calc_distinct(sents)
# NOTE(review): BLEU/DISTINCT carry a '%' suffix but are not multiplied
# by 100 (unlike F1) — kept as-is to preserve the existing output format.
output_str = "F1: %.2f%%\n" % (f1 * 100)
output_str += "BLEU1: %.3f%%\n" % bleu1
output_str += "BLEU2: %.3f%%\n" % bleu2
output_str += "DISTINCT1: %.3f%%\n" % distinct1
output_str += "DISTINCT2: %.3f%%\n" % distinct2
sys.stdout.write(output_str)
以计算BLEU为例,调用函数顺序为 calc_bleu → calc_bp → calc_cover_rate → count → get_dict
BLEU1和BLEU2分别为只考虑了unigram和bigram的BLEU值