# conlleval_conlleval_rev.pl

#!/usr/bin/perl -w

# conlleval_rev.perl: evaluate result of processing SIGHAN NER task

# usage: conlleval_rev.perl [-l] [-r] [-d delimiterTag] [-o outsideTag] < file

# README: http://cnts.uia.ac.be/conll2000/chunking/output.html

# options: l: generate LaTeX output for tables like in

# http://cnts.uia.ac.be/conll2003/ner/example.tex

# r: accept raw result tags (without B- and I- prefix;

# assumes one character per chunk)

# d: alternative delimiter tag (default is single space)

# o: alternative outside tag in raw input (default is 0; only used with -r)

# note: the file should contain lines with items separated

# by $delimiter characters (default space). The final

# two items should contain the correct tag and the

# guessed tag in that order.

# url: http://lcg-www.uia.ac.be/conll2000/chunking/

# started: 1998-09-25

# version: 2006-04-26

# original author: Erik Tjong Kim Sang

# adapted by: Gina-Anne Levow

# modified to handle SIGHAN coding and non-phrase tags

use strict;
use warnings;

# Truth values returned by startOfChunk() and endOfChunk().
my $false = 0;
my $true  = 42;

# Note: this adaptation uses the digit "0" (zero), not the letter "O", as
# the internal "outside of any chunk" tag. All state below follows that.

my $boundary = "-X-";       # sentence boundary
my $correct;                # current corpus chunk tag (I,0,B)
my $correctChunk = 0;       # number of correctly identified chunks
my $correctTags = 0;        # number of correct chunk tags
my $correctType;            # type of current corpus chunk tag (NP,VP,etc.)
my $delimiter = " ";        # field delimiter
my $FB1 = 0.0;              # FB1 score (Van Rijsbergen 1979)
my $firstItem;              # first feature (for sentence boundary checks)
my $foundCorrect = 0;       # number of chunks in corpus
my $foundGuessed = 0;       # number of identified chunks
my $guessed;                # current guessed chunk tag
my $guessedType;            # type of current guessed chunk tag
my $i;                      # miscellaneous counter
my $inCorrect = $false;     # currently processed chunk is correct until now

# The "previous tag" state must start out as the outside tag "0": the
# chunk-boundary helpers compare against "0", so starting from the letter
# "O" would hide an untyped chunk that begins on the very first line.
my $lastCorrect = "0";      # previous chunk tag in corpus
my $latex = 0;              # generate LaTeX formatted output
my $lastCorrectType = "";   # type of previous chunk tag in corpus
my $lastGuessed = "0";      # previously identified chunk tag
my $lastGuessedType = "";   # type of previously identified chunk tag
my $lastType;               # temporary storage for detecting duplicates
my $line;                   # line
my $nbrOfFeatures = -1;     # number of features per line
my $precision = 0.0;        # precision score
my $oTag = "0";             # outside tag in raw input (-o), default 0
my $raw = 0;                # raw input: add B- to every non-outside tag
my $recall = 0.0;           # recall score
my $tokenCounter = 0;       # token counter (sentence breaks are counted too)
my %correctChunk = ();      # number of correctly identified chunks per type
my %foundCorrect = ();      # number of chunks in corpus per type
my %foundGuessed = ();      # number of identified chunks per type
my @features;               # features on line
my @sortedTypes;            # sorted list of chunk type names

# parse command line options:
#   -l          produce LaTeX output
#   -r          raw tags (no B-/I- prefix)
#   -d <delim>  field delimiter (used as a split pattern)
#   -o <tag>    outside tag found in raw input
# Any other argument is an error; input is always read from standard input.

while (@ARGV and $ARGV[0] =~ /^-/) {
    my $option = shift(@ARGV);

    if ($option eq "-l") {
        $latex = 1;
    }
    elsif ($option eq "-r") {
        $raw = 1;
    }
    elsif ($option eq "-d") {
        if (not defined $ARGV[0]) {
            die "conlleval: -d requires delimiter character\n";
        }
        $delimiter = shift(@ARGV);
    }
    elsif ($option eq "-o") {
        if (not defined $ARGV[0]) {
            die "conlleval: -o requires outside tag\n";
        }
        $oTag = shift(@ARGV);
    }
    else {
        die "conlleval: unknown argument $option\n";
    }
}

if (@ARGV) { die "conlleval: unexpected command line argument\n"; }

# process input: one token per line, read from standard input.
# Each line holds $delimiter-separated fields; the last two fields are the
# correct tag and the guessed tag, in that order.

while (defined($line = <STDIN>)) {
    chomp($line);
    @features = split(/$delimiter/, $line);

    # the first line fixes the expected field count; later non-empty
    # lines must match it
    if ($nbrOfFeatures < 0) {
        $nbrOfFeatures = $#features;
    }
    elsif ($nbrOfFeatures != $#features and @features != 0) {
        printf STDERR "unexpected number of features: %d (%d)\n",
            $#features+1, $nbrOfFeatures+1;
        exit(1);
    }

    # empty lines and explicit boundary lines are treated as one token
    # that is outside of any chunk in both corpus and guess
    if (@features == 0 or $features[0] eq $boundary) {
        @features = ($boundary, "0", "0");
    }
    if (@features < 2) {
        die "conlleval: unexpected number of features in line $line\n";
    }

    # raw mode: map the user's outside tag to "0" and turn every other
    # tag X into "B-X", for the guessed tag and the correct tag alike
    if ($raw) {
        for my $idx ($#features, $#features-1) {
            if ($features[$idx] eq $oTag) { $features[$idx] = "0"; }
            if ($features[$idx] ne "0") {
                $features[$idx] = "B-$features[$idx]";
            }
        }
    }

    # 20040126 ET code which allows hyphens in the types:
    # split on the first hyphen only, the rest is the type
    if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
        $guessed = $1;
        $guessedType = $2;
    } else {
        $guessed = $features[$#features];
        $guessedType = "";
    }
    pop(@features);

    if ($features[$#features] =~ /^([^-]*)-(.*)$/) {
        $correct = $1;
        $correctType = $2;
    } else {
        $correct = $features[$#features];
        $correctType = "";
    }
    pop(@features);

    # false-valued types (e.g. "0") are normalised to the empty string
    $guessedType = "" unless $guessedType;
    $correctType = "" unless $correctType;

    $firstItem = shift(@features);

    # chunk boundaries between the previous token and this one
    my $correctEnded   = endOfChunk($lastCorrect, $correct,
                                    $lastCorrectType, $correctType);
    my $guessedEnded   = endOfChunk($lastGuessed, $guessed,
                                    $lastGuessedType, $guessedType);
    my $correctStarted = startOfChunk($lastCorrect, $correct,
                                      $lastCorrectType, $correctType);
    my $guessedStarted = startOfChunk($lastGuessed, $guessed,
                                      $lastGuessedType, $guessedType);

    if ($inCorrect) {
        if ($correctEnded and $guessedEnded and
            $lastGuessedType eq $lastCorrectType) {
            # both chunks ended together with the same type: a hit
            $inCorrect = $false;
            $correctChunk++;
            $correctChunk{$lastCorrectType}++;
        }
        elsif ($correctEnded != $guessedEnded or
               $guessedType ne $correctType) {
            # only one side ended, or the types diverged: no longer a match
            $inCorrect = $false;
        }
    }

    if ($correctStarted and $guessedStarted and
        $guessedType eq $correctType) {
        $inCorrect = $true;
    }

    if ($correctStarted) {
        $foundCorrect++;
        $foundCorrect{$correctType}++;
    }
    if ($guessedStarted) {
        $foundGuessed++;
        $foundGuessed{$guessedType}++;
    }

    # token-level accuracy; sentence boundary lines are counted as tokens
    if ($correct eq $guessed and $guessedType eq $correctType) {
        $correctTags++;
    }
    $tokenCounter++;

    $lastGuessed = $guessed;
    $lastCorrect = $correct;
    $lastGuessedType = $guessedType;
    $lastCorrectType = $correctType;
}

# a chunk that is still open and matching at end of input counts as correct
if ($inCorrect) {
    $correctChunk++;
    $correctChunk{$lastCorrectType}++;
}

if (not $latex) {
    # compute overall precision, recall and FB1 (default values are 0.0)
    ($precision, $recall, $FB1) =
        computeScores($correctChunk, $foundGuessed, $foundCorrect);

    # print overall performance
    print "processed $tokenCounter tokens with $foundCorrect phrases; ";
    print "found: $foundGuessed phrases; correct: $correctChunk.\n";
    if ($tokenCounter > 0) {
        printf "accuracy: %6.2f%%; ", 100*$correctTags/$tokenCounter;
        printf "precision: %6.2f%%; ", $precision;
        printf "recall: %6.2f%%; ", $recall;
        printf "FB1: %6.2f\n", $FB1;
    }
}

# sort chunk type names, dropping duplicates. The test uses defined() so
# that the empty type "" (tags without a type) is listed once, not twice.
undef($lastType);
@sortedTypes = ();
foreach $i (sort (keys %foundCorrect, keys %foundGuessed)) {
    if (not defined($lastType) or $lastType ne $i) {
        push(@sortedTypes, $i);
    }
    $lastType = $i;
}

# print performance per chunk type
if (not $latex) {
    for $i (@sortedTypes) {
        $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
        # make sure the count printed below is a number
        $foundGuessed{$i} = 0 if not $foundGuessed{$i};
        ($precision, $recall, $FB1) =
            computeScores($correctChunk{$i}, $foundGuessed{$i},
                          $foundCorrect{$i});
        printf "%17s: ", $i;
        printf "precision: %6.2f%%; ", $precision;
        printf "recall: %6.2f%%; ", $recall;
        printf "FB1: %6.2f %d\n", $FB1, $foundGuessed{$i};
    }
} else {
    print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline";
    for $i (@sortedTypes) {
        $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0;
        ($precision, $recall, $FB1) =
            computeScores($correctChunk{$i}, $foundGuessed{$i},
                          $foundCorrect{$i});
        printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\",
            $i, $precision, $recall, $FB1;
    }
    print "\\hline\n";
    ($precision, $recall, $FB1) =
        computeScores($correctChunk, $foundGuessed, $foundCorrect);
    printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n",
        $precision, $recall, $FB1;
}

exit 0;

# computeScores: precision, recall and FB1 (all in percent) from counts
# arguments: number of correct chunks, number of guessed chunks, number of
#            chunks in the corpus; a missing or zero count gives a 0.0 score
# returns:   list (precision, recall, FB1)

sub computeScores {
    my ($nbrCorrect, $nbrGuessed, $nbrInCorpus) = @_;

    my $p = $nbrGuessed  ? 100*$nbrCorrect/$nbrGuessed  : 0.0;
    my $r = $nbrInCorpus ? 100*$nbrCorrect/$nbrInCorpus : 0.0;
    my $f = ($p+$r > 0)  ? 2*$p*$r/($p+$r)              : 0.0;
    return ($p, $r, $f);
}

# endOfChunk: tells whether a chunk was closed between the previous word
#             and the current word
# arguments:  previous chunk tag, current chunk tag, previous chunk type,
#             current chunk type
# returns:    $true if a chunk ended, $false otherwise
# note:       "0" (zero) is the outside tag; besides B/I tags the E, "[" and
#             "]" tags of other chunk representations are handled, see the
#             EACL'99 paper of Tjong Kim Sang and Veenstra
#             http://xxx.lanl.gov/abs/cs.CL/9907006

sub endOfChunk {
    my ($prevTag, $tag, $prevType, $type) = @_;

    # corrected 1998-12-22: bracket chunks are assumed to have length 1,
    # so a chunk always ends right after them
    return $true if $prevTag eq "[" or $prevTag eq "]";

    # the previous word was not outside and the type changed
    return $true
        if $prevTag ne "0" and $prevTag ne "." and $prevType ne $type;

    # tag transitions that close the chunk of the previous word
    if ($prevTag eq "B" or $prevTag eq "I") {
        return $true if $tag eq "B" or $tag eq "0";
    }
    elsif ($prevTag eq "E") {
        return $true if $tag eq "E" or $tag eq "I" or $tag eq "0";
    }

    return $false;
}

# startOfChunk: tells whether a chunk was opened between the previous word
#               and the current word
# arguments:    previous chunk tag, current chunk tag, previous chunk type,
#               current chunk type
# returns:      $true if a chunk started, $false otherwise
# note:         "0" (zero) is the outside tag; besides B/I tags the E, "["
#               and "]" tags of other chunk representations are handled, see
#               the EACL'99 paper of Tjong Kim Sang and Veenstra
#               http://xxx.lanl.gov/abs/cs.CL/9907006

sub startOfChunk {
    my ($prevTag, $tag, $prevType, $type) = @_;

    # corrected 1998-12-22: bracket chunks are assumed to have length 1,
    # so each of them starts a chunk of its own
    return $true if $tag eq "[" or $tag eq "]";

    # the current word is not outside and the type changed
    return $true
        if $tag ne "0" and $tag ne "." and $prevType ne $type;

    # tag transitions that open a chunk at the current word
    if ($tag eq "B") {
        return $true
            if $prevTag eq "B" or $prevTag eq "I" or $prevTag eq "0";
    }
    elsif ($tag eq "I" or $tag eq "E") {
        return $true if $prevTag eq "0" or $prevTag eq "E";
    }

    return $false;
}

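# NOTE: everything after the __END__ marker is web-page residue from the
# site this script was copied from; it is not part of the program.
__END__
一键复制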

编辑

Web IDE

原始数据

按行查看

历史

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值