# perl - 正向最大匹配,转自 SIGHAN 提供的 FMM 程序

#!/usr/bin/perl -w
# GBK编码,参数一 词典  参数二 待分文本

#转自Sighan 提供的FMM程序 ,不是原创

#所以把人家的声明都放在下面了

###########################################################################
#                                                                         #
#                               SIGHAN                                    #
#                         Copyright (c) 2003                              #
#                        All Rights Reserved.                             #
#                                                                         #
#  Permission is hereby granted, free of charge, to use and distribute    #
#  this software and its documentation without restriction, including     #
#  without limitation the rights to use, copy, modify, merge, publish,    #
#  distribute, sublicense, and/or sell copies of this work, and to        #
#  permit persons to whom this work is furnished to do so, subject to     #
#  the following conditions:                                              #
#   1. The code must retain the above copyright notice, this list of      #
#      conditions and the following disclaimer.                           #
#   2. Any modifications must be clearly marked as such.                  #
#   3. Original authors' names are not deleted.                           #
#   4. The authors' names are not used to endorse or promote products     #
#      derived from this software without specific prior written          #
#      permission.                                                        #
#                                                                         #
#  SIGHAN AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES       #
#  WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF      #
#  MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SIGHAN NOR THE          #
#  CONTRIBUTORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL      #
#  DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA     #
#  OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER      #
#  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR       #
#  PERFORMANCE OF THIS SOFTWARE.                                          #
#                                                                         #
###########################################################################
#                                                                         #
# Author: Richard Sproat (rws@research.att.com)                           #
#                                                                         #
###########################################################################


# Usage text. The first argument is the dictionary file; any remaining
# arguments are text files to segment (standard input is read when none are
# given, because the main loop reads through the <> operator).
$USAGE = "Usage:\t$0 dictionary [text-file ...]\n";


# A missing dictionary argument is an error: report it on STDERR and exit
# non-zero so the message never ends up mixed into the segmented output and
# callers can detect the failure.
if (@ARGV < 1) {
    print STDERR $USAGE;
    exit(1);
}


# Load the dictionary named by the first command-line argument, one word per
# line.
#
# Fills two package globals that the rest of the script reads:
#   %dict    - set of dictionary words (word => 1)
#   $maxwlen - length() of the longest word; the handle has no decoding
#              layer, so this is normally a byte count
%dict    = ();
$maxwlen = 0;


# Three-argument open with a lexical handle: the file name is always taken
# literally, never as a mode or pipe specification.
open(my $dict_fh, '<', $ARGV[0]) or die "$ARGV[0]: $!\n";
while (my $word = <$dict_fh>) {
    # chomp removes only a line terminator, so a final entry without a
    # trailing newline keeps its last byte (chop would cut it off).
    chomp $word;
    # Tolerate CRLF dictionaries; 0x0D is never part of a GBK character.
    $word =~ s/\r\z//;
    next if $word eq "";

    $dict{$word} = 1;
    my $word_len = length($word);
    $maxwlen = $word_len if $word_len > $maxwlen;
}
close($dict_fh);


# Split one whitespace-free line into tokens by forward maximum matching.
#
# Parameters:
#   $text     - the line to segment; handled as a mix of single positions in
#               \x21-\x7e and two-position characters (the script header
#               says the input is GBK)
#   $dict_ref - hash reference whose true-valued keys are the dictionary words
#   $max_len  - length of the longest dictionary word, i.e. the longest
#               prefix that is ever looked up
#   $line_no  - zero-based input line number, used only in the diagnostic
#
# Returns the tokens in order; concatenated they reproduce $text.
sub _fmm_tokens {
    my ($text, $dict_ref, $max_len, $line_no) = @_;
    my @tokens;

    while ($text ne "") {
        my $take = 0;

        # Look up the longest candidate prefix first and shorten it by one
        # position at a time until a dictionary word is found.
        my $longest = length($text) < $max_len ? length($text) : $max_len;
        for (my $len = $longest; $len > 0; --$len) {
            if ($dict_ref->{ substr($text, 0, $len) }) {
                $take = $len;
                last;
            }
        }

        if (!$take) {
            # No dictionary word starts here: emit a single character.
            if ($text =~ /^[\x21-\x7e]/) {
                $take = 1;    # printable ASCII
            }
            elsif ($text =~ /^[^\x21-\x7e]./) {
                $take = 2;    # anything else is taken as a two-byte character
            }
            else {
                # Only a lone trailing position outside \x21-\x7e gets here
                # (for example a truncated two-byte character). Emit that
                # position itself: a failed match does not set $1, so
                # printing $1 here would output a stale capture instead.
                print STDERR "Oops: shouldn't be here: $line_no\n";
                $take = 1;
            }
        }

        # Four-argument substr removes the token from the front of $text and
        # returns it in one step.
        push @tokens, substr($text, 0, $take, "");
    }

    return @tokens;
}


# Drop the dictionary path so that <> reads only the text inputs (or STDIN).
shift @ARGV;


# Segment every input line; each token is written followed by "/".
# %dict and $maxwlen are the package globals filled by the dictionary loader.
my $line_no = 0;
while (my $line = <>) {
    # Deleting all whitespace also deletes the line terminator, so no chop is
    # needed (chop would eat a real character on a last line that has no
    # trailing newline).
    $line =~ s/\s+//g;

    my @tokens = _fmm_tokens($line, \%dict, $maxwlen, $line_no);
    print "$_/" for @tokens;
    print "\n";
    ++$line_no;
}


exit(0);
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值