#!/usr/bin/env python
# -*- coding: utf-8 -*-
# langconv.py
"""Text conversion driven by the ``zh2Hant`` / ``zh2Hans`` word tables.

The tables are imported from ``zh_wiki`` (or ``zhtools.zh_wiki`` as a
fallback).  On Python 2 their keys and values are decoded from UTF-8 byte
strings to ``unicode`` so the rest of the module only handles text.
"""

from copy import deepcopy
import re

try:
    # psyco is an optional accelerator; running without it is fine.
    import psyco
    psyco.full()
except Exception:
    # Best effort only: swallow import/initialisation failures, but do not
    # hide KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
    pass

try:
    from zh_wiki import zh2Hant, zh2Hans
except ImportError:
    from zhtools.zh_wiki import zh2Hant, zh2Hans

import sys

py3k = sys.version_info >= (3, 0, 0)

if py3k:
    UEMPTY = ''
else:
    # Python 2: rebuild both tables with unicode keys and values.
    _zh2Hant, _zh2Hans = {}, {}
    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
        for k, v in old.items():
            new[k.decode('utf8')] = v.decode('utf8')
    zh2Hant = _zh2Hant
    zh2Hans = _zh2Hans
    UEMPTY = ''.decode('utf8')

# states
(START, END, FAIL, WAIT_TAIL) = list(range(4))
# conditions
(TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))
# Registry of conversion maps, keyed by name.
MAPS = {}


class Node(object):
    """Lookup result for one candidate word in a conversion map.

    Attributes:
        from_word: the word that was looked up.
        to_word: the replacement text; equals ``from_word`` when no
            replacement was supplied or the supplied one is empty.
        is_original: True when the node was built without a ``to_word``.
        is_tail: flag passed through from the map entry.
        have_child: flag passed through from the map entry.
        data: ``(is_tail, have_child, word)`` where ``word`` is the raw
            ``to_word`` argument, or ``from_word`` when it was ``None``.
    """

    def __init__(self, from_word, to_word=None, is_tail=True,
                 have_child=False):
        self.from_word = from_word
        if to_word is None:
            self.to_word = from_word
            self.data = (is_tail, have_child, from_word)
            self.is_original = True
        else:
            # An empty replacement falls back to the source word, but
            # ``data`` keeps the raw value that was passed in.
            self.to_word = to_word or from_word
            self.data = (is_tail, have_child, to_word)
            self.is_original = False
        self.is_tail = is_tail
        self.have_child = have_child

    def is_original_long_word(self):
        """Return True for an original node longer than one character."""
        return self.is_original and len(self.from_word) > 1

    def is_follow(self, chars):
        """Return True when ``chars`` differs from ``from_word`` minus its
        last character.

        NOTE(review): despite the name, a True result means ``chars`` is
        NOT the prefix of this word; the visible caller treats True as a
        failure.
        """
        return chars != self.from_word[:-1]

    def __str__(self):
        # The format string must have one placeholder per tuple element;
        # an empty format string raises TypeError.
        return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
                                           repr(self.to_word),
                                           self.is_tail, self.have_child)

    __repr__ = __str__
class ConvertMap(object):
    """Prefix-aware lookup table built from a ``{word: replacement}`` dict.

    Every key of the mapping and every proper prefix of every key gets an
    entry ``(is_tail, have_child, to_word)``:

    * ``is_tail`` is True when the entry is itself a key of the mapping,
    * ``have_child`` is True when the entry is a proper prefix of a key,
    * ``to_word`` is the mapped replacement, or ``UEMPTY`` for entries
      that are only prefixes.
    """

    def __init__(self, name, mapping=None):
        self.name = name
        self._map = {}
        # Defined up front so the attribute exists even without a mapping.
        self.max_key_length = 0
        if mapping:
            self.set_convert_map(mapping)

    def set_convert_map(self, mapping):
        """Rebuild the table from ``mapping`` and record its longest key."""
        have_child = {}
        max_key_length = 0
        # Sorted order matters: a key sorts before any longer key it is a
        # prefix of, so the ``False`` written for the short key is later
        # overwritten by ``True`` when the longer key is processed.
        for key in sorted(mapping.keys()):
            for i in range(1, len(key)):
                have_child[key[:i]] = True
            have_child[key] = False
            max_key_length = max(max_key_length, len(key))

        convert_map = {}
        for key in sorted(have_child.keys()):
            convert_map[key] = (key in mapping, have_child[key],
                                mapping.get(key, UEMPTY))
        self._map = convert_map
        self.max_key_length = max_key_length

    def __getitem__(self, k):
        """Return a ``Node`` for ``k``; unknown words map to themselves."""
        try:
            is_tail, have_child, to_word = self._map[k]
        except KeyError:
            return Node(k)
        return Node(k, to_word, is_tail, have_child)

    def __contains__(self, k):
        return k in self._map

    def __len__(self):
        return len(self._map)


class StatesMachineException(Exception):
    """Raised when a state machine is fed while in the FAIL state."""
    pass
class StatesMachine(object):
    """One candidate conversion path through the input.

    Attributes:
        state: one of START, END, FAIL, WAIT_TAIL.
        final: text this machine has converted so far.
        len: number of words this machine has emitted.
        pool: characters consumed but not yet emitted.
    """

    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        """Return a deep copy in WAIT_TAIL state holding ``pool``."""
        new = deepcopy(self)
        new.state = WAIT_TAIL
        new.pool = pool
        return new

    @staticmethod
    def _classify(node):
        """Map a node's ``have_child``/``is_tail``/``is_original`` flags to
        a condition code."""
        if node.have_child:
            if not node.is_tail:
                return CONNECTOR
            if node.is_original:
                return UNMATCHED_SWITCH
            return MATCHED_SWITCH
        if node.is_tail:
            return TAIL
        return ERROR

    def feed(self, char, map):
        """Advance the machine by one character.

        Args:
            char: the next input character.
            map: object indexable by a word and returning a ``Node``
                (the parameter name shadows the builtin; kept so existing
                callers are unaffected).

        Returns:
            A newly cloned machine when this step forks a branch,
            otherwise ``None``.

        Raises:
            StatesMachineException: if the machine is already in FAIL and
                the character does not resolve to a TAIL or ERROR node.
        """
        node = map[self.pool + char]
        cond = self._classify(node)

        new = None
        if cond == ERROR:
            self.state = FAIL
        elif cond == TAIL:
            if self.state == WAIT_TAIL and node.is_original_long_word():
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
        elif self.state == START or self.state == WAIT_TAIL:
            if cond == MATCHED_SWITCH:
                new = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
                if self.state == START:
                    new = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                else:
                    if node.is_follow(self.pool):
                        self.state = FAIL
                    else:
                        self.pool = node.from_word
        elif self.state == END:
            # END is a new START
            self.state = START
            new = self.feed(char, map)
        elif self.state == FAIL:
            raise StatesMachineException(
                'Translate States Machine '
                'have error with input data %s' % node)
        return new

    def __len__(self):
        return self.len + 1

    def __str__(self):
        # One placeholder per tuple element; an empty format string
        # raises TypeError.
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
            id(self), self.pool, self.state, self.final)

    __repr__ = __str__
class Converter(object):
    """Convert text with the map registered under ``to_encoding``.

    Several ``StatesMachine`` branches run in parallel; once every live
    branch has reached END, the branch with the smallest ``len()`` is
    committed to ``final`` and the branch set is reset.
    """

    def __init__(self, to_encoding):
        """Look up ``to_encoding`` in ``MAPS`` (KeyError if unregistered)."""
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        """Feed one character to every branch; return the text so far."""
        branches = []
        for fsm in self.machines:
            new = fsm.feed(char, self.map)
            if new is not None:
                branches.append(new)
        self.machines.extend(branches)
        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
        if all(fsm.state == END for fsm in self.machines):
            self._clean()
        return self.get_result()

    def _clean(self):
        """Commit the branch with the smallest ``len()`` and reset."""
        if self.machines:
            # min() returns the first minimal element, matching a stable
            # sort followed by taking index 0.
            self.final += min(self.machines, key=len).final
        self.machines = [StatesMachine()]

    def start(self):
        """Reset to a single fresh branch and empty output."""
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        """Drop branches that are neither FAIL nor END, then commit."""
        self.machines = [fsm for fsm in self.machines
                         if fsm.state == FAIL or fsm.state == END]
        self._clean()

    def convert(self, string):
        """Convert ``string`` from scratch and return the result."""
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        """Return the text committed so far."""
        return self.final


def registery(name, mapping):
    """Build a ``ConvertMap`` from ``mapping`` and store it in ``MAPS``
    under ``name``."""
    MAPS[name] = ConvertMap(name, mapping)
registery('zh-hant', zh2Hant)
registery('zh-hans', zh2Hans)
del zh2Hant, zh2Hans


def run():
    """Command-line entry point.

    Options:
        -e  name of the registered conversion map (required)
        -f  input file, ``-`` or omitted for stdin
        -t  output file, ``-`` or omitted for stdout

    Input is read as UTF-8 and output is written as UTF-8.
    """
    import sys
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option('-e', type='string', dest='encoding',
                      help='encoding')
    parser.add_option('-f', type='string', dest='file_in',
                      help='input file (- for stdin)')
    parser.add_option('-t', type='string', dest='file_out',
                      help='output file')
    options, _args = parser.parse_args()
    if not options.encoding:
        parser.error('encoding must be set')

    # Work on bytes at the I/O boundary so the same decode/encode code
    # runs on Python 2 (byte-oriented std streams) and on Python 3 (where
    # the byte streams live on ``.buffer`` and ``str`` has no ``decode``).
    std_in = getattr(sys.stdin, 'buffer', sys.stdin)
    std_out = getattr(sys.stdout, 'buffer', sys.stdout)
    use_stdin = not options.file_in or options.file_in == '-'
    use_stdout = not options.file_out or options.file_out == '-'

    file_in = std_in if use_stdin else open(options.file_in, 'rb')
    try:
        file_out = std_out if use_stdout else open(options.file_out, 'wb')
        try:
            c = Converter(options.encoding)
            for raw_line in file_in:
                text = raw_line.decode('utf8')
                body = text.rstrip('\n')
                # Put back the newline(s) stripped before conversion so
                # the output keeps the input's line structure.
                newline = text[len(body):]
                file_out.write((c.convert(body) + newline).encode('utf8'))
        finally:
            if not use_stdout:
                file_out.close()
    finally:
        if not use_stdin:
            file_in.close()


if __name__ == '__main__':
    run()