def init_wfst(tokens,grammar):
numtokens=len(tokens)
wwfst=[[None for i in range(numtokens+1)] for j in range(numtokens+1)]
for i in range(numtokens):
productions=grammar.productions(rhs=tokens[i])
wfst[i][i+1]=productions[0].lhs()
return wfst
def comoplete_wfst(wfst,tokens,grammar,trace=False):
index=dict((p.rhs(),p.lhs() )for p in grammar.productions())
numtokens=len(tokens)
for span in range(2,numtokens+1):
for start in range(numtokens+1-span):
end=start+span
for mid in range(start+1,end):
nt1,nt2=wfst[start][mid],wfst[mid][end]
if nt1 and nt2 and (nt1,nt2) in index:
wfst[start][end]=index[(nt1,nt2)]
if trace:
print "[%s] %3s [%s] %3s [%s]==> [%s] %3s [%s]" %\
(start,nt1,nt2,end start,index[(nt1,nt2)] end)
return wfst
def display(wfst,tokens):
print '\n WFST'+' '.join([("%-4d"% i) for i in range(1,len(wfst))])
for i in range(len(wfst)-1):
print "%d " %i,
for j in range(1,len(wffst)):
print "%-4s" % (wfst[i][j] or '.')
print
tokens="I shoot an elephant in my pajamas".split()
wfst0=init_wfst(tokens,groucho_grammar)
display(wfst0,tokens)
def filter(tree):
child_nodes=[child.node for child in tree
if isinstance(child,nltk.Tree)]
return (tree.node=='VP') and ('S' in child_nodes)
from nltk.corpus import treebank
[subtree for tree in treebank.parsed_sents()
for subtree in tree.subtrees(fileter)]