def ie_preprocess(document):
    """Segment raw text into sentences, tokenize them, and POS-tag each token.

    Args:
        document: the raw text to process, as a single string.

    Returns:
        A list with one entry per sentence; each entry is the list of
        (word, tag) pairs returned by ``nltk.pos_tag`` for that sentence.
    """
    # Three-stage pipeline: sentence segmentation -> word tokenization ->
    # part-of-speech tagging. Each stage consumes the previous stage's output.
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
# 2. An example of a regular-expression-based NP chunker.
# Hand-tagged example sentence as (word, POS tag) pairs.
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
            ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),
            ("the", "DT"), ("cat", "NN")]
# Single-rule chunk grammar: an NP is an optional determiner, followed by
# any number of adjectives, followed by a noun.
grammar = "NP:{<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
# print(x) with one argument behaves the same as the Python 2 statement form.
print(result)
# Opens a window showing the chunk tree.
result.draw()
# Two-rule NP grammar (raw string so that \$ reaches the chunker unescaped):
#   rule 1: optional determiner or possessive pronoun, adjectives, then a noun
#   rule 2: one or more consecutive proper nouns
grammar = r"""
NP:{<DT|PP\$>?<JJ>*<NN>}
{<NNP>+}
"""
cp = nltk.RegexpParser(grammar)
# The possessive tag must be spelled "PP$" (upper case) to be matched by the
# <DT|PP\$> pattern above; tag patterns are case-sensitive.
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"),
            ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
print(cp.parse(sentence))
# Search the tagged Brown corpus for "verb TO verb" sequences and print each
# one that the chunker labels as CHUNK.
cp = nltk.RegexpParser('CHUNK:{<V.*><TO><V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        # Skip everything except the CHUNK subtrees.
        if subtree.node != 'CHUNK':
            continue
        print(subtree)
from nltk.corpus import conll2000
# Baseline evaluation: a parser built from an empty grammar is scored
# against the NP-chunked CoNLL-2000 test sentences.
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents(
    'test.txt',
    chunk_types=['NP'],
)
print(cp.evaluate(test_sents))
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that picks a chunk tag for each token from its POS tag alone.

    A unigram tagger is trained on (POS tag, chunk tag) pairs, so at parse
    time the "words" it tags are actually POS tags.
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        Args:
            train_sents: iterable of chunk trees used as training data.
        """
        # tree2conlltags yields (word, pos, chunk) triples; only the
        # (pos, chunk) part is kept because the tagger learns pos -> chunk.
        train_data = [[(pos, chunk)
                       for _word, pos, chunk in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: list of (word, pos) pairs.

        Returns:
            The chunk tree built by ``nltk.chunk.conlltags2tree`` from the
            predicted (word, pos, chunk tag) triples.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (_pos, chunktag) in tagged_pos_tags]
        # Re-attach the words so the result is in CoNLL triple form.
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
>>>tree1=nltk.Tree('NP',['Alice'])
>>>tree2=nltk.Tree('NP',['the','rabbit'])
>>>tree3=nltk.Tree('VP',['chased',tree2])
>>>tree4=nltk.Tree('S',[tree1,tree3])
>>>print tree4
(S (NP Alice) (VP chased (NP the rabbit)))
>>>print tree4[0]
(NP Alice)
>>>print tree4[0][0]
Alice
>>>print tree4[0][0][0]
A
>>>print tree4[1]
(VP chased (NP the rabbit))
>>>tree4[1].node
'VP'
>>>tree4.leaves()
['Alice','chased','the','rabbit']
>>>tree4[1][1][1]
'rabbit'
def traverse(t):
    """Write a bracketed, space-separated rendering of a tree to stdout.

    A node is anything with a ``node`` attribute that can be iterated to
    obtain its children; anything else is treated as a leaf and written via
    ``str``. Every token (leaf, ``( label`` opener, ``)`` closer) is followed
    by a single space and no newline is written.

    Args:
        t: a tree node or a leaf value.
    """
    # Imported locally so the function does not depend on a file-level import.
    import sys

    # NOTE(review): NLTK 3 exposes the label through t.label() rather than
    # the t.node attribute used here -- confirm the installed NLTK version.
    try:
        label = t.node
    except AttributeError:
        # No label attribute: t is a leaf.
        sys.stdout.write("%s " % (t,))
    else:
        # Now we know that t.node is defined
        sys.stdout.write("( %s " % (label,))
        for child in t:
            traverse(child)
        sys.stdout.write(") ")
>>>t=nltk.Tree('(S (NP Alice) (VP chased (NP the rabbit)))')
>>>traverse(t)
( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) )
# Named-entity chunking demo on sentence 22 of the tagged Penn Treebank sample.
sent = nltk.corpus.treebank.tagged_sents()[22]
# First with binary=True, then with the default setting.
print(nltk.ne_chunk(sent, binary=True))
print(nltk.ne_chunk(sent))