从文本提取信息
import nltk
def ie_preprocess(document):
sentences = nltk.sent_tokenize(document)
sentences = [nltk.word_tokenize(sent) for sent in sentences]
sentences = [nltk.pos_tag(sent) for sent in sentences]
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),
("the", "DT"), ("cat", "NN")]
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
"""
(S
(NP the/DT little/JJ yellow/JJ dog/NN)
barked/VBD
at/IN
(NP the/DT cat/NN))
"""
result.draw()
grammer = r"""
NP: {<DT|PP\$>?<JJ>*<NN>}
{<NNP>+}
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"), ("long", "JJ"), ("golden", "JJ")
,("hair", "NN")]
print(cp.parse(sentence))
"""
(S
Rapunzel/NNP
let/VBD
down/RP
her/PP$
(NP long/JJ golden/JJ hair/NN))
"""
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>} # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))
"""
(S (NP money/NN market/NN) fund/NN)
"""
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK':print(subtree)
"""
(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
...
"""
grammer = r"""
NP:
{<.*>+} # Chunk everything
}<VBD|IN>+{ # Chink sequences of VBD and IN
"""
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
"""
(S the/DT little/JJ yellow/JJ dog/NN barked/VBD at/IN the/DT cat/NN)
"""
text = """
he PRP B-NP
accepted VBD B-NP
the DT B-NP
position NN I-NP
of IN B-NP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
"""
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])
"""
(S
(PP Over/IN)
(NP a/DT cup/NN)
(PP of/IN)
(NP coffee/NN)
,/,
(NP Mr./NNP Stone/NNP)
(VP told/VBD)
(NP his/PRP$ story/NN)
./.)
"""
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
"""
(S
Over/IN
(NP a/DT cup/NN)
of/IN
(NP coffee/NN)
,/,
(NP Mr./NNP Stone/NNP)
told/VBD
(NP his/PRP$ story/NN)
./.)
"""
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 43.4%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
"""
grammar = r"NP: {<CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 43.4%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
"""
class UnigramChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
train_data = [[(t,c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = nltk.UnigramTagger(train_data)
def parse(self, sentence):
pos_tags = [pos for (word, pos) in sentence]
tagged_pos_tags = self.tagger.tag(pos_tags)
chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
in zip(sentence, chunktags)]
return nltk.chunk.tree2conlltags(conlltags)
from nltk.corpus import conll2000
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 43.4%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
"""
postags = sorted(set(pos for sent in train_sents
for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))
"""[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), ..."""
class BigramChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
train_data = [[(t,c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = nltk.UnigramTagger(train_data)
def parse(self, sentence):
pos_tags = [pos for (word, pos) in sentence]
tagged_pos_tags = self.tagger.tag(pos_tags)
chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
in zip(sentence, chunktags)]
return nltk.chunk.tree2conlltags(conlltags)
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 43.4%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
"""
class ConsecutiveNPChunkTagger(nltk.TaggerI):
def __init__(self, train_sents):
train_set = []
for tagged_sent in train_sents:
untagged_sent = nltk.tag.untag(tagged_sent)
history = []
for i, (word, tag) in enumerate(tagged_sent):
featureset = npchunk_features(untagged_sent, i, history)
train_set.append((featureset, tag))
history.append(tag)
self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='iis', trace=0)
def tag(self, sentence):
history = []
for i, word in enumerate(sentence):
featureset = npchunk_features(sentence, i, history)
tag = self.classifier.classify(featureset)
history.append(tag)
return zip(sentence, history)
class ConsecutiveNPChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
tagged_sents = [[((w,t),c) for (w,t,c) in
nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = ConsecutiveNPChunkTagger(tagged_sents)
def parse(self, sentence):
tagged_sents = self.tagger.tag(sentence)
conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
return nltk.chunk.conlltags2tree(conlltags)
'''
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
return {"pos": pos}
chunker = ConsecutiveNPChunker(train_sents) # 这个训练比较耗时
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 92.9%%
Precision: 79.9%%
Recall: 86.8%%
F-Measure: 83.2%%
"""
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
if i == 0:
prevword, prevpos = "<START>", "<START>"
else:
prevword, prevpos = sentence[i-1]
return {"pos": pos, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 93.6%%
Precision: 82.0%%
Recall: 87.2%%
F-Measure: 84.6%%
"""
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
if i == 0:
prevword, prevpos = "<START>", "<START>"
else:
prevword, prevpos = sentence[i-1]
return {"pos": pos, "word": word, "prevpos":prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 94.6%%
Precision: 84.6%%
Recall: 89.8%%
F-Measure: 87.1%%
"""
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
if i == 0:
prevword, prevpos = "<START>", "<START>"
else:
prevword, prevpos = sentence[i-1]
if i == len(sentence)-1:
nextword, nextpos = "<END>", "<END>"
else:
nextword, nextpos = sentence[i+1]
return {"pos": pos, "word": word,
"prevpos": prevpos, "nextpos": nextpos,
"prevpos+pos": "%s+%s" % (prevpos, pos),
"pos+nextpos": "%s+%s" % (pos, nextpos),
"tags-since-dt": tags_since_dt(sentence,i)}
def tags_since_dt(sentence, i):
tags = set()
for word, pos in sentence[:i]:
if pos == 'DT':
tags = set()
else:
tags.add(pos)
return '+'.join(sorted(tags))
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
IOB Accuracy: 96.0%%
Precision: 88.3%%
Recall: 91.1%%
F-Measure: 89.7%%
"""
# 7.5 语言结构中的递归
# 用级联分块器构建嵌套结构
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
"""
(S
(NP Mary/NN)
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
"""
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"), ("saw", "VBD"), ("the", "DT"),
("cat", "NN"), ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
"""
(S
(NP John/NNP)
thinks/VBZ
(NP Mary/NN)
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
"""
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
"""
(S
(NP John/NNP)
thinks/VBZ
(CLAUSE
(NP Mary/NN)
(VP
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
"""
# 树状图
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1) # (NP Alice)
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2) # (NP the rabbit)
tree3 = nltk.Tree('VP', ['chased', tree2])
print(tree3) # (VP chased (NP the rabbit))
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4) # (S (NP Alice) (VP chased (NP the rabbit)))
print(tree4[1]) # (VP chased (NP the rabbit))
print(tree4[1].label()) # VP
print(tree4.leaves()) # ['Alice', 'chased', 'the', 'rabbit']
print(tree4[1][1][1]) # rabbit
tree3.draw()
# 递归函数遍历树状图
def traverse(t):
try:
t.label()
except AttributeError:
print(t, end="")
else:
# Now we know that t.node is defined
print("(", t.label(), end='')
for child in t:
traverse(child)
print(")", end='')
import nltk
#t = nltk.Tree('(S (NP Alice) (VP chased (NP the rabbit)))')
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)
"""
(S
(NP Mary/NN)
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
(S
(NP John/NNP)
thinks/VBZ
(NP Mary/NN)
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
(S
(NP John/NNP)
thinks/VBZ
(CLAUSE
(NP Mary/NN)
(VP
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
(NP Alice)
(NP the rabbit)
(VP chased (NP the rabbit))
(S (NP Alice) (VP chased (NP the rabbit)))
(VP chased (NP the rabbit))
VP
['Alice', 'chased', 'the', 'rabbit']
rabbit
( S( NPAlice)( VPchased( NPtherabbit)))
"""
# 命名实体识别(NES)
sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))
"""
(S
The/DT
(NE U.S./NNP)
is/VBZ
one/CD
of/IN
the/DT
few/JJ
industrialized/VBN
nations/NNS
that/WDT
*T*-7/-NONE-
does/VBZ
n't/RB
have/VB
a/DT
higher/JJR
standard/NN
of/IN
regulation/NN
for/IN
the/DT
smooth/JJ
,/,
needle-like/JJ
fibers/NNS
such/JJ
as/IN
crocidolite/NN
that/WDT
*T*-1/-NONE-
are/VBP
classified/VBN
*-5/-NONE-
as/IN
amphobiles/NNS
,/,
according/VBG
to/TO
(NE Brooke/NNP)
T./NNP
Mossman/NNP
,/,
a/DT
professor/NN
of/IN
pathlogy/NN
at/IN
the/DT
(NE University/NNP)
of/IN
(NE Vermont/NNP College/NNP)
of/IN
(NE Medicine/NNP)
./.)
"""
'''
import re, nltk
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
print(nltk.sem.relextract.rtuple(rel))
"""
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
"""
from nltk.corpus import conll2002
vnv = """
( #
is/V| #
was/V| #
werd/V| #
wordt/V #
) #
.* #
van/Prep #
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
for r in nltk.sem.relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
print(nltk.sem.relextract.clause(r, relsym="VAN"))
"""
VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('annie_lennox', 'eurythmics')
"""