NLTK08《Python自然语言处理》code07 从文本提取信息

最新推荐文章于 2024-06-01 22:35:29 发布
longji
最新推荐文章于 2024-06-01 22:35:29 发布
阅读量1.7k
点赞数
分类专栏： NLTK 文章标签： NLTK
本文链接：https://blog.csdn.net/longji/article/details/77748680
版权
NLTK 专栏收录该内容
12 篇文章 0 订阅
订阅专栏
从文本提取信息

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# 《Python自然语言处理》 07 从文本提取信息
# pnlp07.py

# 7.1 信息提取
# 信息提取结构
import nltk
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

# 7.2 分块
# 名词短语分块
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),
            ("the", "DT"), ("cat", "NN")]
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
"""
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
"""
result.draw()

# 标记模式

# 用正则表达式分块
grammer = r"""
NP: {<DT|PP\$>?<JJ>*<NN>}
    {<NNP>+}
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"), ("long", "JJ"), ("golden", "JJ")
            ,("hair", "NN")]
print(cp.parse(sentence))
"""
(S
  Rapunzel/NNP
  let/VBD
  down/RP
  her/PP$
  (NP long/JJ golden/JJ hair/NN))
"""

nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>} # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))
"""
(S (NP money/NN market/NN) fund/NN)
"""

# 探索文本语料库
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':print(subtree)
"""
(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
...
"""

# 缝隙
grammer = r"""
NP: 
  {<.*>+}  # Chunk everything
  }<VBD|IN>+{  # Chink sequences of VBD and IN
"""
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
            ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
"""
(S the/DT little/JJ yellow/JJ dog/NN barked/VBD at/IN the/DT cat/NN)
"""

# 分块的表示：标记与树状图

# 7.3 开发和评估分块器
# 读取IOB格式与CoNLL2000分块语料库
text = """
he PRP B-NP
accepted VBD B-NP
the DT B-NP
position NN I-NP
of IN B-NP
vice NN B-NP
chairman NN I-NP
of IN B-PP
Carlyle NNP B-NP
Group NNP I-NP
, , O
a DT B-NP
merchant NN I-NP
banking NN I-NP
concern NN I-NP
. . O
"""
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()

from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])
"""
(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)
"""

print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
"""
(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)
"""

# 简单评估与基准
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
"""

grammar = r"NP: {<CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
"""

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.tree2conlltags(conlltags)

from nltk.corpus import conll2000
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
"""

postags = sorted(set(pos for sent in train_sents
                     for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))
"""[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), ..."""

class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.tree2conlltags(conlltags)

bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%
"""

# 训练基于分类器的分块器
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)

        # windows 采用megam算法需要自己编译megam模块，比较麻烦，可以尝试使用iis、gis之类的算法
        self.classifier = nltk.MaxentClassifier.train(train_set, algorithm='iis', trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

# 不同的npchunk_features
'''
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos": pos}
chunker = ConsecutiveNPChunker(train_sents) # 这个训练比较耗时
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%
"""

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  93.6%%
    Precision:     82.0%%
    Recall:        87.2%%
    F-Measure:     84.6%%
"""

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "word": word, "prevpos":prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  94.6%%
    Precision:     84.6%%
    Recall:        89.8%%
    F-Measure:     87.1%%
"""

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos, "word": word,
            "prevpos": prevpos, "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence,i)}
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
"""
ChunkParse score:
    IOB Accuracy:  96.0%%
    Precision:     88.3%%
    Recall:        91.1%%
    F-Measure:     89.7%%
"""

# 7.5 语言结构中的递归
# 用级联分块器构建嵌套结构
grammar = r"""
NP: {<DT|JJ|NN.*>+}           # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>}                # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$}  # Chunk verbs and their arguments
CLAUSE: {<NP><VP>}            # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
"""
(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
"""
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"), ("saw", "VBD"), ("the", "DT"),
            ("cat", "NN"), ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
"""
(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
"""
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
"""
(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
"""
# 树状图
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1) # (NP Alice)
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2) # (NP the rabbit)
tree3 = nltk.Tree('VP', ['chased', tree2])
print(tree3) # (VP chased (NP the rabbit))
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4) # (S (NP Alice) (VP chased (NP the rabbit)))
print(tree4[1]) # (VP chased (NP the rabbit))
print(tree4[1].label()) # VP
print(tree4.leaves()) # ['Alice', 'chased', 'the', 'rabbit']
print(tree4[1][1][1]) # rabbit
tree3.draw()

# 递归函数遍历树状图
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end="")
    else:
        # Now we know that t.node is defined
        print("(", t.label(), end='')
        for child in t:
            traverse(child)
        print(")", end='')
import nltk
#t = nltk.Tree('(S (NP Alice) (VP chased (NP the rabbit)))')
t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)
"""
(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
(NP Alice)
(NP the rabbit)
(VP chased (NP the rabbit))
(S (NP Alice) (VP chased (NP the rabbit)))
(VP chased (NP the rabbit))
VP
['Alice', 'chased', 'the', 'rabbit']
rabbit
( S( NPAlice)( VPchased( NPtherabbit)))
"""

# 命名实体识别(NES)
sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))
"""
(S
  The/DT
  (NE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (NE Brooke/NNP)
  T./NNP
  Mossman/NNP
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (NE University/NNP)
  of/IN
  (NE Vermont/NNP College/NNP)
  of/IN
  (NE Medicine/NNP)
  ./.)
"""
'''
# 7.6 关系抽取
import re, nltk
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.relextract.rtuple(rel))
"""
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
"""

from nltk.corpus import conll2002
vnv = """
(        # 
is/V|    # 
was/V|   # 
werd/V|  # 
wordt/V  # 
)        # 
.*       # 
van/Prep # 
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    for r in nltk.sem.relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
        print(nltk.sem.relextract.clause(r, relsym="VAN"))
"""
VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('annie_lennox', 'eurythmics')
"""