Exercises - Natural Language Processing with Python (Chapter8)

最新推荐文章于 2025-04-04 23:07:43 发布

_Meilinger_

最新推荐文章于 2025-04-04 23:07:43 发布

阅读量209

点赞数 1

分类专栏：碎片笔记文章标签： nlp python 自然语言处理

本文链接：https://blog.csdn.net/qq_36332660/article/details/109845230

版权

碎片笔记专栏收录该内容

52 篇文章

订阅专栏

本文探讨了使用Python中的nltk库进行自然语言处理的方法，包括定义上下文无关文法(CFG)、构建语法树、解析句子结构以及比较不同解析器的效率。通过具体实例展示了如何构建和操作语法树，并评估了不同解析算法的时间性能。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import nltk
from nltk.corpus import brown, treebank, gutenberg
from nltk.tree import Tree
from collections import defaultdict
from timeit import timeit
import prettytable as pt

# 3
grammar1 = nltk.CFG.fromstring("""
  S -> NPV OR NAN | NON AND NPV
  NAN -> NPV AND NPV
  NON -> NPV OR NPV
  NPV -> NP V
  V -> "arrived" | "left" | "cheered"
  NP -> "Kim" | "Dana" | "everyone"
  OR -> "or"
  AND -> "and"
  """)
sent = "Kim arrived or Dana left and everyone cheered".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)


# 4
help(Tree)


# 5
nan = Tree('NAN', [Tree('N', ['men']), Tree('AND', ['and']), Tree('N', ['women'])])
tree_1 = Tree('S', [Tree('ADJ', ['old']), nan])

tree_2 = Tree.fromstring("(S (ADJN (ADJ old) (N men)) (AND and) (N (N women)))")

print(tree_1)
print(tree_2)

tree_2.draw()

'The woman saw a man last Thursday'

tree_3 = Tree.fromstring("(S (NP (Det The)(N woman)) (VP (V saw) (NP (Det a)(N man)) (PP (ADJ last)(NOM Thursday))))")
tree_3.draw()


# 6

tree = Tree('S', [Tree('ADJN', [Tree('ADJ', ['old']), Tree('N', ['men'])]), Tree('AND', ['and']),
                  Tree('N', [Tree('N', ['women'])])])


def depth(tree):
    max_child_height = 0
    for child in tree:
        if isinstance(child, Tree):
            max_child_height = max(max_child_height, child.height())
        else:
            max_child_height = max(max_child_height, 1)
    return max_child_height


print(depth(tree))


# 15
grammar2 = nltk.CFG.fromstring("""
  S  -> NP VP
  NP -> Det Nom | PropN
  VP -> IV ADVS
  ADVS -> ADV ADV
  PropN -> 'Lee'
  IV ->  'ran'
  ADV -> 'away' | 'home'
  """)
sent = "Lee ran away home".split()
rd_parser = nltk.RecursiveDescentParser(grammar2)
for tree in rd_parser.parse(sent):
    print(tree)


# 16 (incomplete)
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = entry.verb + '-' + entry.noun1
    table[key][entry.attachment].add(entry.noun1)

for key in sorted(table):
    if len(table[key]) > 1:
        print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))


# 17
grammar2 = nltk.CFG.fromstring("""
  S  -> NP VP
  NP -> Det Nom | PropN
  VP -> IV ADVS
  ADVS -> ADV ADV
  PropN -> 'Lee'
  IV ->  'ran'
  ADV -> 'away' | 'home'
  """)
sent = "Lee ran away home".split()


def test_ct():
    ct_parser = nltk.ChartParser(grammar2)
    for tree in ct_parser.parse(sent):
        pass
        print(tree)
    return ct_parser


def test_rd():
    rd_parser = nltk.RecursiveDescentParser(grammar2)
    for tree in rd_parser.parse(sent):
        pass
        print(tree)
    return rd_parser


print(timeit(stmt=test_ct, number=1))
print(timeit(stmt=test_rd, number=1))


# 18
grammar = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP | IV ADVS
  ADVS -> ADV ADV
  PP -> P NP
  V -> "saw" | "ate" | "walked"
  NP -> "John" | "Mary" | "Bob" | Det N | Det N PP | Det Nom | PropN
  Det -> "a" | "an" | "the" | "my"
  N -> "man" | "dog" | "cat" | "telescope" | "park"
  P -> "in" | "on" | "by" | "with"
  PropN -> 'Lee'
  IV ->  'ran'
  ADV -> 'away' | 'home'
  """)

sent1 = "Mary saw Bob".split()
sent2 = "Lee ran away home".split()
sent3 = "Mary saw a dog".split()


def test_rd1():
    rd_parser = nltk.RecursiveDescentParser(grammar)
    for tree in rd_parser.parse(sent1):
        pass
        # print(tree)
    return rd_parser


def test_rd2():
    rd_parser = nltk.RecursiveDescentParser(grammar)
    for tree in rd_parser.parse(sent2):
        pass
        # print(tree)
    return rd_parser


def test_rd3():
    rd_parser = nltk.RecursiveDescentParser(grammar)
    for tree in rd_parser.parse(sent3):
        pass
        # print(tree)
    return rd_parser


def test_ct1():
    ct_parser = nltk.ChartParser(grammar)
    for tree in ct_parser.parse(sent1):
        pass
        # print(tree)
    return ct_parser


def test_ct2():
    ct_parser = nltk.ChartParser(grammar)
    for tree in ct_parser.parse(sent2):
        pass
        # print(tree)
    return ct_parser


def test_ct3():
    ct_parser = nltk.ChartParser(grammar)
    for tree in ct_parser.parse(sent3):
        pass
        # print(tree)
    return ct_parser


def test_lc1():
    lc_parser = nltk.LeftCornerChartParser(grammar)
    for tree in lc_parser.parse(sent1):
        pass
        # print(tree)
    return lc_parser


def test_lc2():
    lc_parser = nltk.LeftCornerChartParser(grammar)
    for tree in lc_parser.parse(sent2):
        pass
        # print(tree)
    return lc_parser


def test_lc3():
    lc_parser = nltk.LeftCornerChartParser(grammar)
    for tree in lc_parser.parse(sent3):
        pass
        # print(tree)
    return lc_parser


rd1 = round(timeit(stmt=test_rd1, number=1), 6)
rd2 = round(timeit(stmt=test_rd2, number=1), 6)
rd3 = round(timeit(stmt=test_rd3, number=1), 6)
ct1 = round(timeit(stmt=test_ct1, number=1), 6)
ct2 = round(timeit(stmt=test_ct2, number=1), 6)
ct3 = round(timeit(stmt=test_ct3, number=1), 6)
lc1 = round(timeit(stmt=test_lc1, number=1), 6)
lc2 = round(timeit(stmt=test_lc2, number=1), 6)
lc3 = round(timeit(stmt=test_lc3, number=1), 6)


tb = pt.PrettyTable(["Parser", "Sent1", "Sent2", "Sent3", "Total"])
tb.add_row(['RecursiveDescentParser', rd1, rd2, rd3, round(rd1+rd2+rd3, 6)])
tb.add_row(['ChartParser', ct1, ct2, ct3, round(ct1+ct2+ct3, 6)])
tb.add_row(['LeftCornerChartParser', lc1, lc2, lc3, round(lc1+lc2+lc3, 6)])
tb.add_row(['Total', round(rd1+ct1+lc1, 6), round(rd2+ct2+lc2, 6), round(rd3+ct3+lc3, 6),
            round(rd1+ct1+lc1+rd2+ct2+lc2+rd3+ct3+lc3, 6)])
print(tb)


# 20
from nltk.draw.tree import draw_trees
nan = Tree('NAN', [Tree('N', ['men']), Tree('AND', ['and']), Tree('N', ['women'])])
tree_1 = Tree('S', [Tree('ADJ', ['old']), nan])
tree_2 = Tree.fromstring("(S (ADJN (ADJ old) (N men)) (AND and) (N (N women)))")
tree_3 = Tree.fromstring("(S (NP (Det The)(N woman)) (VP (V saw) (NP (Det a)(N man)) (PP (ADJ last)(NOM Thursday))))")
draw_trees(tree_1, tree_2, tree_3)


# 21
t = treebank.parsed_sents()[:100]
def filter(tree):
    child_nodes = [child.label() for child in tree
                   if isinstance(child, nltk.Tree) and child.height() <= 2]
    return (tree.label() == 'NP-SBJ')


subtrees = [subtree for tree in treebank.parsed_sents()
       for subtree in tree.subtrees(filter)]
for stree in subtrees:
    print(stree)


# 25
sents = gutenberg.sents('austen-emma.txt')
sorted_sents = sorted(sents, key=lambda x: len(x), reverse=True)
print(len(sorted_sents[0]), sorted_sents[0])