[Python开发] python使用nltk进行Chart Parsing

python用自然语言处理工具nltk(Natural Language Toolkit, http://www.nltk.org/ )完成计算语言学中的Chart Parsing。

安装

  1. 使用pip安装
    http://www.nltk.org/install.html

    pip install nltk

    或下载安装包安装( http://pypi.python.org/pypi/nltk

    python setup.py install
  2. 需要安装NLTK Data( http://www.nltk.org/data.html )。在python中输入以下代码:

    import nltk
    nltk.download()

Chart Parsing

demo - chart parser

tree

Parse a sentence John ate the cat

import nltk

grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | V
    NP -> NAME | ART N
    NAME -> 'John'
    V -> 'ate'
    ART -> 'the'
    N -> 'cat'
    """)

tokens = ['John', 'ate', 'the', 'cat']

parser = nltk.ChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
    print(tree)
    tree.draw()

parser = nltk.parse.chart.BottomUpChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
    print(tree)

parser = nltk.parse.earleychart.EarleyChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
    print(tree)
output
|.   John  .   ate   .   the   .   cat   .|
|[---------]         .         .         .| [0:1] 'John'
|.         [---------]         .         .| [1:2] 'ate'
|.         .         [---------]         .| [2:3] 'the'
|.         .         .         [---------]| [3:4] 'cat'
|[---------]         .         .         .| [0:1] NAME -> 'John' *
|[---------]         .         .         .| [0:1] NP -> NAME *
|[--------->         .         .         .| [0:1] S  -> NP * VP
|.         [---------]         .         .| [1:2] V  -> 'ate' *
|.         [--------->         .         .| [1:2] VP -> V * NP
|.         [---------]         .         .| [1:2] VP -> V *
|[-------------------]         .         .| [0:2] S  -> NP VP *
|.         .         [---------]         .| [2:3] ART -> 'the' *
|.         .         [--------->         .| [2:3] NP -> ART * N
|.         .         .         [---------]| [3:4] N  -> 'cat' *
|.         .         [-------------------]| [2:4] NP -> ART N *
|.         .         [------------------->| [2:4] S  -> NP * VP
|.         [-----------------------------]| [1:4] VP -> V NP *
|[=======================================]| [0:4] S  -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

|.   John  .   ate   .   the   .   cat   .|
|[---------]         .         .         .| [0:1] 'John'
|.         [---------]         .         .| [1:2] 'ate'
|.         .         [---------]         .| [2:3] 'the'
|.         .         .         [---------]| [3:4] 'cat'
|>         .         .         .         .| [0:0] NAME -> * 'John'
|[---------]         .         .         .| [0:1] NAME -> 'John' *
|>         .         .         .         .| [0:0] NP -> * NAME
|[---------]         .         .         .| [0:1] NP -> NAME *
|>         .         .         .         .| [0:0] S  -> * NP VP
|[--------->         .         .         .| [0:1] S  -> NP * VP
|.         >         .         .         .| [1:1] V  -> * 'ate'
|.         [---------]         .         .| [1:2] V  -> 'ate' *
|.         >         .         .         .| [1:1] VP -> * V NP
|.         >         .         .         .| [1:1] VP -> * V
|.         [--------->         .         .| [1:2] VP -> V * NP
|.         [---------]         .         .| [1:2] VP -> V *
|[-------------------]         .         .| [0:2] S  -> NP VP *
|.         .         >         .         .| [2:2] ART -> * 'the'
|.         .         [---------]         .| [2:3] ART -> 'the' *
|.         .         >         .         .| [2:2] NP -> * ART N
|.         .         [--------->         .| [2:3] NP -> ART * N
|.         .         .         >         .| [3:3] N  -> * 'cat'
|.         .         .         [---------]| [3:4] N  -> 'cat' *
|.         .         [-------------------]| [2:4] NP -> ART N *
|.         .         >         .         .| [2:2] S  -> * NP VP
|.         [-----------------------------]| [1:4] VP -> V NP *
|.         .         [------------------->| [2:4] S  -> NP * VP
|[=======================================]| [0:4] S  -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

|.   John  .   ate   .   the   .   cat   .|
|[---------]         .         .         .| [0:1] 'John'
|.         [---------]         .         .| [1:2] 'ate'
|.         .         [---------]         .| [2:3] 'the'
|.         .         .         [---------]| [3:4] 'cat'
|>         .         .         .         .| [0:0] S  -> * NP VP
|>         .         .         .         .| [0:0] NP -> * NAME
|>         .         .         .         .| [0:0] NP -> * ART N
|>         .         .         .         .| [0:0] NAME -> * 'John'
|[---------]         .         .         .| [0:1] NAME -> 'John' *
|[---------]         .         .         .| [0:1] NP -> NAME *
|[--------->         .         .         .| [0:1] S  -> NP * VP
|.         >         .         .         .| [1:1] VP -> * V NP
|.         >         .         .         .| [1:1] VP -> * V
|.         >         .         .         .| [1:1] V  -> * 'ate'
|.         [---------]         .         .| [1:2] V  -> 'ate' *
|.         [--------->         .         .| [1:2] VP -> V * NP
|.         [---------]         .         .| [1:2] VP -> V *
|[-------------------]         .         .| [0:2] S  -> NP VP *
|.         .         >         .         .| [2:2] NP -> * NAME
|.         .         >         .         .| [2:2] NP -> * ART N
|.         .         >         .         .| [2:2] ART -> * 'the'
|.         .         [---------]         .| [2:3] ART -> 'the' *
|.         .         [--------->         .| [2:3] NP -> ART * N
|.         .         .         >         .| [3:3] N  -> * 'cat'
|.         .         .         [---------]| [3:4] N  -> 'cat' *
|.         .         [-------------------]| [2:4] NP -> ART N *
|.         [-----------------------------]| [1:4] VP -> V NP *
|[=======================================]| [0:4] S  -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))

Well-Formed Substring Tables

Parse a sentence John ate the cat

import nltk
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | V
    NP -> NAME | ART N
    NAME -> 'John'
    V -> 'ate'
    ART -> 'the'
    N -> 'cat'
    """)

tokens = ['John', 'ate', 'the', 'cat']

def init_wfst(tokens, grammar):
    numtokens = len(tokens)
    wfst = [['.' for i in range(numtokens+1)] for j in range(numtokens+1)]
    for i in range(numtokens):
        productions = grammar.productions(rhs=tokens[i])
        wfst[i][i+1] = productions[0].lhs()
    return wfst

def display(wfst, tokens):
    print('\nWFST ' + ' '.join([("%-4d" % i) for i in range(1, len(wfst))]))
    for i in range(len(wfst)-1):
        print_string = ("%d   " % i)
        for j in range(1, len(wfst)):
            print_string += (" %-4s"% wfst[i][j])
        print(print_string)

wfst0 = init_wfst(tokens, grammar)
display(wfst0, tokens)
output
WFST 1    2    3    4
0    NAME .    .    .
1    .    V    .    .
2    .    .    ART  .
3    .    .    .    N

Charts

Parse a sentence John ate the cat

import nltk
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | V
    NP -> NAME | ART N
    NAME -> 'John'
    V -> 'ate'
    ART -> 'the'
    N -> 'cat'
    """)

tokens = ['John', 'ate', 'the', 'cat']

def complete_wfst(wfst, tokens, trace=False):
    index = {}
    for prod in grammar.productions():
        index[prod.rhs()] = prod.lhs()
    numtokens = len(tokens)
    for span in range(2, numtokens+1):
        for start in range(numtokens+1-span):
            end = start + span
            for mid in range(start+1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if (nt1,nt2) in index:
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" % \
                        (start, nt1, mid, nt2, end, start, index[(nt1,nt2)], end))
                    wfst[start][end] = index[(nt1,nt2)]
    return wfst

wfst1 = complete_wfst(wfst0, tokens, trace=True)
output
[2] ART [3]   N [4] ==> [2]  NP [4]
[1]   V [2]  NP [4] ==> [1]  VP [4]

earleychart demo

import nltk
nltk.parse.earleychart.demo()

Complex grammar and tokens

grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
tokens = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']

grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> ART ADJ N | ART N | ADJ N
    VP -> AUX VP | V NP
    ART -> 'the'
    ADJ -> 'large'
    N -> 'can' | 'hold' | 'water'
    AUX -> 'can'
    V -> 'can' | 'hold' | 'water'
    """)
tokens = ['the', 'large', 'can', 'can', 'hold', 'the', 'water']
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值