使用 Python 自然语言处理工具 NLTK(Natural Language Toolkit, http://www.nltk.org/ )完成计算语言学中的 Chart Parsing。
安装
使用 pip 安装( http://www.nltk.org/install.html ):
pip install nltk
或下载安装包安装( http://pypi.python.org/pypi/nltk )
python setup.py install
需要安装NLTK Data( http://www.nltk.org/data.html )。在python中输入以下代码:
# Launch the NLTK downloader to fetch the NLTK Data packages.
import nltk
nltk.download()
Chart Parsing
- 参考
http://www.nltk.org/api/nltk.parse.html
http://www.ling.helsinki.fi/kit/2008s/clt231/nltk-0.9.5/doc/en/ch08.html
http://www.nltk.org/book/ch08-extras.html
demo - chart parser
tree
Parse the sentence "John ate the cat"
import nltk
# Toy context-free grammar that covers the sentence "John ate the cat".
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V
NP -> NAME | ART N
NAME -> 'John'
V -> 'ate'
ART -> 'the'
N -> 'cat'
""")
tokens = ['John', 'ate', 'the', 'cat']

# 1) Default chart parser. With trace=1 the parser prints the chart edges
#    as it works (see the output listing that follows this snippet).
parser = nltk.ChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
    print(tree)
    # Show the parse tree graphically (needs a GUI environment).
    tree.draw()

# 2) Bottom-up chart parser on the same grammar and tokens.
parser = nltk.parse.chart.BottomUpChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
    print(tree)

# 3) Earley chart parser on the same grammar and tokens.
parser = nltk.parse.earleychart.EarleyChartParser(grammar, trace=1)
for tree in parser.parse(tokens):
    print(tree)
output
|.   John  .   ate   .   the   .   cat   .|
|[---------]         .         .         .| [0:1] 'John'
|.         [---------]         .         .| [1:2] 'ate'
|.         .         [---------]         .| [2:3] 'the'
|.         .         .         [---------]| [3:4] 'cat'
|[---------]         .         .         .| [0:1] NAME -> 'John' *
|[---------]         .         .         .| [0:1] NP -> NAME *
|[--------->         .         .         .| [0:1] S  -> NP * VP
|.         [---------]         .         .| [1:2] V  -> 'ate' *
|.         [--------->         .         .| [1:2] VP -> V * NP
|.         [---------]         .         .| [1:2] VP -> V *
|[-------------------]         .         .| [0:2] S  -> NP VP *
|.         .         [---------]         .| [2:3] ART -> 'the' *
|.         .         [--------->         .| [2:3] NP -> ART * N
|.         .         .         [---------]| [3:4] N  -> 'cat' *
|.         .         [-------------------]| [2:4] NP -> ART N *
|.         .         [------------------->| [2:4] S  -> NP * VP
|.         [-----------------------------]| [1:4] VP -> V NP *
|[=======================================]| [0:4] S  -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))
|.   John  .   ate   .   the   .   cat   .|
|[---------]         .         .         .| [0:1] 'John'
|.         [---------]         .         .| [1:2] 'ate'
|.         .         [---------]         .| [2:3] 'the'
|.         .         .         [---------]| [3:4] 'cat'
|>         .         .         .         .| [0:0] NAME -> * 'John'
|[---------]         .         .         .| [0:1] NAME -> 'John' *
|>         .         .         .         .| [0:0] NP -> * NAME
|[---------]         .         .         .| [0:1] NP -> NAME *
|>         .         .         .         .| [0:0] S  -> * NP VP
|[--------->         .         .         .| [0:1] S  -> NP * VP
|.         >         .         .         .| [1:1] V  -> * 'ate'
|.         [---------]         .         .| [1:2] V  -> 'ate' *
|.         >         .         .         .| [1:1] VP -> * V NP
|.         >         .         .         .| [1:1] VP -> * V
|.         [--------->         .         .| [1:2] VP -> V * NP
|.         [---------]         .         .| [1:2] VP -> V *
|[-------------------]         .         .| [0:2] S  -> NP VP *
|.         .         >         .         .| [2:2] ART -> * 'the'
|.         .         [---------]         .| [2:3] ART -> 'the' *
|.         .         >         .         .| [2:2] NP -> * ART N
|.         .         [--------->         .| [2:3] NP -> ART * N
|.         .         .         >         .| [3:3] N  -> * 'cat'
|.         .         .         [---------]| [3:4] N  -> 'cat' *
|.         .         [-------------------]| [2:4] NP -> ART N *
|.         .         >         .         .| [2:2] S  -> * NP VP
|.         [-----------------------------]| [1:4] VP -> V NP *
|.         .         [------------------->| [2:4] S  -> NP * VP
|[=======================================]| [0:4] S  -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))
|.   John  .   ate   .   the   .   cat   .|
|[---------]         .         .         .| [0:1] 'John'
|.         [---------]         .         .| [1:2] 'ate'
|.         .         [---------]         .| [2:3] 'the'
|.         .         .         [---------]| [3:4] 'cat'
|>         .         .         .         .| [0:0] S  -> * NP VP
|>         .         .         .         .| [0:0] NP -> * NAME
|>         .         .         .         .| [0:0] NP -> * ART N
|>         .         .         .         .| [0:0] NAME -> * 'John'
|[---------]         .         .         .| [0:1] NAME -> 'John' *
|[---------]         .         .         .| [0:1] NP -> NAME *
|[--------->         .         .         .| [0:1] S  -> NP * VP
|.         >         .         .         .| [1:1] VP -> * V NP
|.         >         .         .         .| [1:1] VP -> * V
|.         >         .         .         .| [1:1] V  -> * 'ate'
|.         [---------]         .         .| [1:2] V  -> 'ate' *
|.         [--------->         .         .| [1:2] VP -> V * NP
|.         [---------]         .         .| [1:2] VP -> V *
|[-------------------]         .         .| [0:2] S  -> NP VP *
|.         .         >         .         .| [2:2] NP -> * NAME
|.         .         >         .         .| [2:2] NP -> * ART N
|.         .         >         .         .| [2:2] ART -> * 'the'
|.         .         [---------]         .| [2:3] ART -> 'the' *
|.         .         [--------->         .| [2:3] NP -> ART * N
|.         .         .         >         .| [3:3] N  -> * 'cat'
|.         .         .         [---------]| [3:4] N  -> 'cat' *
|.         .         [-------------------]| [2:4] NP -> ART N *
|.         [-----------------------------]| [1:4] VP -> V NP *
|[=======================================]| [0:4] S  -> NP VP *
(S (NP (NAME John)) (VP (V ate) (NP (ART the) (N cat))))
Well-Formed Substring Tables
Parse the sentence "John ate the cat"
import nltk
# Grammar used for the well-formed substring table (WFST) example.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V
NP -> NAME | ART N
NAME -> 'John'
V -> 'ate'
ART -> 'the'
N -> 'cat'
""")
# Input sentence, one list element per word.
tokens = 'John ate the cat'.split()
def init_wfst(tokens, grammar):
    """Build the initial well-formed substring table (WFST) for *tokens*.

    The table is an (n+1) x (n+1) list of lists, where n is ``len(tokens)``,
    with every cell initialised to ``'.'``.  For each token at position
    ``i`` the cell ``[i][i+1]`` is then set to the left-hand side of the
    first production returned by ``grammar.productions(rhs=token)``.

    Args:
        tokens: Sequence of input tokens.
        grammar: Object providing ``productions(rhs=...)`` whose results
            provide ``lhs()`` (e.g. an ``nltk.CFG``).

    Returns:
        The newly created table.

    Raises:
        IndexError: If no production is returned for one of the tokens.
    """
    size = len(tokens) + 1
    # Build every row separately so that rows never share one list object.
    wfst = [['.'] * size for _ in range(size)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        if not productions:
            # Same exception type as indexing an empty result, but with a
            # message that names the offending token.
            raise IndexError("no production found for token %r" % (token,))
        # Only the first matching production is recorded in the table.
        wfst[i][i + 1] = productions[0].lhs()
    return wfst
def display(wfst, tokens):
    """Print a well-formed substring table as a text grid.

    A blank line and a header row (``WFST`` followed by the column numbers
    ``1 .. len(wfst)-1``) are printed first, then one row per start
    position ``0 .. len(wfst)-2``.  Column 0 and the last row of the table
    are not shown.

    Args:
        wfst: Square table (list of lists) as produced by ``init_wfst``.
        tokens: Not used by this function; kept so existing calls keep
            working.
    """
    size = len(wfst)
    print('\nWFST ' + ' '.join("%-4d" % end for end in range(1, size)))
    for start in range(size - 1):
        cells = ''.join(" %-4s" % wfst[start][end] for end in range(1, size))
        # The row label is padded to the width of the "WFST" header label
        # so that every cell lines up under its column number.
        print(("%-4d" % start) + cells)
# Build the initial table for the sentence and print it.
wfst0 = init_wfst(tokens, grammar)
display(wfst0, tokens)
output
WFST 1    2    3    4
0    NAME .    .    .
1    .    V    .    .
2    .    .    ART  .
3    .    .    .    N
Charts
Parse the sentence "John ate the cat"
import nltk
# Grammar used by complete_wfst below.
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V
NP -> NAME | ART N
NAME -> 'John'
V -> 'ate'
ART -> 'the'
N -> 'cat'
""")
# Input sentence, one list element per word.
tokens = 'John ate the cat'.split()
def complete_wfst(wfst, tokens, trace=False, grammar=None):
    """Fill in a well-formed substring table by combining adjacent spans.

    For every span of two or more tokens, each split point ``mid`` is
    tried: if the pair ``(wfst[start][mid], wfst[mid][end])`` equals the
    right-hand side of a production, ``wfst[start][end]`` is set to that
    production's left-hand side.  Because the lookup key is always a pair,
    only productions with exactly two right-hand-side symbols can match.

    Args:
        wfst: Table to complete; it is modified in place.
        tokens: The input tokens (only their count is used).
        trace: When true, print one line for every combination found.
        grammar: Object providing ``productions()`` whose results provide
            ``lhs()`` and ``rhs()``.  When omitted, the module-level
            ``grammar`` is used.

    Returns:
        The same ``wfst`` object that was passed in.
    """
    if grammar is None:
        # Backward compatibility: existing callers do not pass a grammar
        # and rely on the module-level ``grammar`` variable.
        grammar = globals()['grammar']
    # Map right-hand side -> left-hand side.  If several productions share
    # a right-hand side, the last one listed wins.
    index = {prod.rhs(): prod.lhs() for prod in grammar.productions()}
    numtokens = len(tokens)
    for span in range(2, numtokens + 1):
        for start in range(numtokens + 1 - span):
            end = start + span
            for mid in range(start + 1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if (nt1, nt2) not in index:
                    continue
                parent = index[(nt1, nt2)]
                if trace:
                    print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]" %
                          (start, nt1, mid, nt2, end, start, parent, end))
                wfst[start][end] = parent
    return wfst
# Complete the table built earlier (wfst0), printing each combination found.
# complete_wfst modifies the table in place and returns that same table.
wfst1 = complete_wfst(wfst0, tokens, trace=True)
output
[2] ART [3]   N [4] ==> [2]  NP [4]
[1]   V [2]  NP [4] ==> [1]  VP [4]
earleychart demo
import nltk
# Run the demo() function that ships in NLTK's earleychart module.
nltk.parse.earleychart.demo()
Complex grammar and tokens
# Larger example: a grammar with prepositional phrases, where a PP can be
# part of an NP (NP -> Det N PP) or of a VP (VP -> VP PP).
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
# Input sentence, one list element per word.
tokens = 'I shot an elephant in my pajamas'.split()
# Alternative example: 'can', 'hold' and 'water' are each listed under more
# than one category (N and V, and for 'can' also AUX).
# Note: this rebinds ``grammar`` and ``tokens``.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> ART ADJ N | ART N | ADJ N
VP -> AUX VP | V NP
ART -> 'the'
ADJ -> 'large'
N -> 'can' | 'hold' | 'water'
AUX -> 'can'
V -> 'can' | 'hold' | 'water'
""")
# Input sentence, one list element per word.
tokens = 'the large can can hold the water'.split()