简介
句法分析分为两大类:Dependency Parsing(依存句法分析)和Constituency Parsing(成分句法分析)。中文Dependency Parsing对应的工具比较多,例如哈工大的LTP、Stanford CoreNLP等。但是Constituency Parsing对应的工具比较少,经过调研,Stanford CoreNLP应该是最好的选择。
Stanza
本文主要介绍如何使用stanza来进行Constituency Parsing。stanza是Stanford NLP Group官方的Python工具包,提供了调用Stanford CoreNLP的Python接口,官网链接:https://stanfordnlp.github.io/stanza/ 。
安装
- jdk 1.8 64bit
pip install stanza
- stanza提供了下载Stanford CoreNLP的java包和中文模型的接口
import stanza
# Download the Stanford CoreNLP Java package into the chosen folder.
stanza.install_corenlp(dir="YOUR_CORENLP_FOLDER")
# Download the Chinese models (pinned here to version 4.1.0) into the same folder.
stanza.download_corenlp_models(model='chinese', version='4.1.0', dir="YOUR_CORENLP_FOLDER")
- 配置CoreNLP环境变量
export CORENLP_HOME=YOUR_CORENLP_FOLDER
使用
获取句子constituency parsing的结果
from stanza.server import CoreNLPClient
class StanzaClient():
    """Small wrapper around ``CoreNLPClient`` that returns constituency parses.

    The wrapped client is configured for Chinese and JSON output.  The
    instance can be used as a context manager so that the underlying
    client is stopped deterministically::

        with StanzaClient() as client:
            tree = client.get_parse_tree("今天天气不错啊")

    Plain construction (``StanzaClient()``) keeps working as before; in
    that case call :meth:`close` when the client is no longer needed.
    """

    def __init__(self):
        """Create the CoreNLP client with the annotators required for parsing."""
        self.client = CoreNLPClient(
            # 'parse' yields the constituency tree read in get_parse_tree();
            # the preceding annotators are listed as its pipeline prerequisites.
            annotators=[
                'tokenize',
                'ssplit',
                'pos',
                'lemma',
                'parse',
            ],
            # NOTE(review): presumably milliseconds -- confirm against the
            # stanza CoreNLPClient documentation.
            timeout=30000,
            properties="zh",
            output_format="json",
            memory='5g')

    def __enter__(self):
        """Return ``self`` so the wrapper can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop the client on leaving the ``with`` block; never swallow errors."""
        self.close()
        return False

    def close(self):
        """Stop the wrapped CoreNLP client."""
        self.client.stop()

    def get_parse_tree(self, sent):
        """Return the bracketed constituency parse of the first sentence.

        Args:
            sent: Raw text to annotate.

        Returns:
            The value stored under ``"parse"`` for the first entry of the
            ``"sentences"`` list in the annotation result.  Any further
            sentences found in ``sent`` are ignored.

        Raises:
            IndexError: If the annotation result contains no sentences.
        """
        ann = self.client.annotate(sent)
        return ann["sentences"][0]["parse"]
例如,"今天天气不错啊"的parse tree如下:
(ROOT
(CP
(IP
(NP (NT 今天))
(NP (NN 天气))
(VP (VA 不错)))
(SP 啊)))
应用
组词:输入一句话,期望获得不同的组词方案
举例:今天天气不错啊 => 今天天气/不错啊,今天/天气/不错/啊,今天/天气不错啊
方法:使用nltk.tree.Tree解析parser输出的句法树,自顶向下遍历,并按子树高度对应的概率随机决定是否将整棵子树合并为一个词段,从而得到不同的组词方案
import random
from nltk.tree import Tree
class Stack():
    """Minimal last-in, first-out container backed by a Python list."""

    def __init__(self):
        """Create an empty stack."""
        self.__list = []

    def put(self, item):
        """Push ``item`` onto the top of the stack."""
        self.__list.append(item)

    def get(self):
        """Remove and return the top item.

        Raises:
            IndexError: If the stack is empty.
        """
        return self.__list.pop()

    def peek(self):
        """Return the top item without removing it.

        Raises:
            IndexError: If the stack is empty.
        """
        return self.__list[-1]

    # The method was originally published under the misspelled name
    # ``speek``; keep it as an alias so existing callers keep working.
    speek = peek

    def empty(self):
        """Return True when the stack holds no items."""
        return not self.__list

    def size(self):
        """Return the number of items currently on the stack."""
        return len(self.__list)
class ConParserSeg():
    """Randomly segment a sentence by walking its constituency parse tree.

    Each subtree is either kept whole (all of its leaves joined into one
    segment) or split further into its children.  The decision is random
    and depends on the height of the subtree, so repeated calls on the
    same parse can produce different segmentations.
    """

    def __init__(self, height2prob=None):
        """Store the height -> merge-probability table.

        Args:
            height2prob: Mapping from subtree height to the probability
                that a subtree of that height is emitted as a single
                segment.  Heights missing from the mapping are never
                merged at that level.  When omitted, a fresh copy of the
                default table is used, so instances never share (and
                cannot accidentally mutate) one common default dict.
        """
        if height2prob is None:
            height2prob = {
                4: 0.7,
                5: 0.7,
                6: 0.5,
                7: 0.5,
                8: 0.3,
                9: 0.3,
            }
        self.height2prob = height2prob

    def sent_cut(self, sent):
        """Split a bracketed parse string into a list of text segments.

        Args:
            sent: Bracketed constituency parse, e.g. the string returned
                by the parser for one sentence.

        Returns:
            The segments as a list of strings, in sentence order.

        Raises:
            IndexError: If the root node of the parse has no children.
        """
        root = Tree.fromstring(sent)
        # Only the first child of the root node is segmented.
        pending = [root[0]]
        segments = []
        while pending:
            node = pending.pop()
            if isinstance(node, str):
                # Already collapsed into plain text.
                segments.append(node)
                continue
            if node.height() == 2:
                # A node whose children are all leaves cannot be split further.
                segments.append("".join(node.leaves()))
                continue
            for child in node:
                if isinstance(child, str):
                    # A bare leaf next to subtrees has no height(); keep it as is.
                    pending.append(child)
                    continue
                draw = random.random()
                height = child.height()
                if height in self.height2prob and draw <= self.height2prob[height]:
                    pending.append("".join(child.leaves()))
                else:
                    pending.append(child)
        # The stack is LIFO, so segments were collected right-to-left.
        segments.reverse()
        return segments
if __name__ == "__main__":
    # Alternative sample parse; it is not the one segmented below.  Pass it
    # to sent_cut() instead of `tree_text` to try a more deeply nested tree.
    nested_sample = """
(ROOT
(IP
(VP
(PP (P 在)
(NP (NR 日本)))
(VP (VV 发生) (AS 了)
(NP
(DNP
(NP
(QP (CD 一)
(CLP (M 件)))
(NP (NN 千真万确)))
(DEG 的))
(NP (NN 事)))))))
"""
    # Sample parse that is actually segmented and printed.
    tree_text = "(ROOT\n (IP\n (CP\n (IP\n (VP\n (VRD (VV 吃饭) (VV 去))))\n (SP 吧))\n (IP\n (NP (NN 太阳))\n (VP\n (PP (P 从)\n (NP (NN 东边)))\n (VP (VV 升起))))))"
    segmenter = ConParserSeg()
    pieces = segmenter.sent_cut(tree_text)
    # Segments are printed on one line, separated by single spaces.
    print(" ".join(pieces))