本代码是一个句法分析器的简单实现,未经过优化,主要用于帮助理解分析器的基本原理和算法;有了一定基础后,可以进一步参考Stanford Parser的源码(经过了高度优化)。
下面是文件的大概预览
下面是运行的结果
下面是各个类的代码
package com.sample;
import java.util.Arrays;
import java.util.List;
/**
 * Probabilistic syntax parser.
 *
 * <p>The grammar is assumed to be a type-2 (context-free) grammar in which every rule has at
 * most two symbols on its right-hand side. The parser fills two tables indexed by
 * {@code [start][end][symbolId]} using memoized recursion and then reads the best tree back
 * out of the recorded paths.
 */
class Parser {
    /** Id of the root symbol; the first rule head registered by {@code RuleUtil} ("S") gets id 0. */
    private static final int ROOT_SYMBOL_ID = 0;
    /** Marker for a score cell that has not been computed yet; computed scores are always >= 0. */
    private static final double NOT_COMPUTED = -1;

    // Best score of each symbol over each word span: double[start][end][symbolId].
    private double[][][] scoreMatrix;
    // Back-pointer for each scored cell: PathEntity[start][end][symbolId].
    private PathEntity[][][] pathEntityMatrices;

    /**
     * Parses a tagged word list and returns the best tree together with its probability.
     *
     * @param wordList the words to parse, in sentence order
     * @return the bracketed tree string followed by {@code " prob : <probability>"}; for an
     *         empty list the tree part is empty and the probability is 0.0
     */
    String parse(List<WordEntity> wordList) {
        if (wordList.isEmpty()) {
            // No span exists to analyze; indexing the (empty) matrices would throw.
            return " prob : " + 0.0;
        }
        initializeMatrix(wordList);
        int lastIndex = wordList.size() - 1;
        double prob = analyze(0, lastIndex, ROOT_SYMBOL_ID);
        String result = getBestPath(0, lastIndex, ROOT_SYMBOL_ID, wordList);
        result += " prob : " + prob;
        return result;
    }

    /**
     * Allocates the score and path tables and seeds the single-word cells.
     *
     * @param wordList the words to parse
     */
    private void initializeMatrix(List<WordEntity> wordList) {
        RuleUtil ruleUtil = RuleUtil.getInstance();
        int wordsCount = wordList.size();
        int symbolsCount = ruleUtil.getRule2IdMap().size();

        // 1. Score table: every cell starts as "not computed".
        this.scoreMatrix = new double[wordsCount][wordsCount][symbolsCount];
        for (double[][] row : this.scoreMatrix) {
            for (double[] cell : row) {
                Arrays.fill(cell, NOT_COMPUTED);
            }
        }
        // Seed each word's own attribute symbol with its lexical probability.
        for (int i = 0; i < wordsCount; i++) {
            int symbolId = ruleUtil.getRule2IdMap().getOrDefault(wordList.get(i).getAttr(), -1);
            // -1 means the attribute is not a known symbol; 0 is a valid id and must be accepted.
            if (symbolId >= 0) {
                this.scoreMatrix[i][i][symbolId] = getSymbolProb();
            }
        }

        // 2. Path table: entries are filled in lazily by analyze().
        this.pathEntityMatrices = new PathEntity[wordsCount][wordsCount][symbolsCount];
    }

    /**
     * Computes the best probability with which {@code symbolIndex} derives the words in
     * {@code [start, end]}, memoizing the score and the chosen path for the cell.
     *
     * <p>A unary rule recurses on the same span before the current cell is written, so the
     * grammar must not contain a cycle of unary rules.
     *
     * @param start       index of the first word of the span
     * @param end         index of the last word of the span (inclusive)
     * @param symbolIndex id of the symbol to derive
     * @return the best probability found, or 0.0 if the symbol cannot derive the span
     */
    private double analyze(int start, int end, int symbolIndex) {
        // A negative score means the cell has not been computed yet.
        if (this.scoreMatrix[start][end][symbolIndex] < 0) {
            RuleUtil ruleUtil = RuleUtil.getInstance();
            String ruleSymbol = ruleUtil.getId2RuleMap().get(symbolIndex);
            List<RuleEntity> ruleList = ruleUtil.getRulesRepository().get(ruleSymbol);
            PathEntity bestPath = new PathEntity(-1, -1, -1);
            double maxProb = 0.0;
            if (ruleList != null) {
                for (RuleEntity entity : ruleList) {
                    // Resolve child ids once per rule instead of once per cut position.
                    int leftId = ruleUtil.getRule2IdMap().get(entity.getLeftConstituent());
                    if (entity.hasRightConstituent()) {
                        int rightId = ruleUtil.getRule2IdMap().get(entity.getRightConstituent());
                        // Binary rule: try every split of the span into a left and a right part.
                        for (int cutPosition = start; cutPosition < end; cutPosition++) {
                            double curProb = entity.getProbability()
                                    * analyze(start, cutPosition, leftId)
                                    * analyze(cutPosition + 1, end, rightId);
                            if (curProb > maxProb) {
                                maxProb = curProb;
                                bestPath = new PathEntity(cutPosition, leftId, rightId);
                            }
                        }
                    } else {
                        // Unary rule: the single child must derive the whole span.
                        double curProb = entity.getProbability() * analyze(start, end, leftId);
                        if (curProb > maxProb) {
                            maxProb = curProb;
                            bestPath = new PathEntity(end, leftId, -1);
                        }
                    }
                }
            }
            this.scoreMatrix[start][end][symbolIndex] = maxProb;
            this.pathEntityMatrices[start][end][symbolIndex] = bestPath;
        }
        return this.scoreMatrix[start][end][symbolIndex];
    }

    /**
     * Renders the best tree for a cell as a bracketed string by following the recorded paths.
     *
     * <p>A single-word span is printed as {@code (symbol word)} directly, without descending
     * through any unary rules recorded for that cell.
     *
     * @param start       index of the first word of the span
     * @param end         index of the last word of the span (inclusive)
     * @param symbolIndex id of the symbol at the top of this subtree, or -1 for "no child"
     * @param wordList    the words being parsed
     * @return the bracketed subtree, or an empty string when there is no subtree
     */
    private String getBestPath(int start, int end, int symbolIndex, List<WordEntity> wordList) {
        if (symbolIndex == -1 || start > end) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        result.append("(").append(RuleUtil.getInstance().getId2RuleMap().get(symbolIndex)).append(" ");
        if (start == end) {
            result.append(wordList.get(start).getWord()).append(")");
            return result.toString();
        }
        PathEntity pathEntity = this.pathEntityMatrices[start][end][symbolIndex];
        int cut = pathEntity.getCurPosition();
        result.append(getBestPath(start, cut, pathEntity.getLeftConstituentId(), wordList));
        result.append(getBestPath(cut + 1, end, pathEntity.getRightConstituentId(), wordList));
        result.append(")");
        return result.toString();
    }

    /**
     * Returns the lexical probability used to seed a word's attribute symbol.
     *
     * <p>Currently every attribute is treated as certain; a dictionary-based probability could
     * be plugged in here to help disambiguate.
     *
     * @return always 1.0
     */
    private double getSymbolProb() {
        return 1.0;
    }
}
package com.sample;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Rule utility.
 * Holds the grammar data the parser needs: the rules grouped by head symbol and the
 * two-way mapping between symbols and integer ids.
 */
class RuleUtil {
    private static RuleUtil instance;
    // All rules, keyed by the head symbol on the left of "->".
    private Map<String, List<RuleEntity>> rulesRepository;
    // symbol --> id lookup.
    private Map<String, Integer> rule2IdMap;
    // id --> symbol lookup.
    private Map<Integer, String> id2RuleMap;

    private RuleUtil() {
        initializeRuleData();
    }

    /**
     * Returns the shared instance, creating it on first use.
     */
    static synchronized RuleUtil getInstance() {
        if (instance != null) {
            return instance;
        }
        instance = new RuleUtil();
        return instance;
    }

    /**
     * Parses every rule expression and fills the repository and both id maps.
     * Ids are handed out in first-seen order: head symbol, then left child, then right child.
     */
    private void initializeRuleData() {
        this.rulesRepository = new HashMap<>();
        this.rule2IdMap = new HashMap<>();
        this.id2RuleMap = new HashMap<>();
        for (String expression : getAllRule()) {
            String[] headAndBody = expression.split(CommonEnum.ARRAY_SPLIT_SYMBOL.getValue());
            String[] bodyTokens = headAndBody[1].split(CommonEnum.SPACE_SPLIT_SYMBOL.getValue());
            RuleEntity parsed;
            switch (bodyTokens.length) {
                case 3:
                    // "left right probability"
                    parsed = new RuleEntity(bodyTokens[0], bodyTokens[1], Double.parseDouble(bodyTokens[2]));
                    break;
                case 2:
                    // "left probability"
                    parsed = new RuleEntity(bodyTokens[0], null, Double.parseDouble(bodyTokens[1]));
                    break;
                default:
                    // Any other shape is skipped.
                    parsed = null;
                    break;
            }
            if (parsed == null) {
                continue;
            }
            String head = headAndBody[0];
            // Add the rule under its head symbol.
            this.rulesRepository.computeIfAbsent(head, key -> new ArrayList<>()).add(parsed);
            // Make sure every symbol mentioned by the rule has an id.
            registerSymbol(head);
            registerSymbol(parsed.getLeftConstituent());
            if (parsed.getRightConstituent() != null) {
                registerSymbol(parsed.getRightConstituent());
            }
        }
    }

    /**
     * Assigns the next free id to a symbol unless it already has one.
     */
    private void registerSymbol(String symbol) {
        if (this.rule2IdMap.containsKey(symbol)) {
            return;
        }
        // Ids are consecutive from 0, so the current map size is the next free id.
        int nextId = this.rule2IdMap.size();
        this.rule2IdMap.put(symbol, nextId);
        this.id2RuleMap.put(nextId, symbol);
    }

    /**
     * Returns all rule expressions.
     * The rules are hard-coded here for simplicity; they would normally come from a file or
     * a database.
     *
     * @return the list of rule expressions
     */
    private List<String> getAllRule() {
        // NOTE(review): the four S rules below sum to 1.1 rather than 1.0 -- confirm intended.
        String[] expressions = {
            "S->NP VP 0.7",
            "S->VP 0.2",
            "S->NP 0.1",
            "S->VC 0.1",
            "NP->noun 0.3",
            "NP->adj noun 0.2",
            "NP->DJ 0.2",
            "NP->DJ NP 0.3",
            "DJ->VP de 0.4",
            "DJ->NP de 0.6",
            "VP->VC NP 1.0",
            "VC->vt adj 0.3",
            "VC->VC utl 0.5",
            "VC->vt 0.2"
        };
        List<String> rules = new ArrayList<>();
        for (String expression : expressions) {
            rules.add(expression);
        }
        return rules;
    }

    Map<String, List<RuleEntity>> getRulesRepository() {
        return this.rulesRepository;
    }

    Map<String, Integer> getRule2IdMap() {
        return this.rule2IdMap;
    }

    Map<Integer, String> getId2RuleMap() {
        return this.id2RuleMap;
    }
}
package com.sample;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo entry point: parses one hard-coded tagged sentence and prints the result.
 */
public class Test {
    public static void main(String[] args) {
        // Each row is {word, attribute}, in sentence order.
        String[][] taggedWords = {
            {"咬", "vt"},
            {"死", "adj"},
            {"了", "utl"},
            {"猎人", "noun"},
            {"的", "de"},
            {"狗", "noun"}
        };
        List<WordEntity> wordList = new ArrayList<>();
        for (String[] pair : taggedWords) {
            wordList.add(new WordEntity(pair[0], pair[1]));
        }
        String parsed = new Parser().parse(wordList);
        System.out.println(parsed);
    }
}
package com.sample;
/**
 * Separator symbols used when parsing rule expressions.
 */
public enum CommonEnum {
    // Arrow separator between a rule's head and its body.
    ARRAY_SPLIT_SYMBOL("->"),
    // Space separator between the tokens of a rule body.
    SPACE_SPLIT_SYMBOL(" ");

    // Enum constants are shared singletons, so their state must not be reassignable.
    private final String value;

    CommonEnum(String value) {
        this.value = value;
    }

    /**
     * @return the separator text of this constant
     */
    public String getValue() {
        return value;
    }
}
package com.sample;
/**
 * Path entity: an immutable back-pointer describing how a span was split and which child
 * symbols were chosen.
 */
class PathEntity {
    // Position in the word list where the span is split.
    private final int curPosition;
    // Id of the rule's left child symbol.
    private final int leftConstituentId;
    // Id of the rule's right child symbol.
    private final int rightConstituentId;

    /**
     * @param curPosition        split position in the word list
     * @param leftConstituentId  id of the left child symbol
     * @param rightConstituentId id of the right child symbol
     */
    PathEntity(int curPosition, int leftConstituentId, int rightConstituentId) {
        this.curPosition = curPosition;
        this.leftConstituentId = leftConstituentId;
        this.rightConstituentId = rightConstituentId;
    }

    int getCurPosition() {
        return curPosition;
    }

    int getLeftConstituentId() {
        return leftConstituentId;
    }

    int getRightConstituentId() {
        return rightConstituentId;
    }
}
package com.sample;
/**
 * Rule entity: the immutable body of one grammar rule.
 * Rule shape: rule symbol --> leftConstituent rightConstituent
 */
class RuleEntity {
    // Left child symbol of the rule.
    private final String leftConstituent;
    // Right child symbol of the rule; null when the rule has a single child.
    private final String rightConstituent;
    // Statistical probability of the rule.
    private final double probability;

    /**
     * @param leftConstituent  left child symbol
     * @param rightConstituent right child symbol, or null for a single-child rule
     * @param probability      probability of the rule
     */
    RuleEntity(String leftConstituent, String rightConstituent, double probability) {
        this.leftConstituent = leftConstituent;
        this.rightConstituent = rightConstituent;
        this.probability = probability;
    }

    String getLeftConstituent() {
        return this.leftConstituent;
    }

    String getRightConstituent() {
        return this.rightConstituent;
    }

    double getProbability() {
        return this.probability;
    }

    /**
     * @return true when the rule has a right child symbol
     */
    boolean hasRightConstituent() {
        return this.rightConstituent != null;
    }
}
package com.sample;
/**
 * Word entity: an immutable pair of a word and its attribute tag.
 */
class WordEntity {
    // The word's text.
    private final String word;
    // The word's attribute tag.
    private final String attr;

    /**
     * @param word the word's text
     * @param attr the word's attribute tag
     */
    WordEntity(String word, String attr) {
        this.word = word;
        this.attr = attr;
    }

    String getWord() {
        return this.word;
    }

    String getAttr() {
        return this.attr;
    }
}