PCFG Parser的简单实现 -- Java版

该代码是一个句法分析器的简单实现,未经过优化,主要用于帮助理解分析器的基本原理和算法;有了一定基础后,可以进一步参考Stanford Parser的源码(经过了高度优化)。

下面是文件的大概预览

 下面是运行的结果

 下面是各个类的代码

package com.sample;

import java.util.Arrays;
import java.util.List;

/**
 * Probabilistic context-free grammar parser.
 *
 * <p>Finds the most probable derivation of a tagged word list by memoised, top-down
 * recursion over spans. Every grammar rule is assumed to have at most two symbols on its
 * right-hand side (binary rules and unary rules).
 *
 * <p>NOTE(review): unary rules recurse on the same span, so a grammar containing a cycle of
 * unary rules (A-&gt;B, B-&gt;A) would recurse without bound. The rule set built into
 * {@code RuleUtil} has no such cycle.
 */
class Parser {
    /** Marker stored in {@link #scoreMatrix} for cells that have not been computed yet. */
    private static final double NOT_COMPUTED = -1;
    /** Id of the root symbol; "S" is the first symbol registered, so it receives id 0. */
    private static final int ROOT_SYMBOL_ID = 0;
    /** Id meaning "no symbol" (unknown word attribute, or absent right constituent). */
    private static final int NO_SYMBOL = -1;

    // Best score per span and symbol: scoreMatrix[start][end][symbolIndex].
    // start/end are inclusive word indexes, symbolIndex is the grammar symbol id.
    private double[][][] scoreMatrix;
    // Back-pointers for the best derivation: pathEntityMatrices[start][end][symbolIndex].
    private PathEntity[][][] pathEntityMatrices;

    /**
     * Parses a tagged sentence and renders the best tree together with its probability.
     *
     * @param wordList words with their attributes (part-of-speech tags)
     * @return the bracketed tree followed by {@code " prob : <probability>"}; for an empty
     *         word list the tree part is empty and the probability is 0.0
     */
    String parse(List<WordEntity> wordList) {
        // An empty sentence has no spans at all; indexing [0][-1] below would throw.
        if (wordList.isEmpty()) {
            return " prob : " + 0.0;
        }
        initializeMatrix(wordList);
        int lastIndex = wordList.size() - 1;
        double prob = analyze(0, lastIndex, ROOT_SYMBOL_ID);
        String tree = getBestPath(0, lastIndex, ROOT_SYMBOL_ID, wordList);
        return tree + " prob : " + prob;
    }

    /**
     * Allocates the score and back-pointer tables and seeds the single-word cells.
     *
     * @param wordList words with their attributes
     */
    private void initializeMatrix(List<WordEntity> wordList) {
        RuleUtil rules = RuleUtil.getInstance();
        int wordsCount = wordList.size();
        int symbolsCount = rules.getRule2IdMap().size();

        // Mark every cell as "not computed"; computed scores are always >= 0.
        this.scoreMatrix = new double[wordsCount][wordsCount][symbolsCount];
        for (double[][] row : this.scoreMatrix) {
            for (double[] cell : row) {
                Arrays.fill(cell, NOT_COMPUTED);
            }
        }

        // Seed each word's own attribute symbol on the diagonal.
        for (int i = 0; i < wordsCount; i++) {
            int symbolId = rules.getRule2IdMap().getOrDefault(wordList.get(i).getAttr(), NO_SYMBOL);
            // Ids start at 0, so 0 is a valid id; only the NO_SYMBOL default is skipped.
            if (symbolId >= 0) {
                this.scoreMatrix[i][i][symbolId] = getSymbolProb();
            }
        }

        this.pathEntityMatrices = new PathEntity[wordsCount][wordsCount][symbolsCount];
    }

    /**
     * Computes the best probability with which a symbol derives the span [start, end].
     *
     * <p>Results are memoised in {@link #scoreMatrix}; the winning rule and split point are
     * recorded in {@link #pathEntityMatrices}.
     *
     * @param start       first word index of the span (inclusive)
     * @param end         last word index of the span (inclusive)
     * @param symbolIndex id of the grammar symbol to derive
     * @return the highest derivation probability, or 0.0 when no derivation exists
     */
    private double analyze(int start, int end, int symbolIndex) {
        if (this.scoreMatrix[start][end][symbolIndex] >= 0) {
            return this.scoreMatrix[start][end][symbolIndex];
        }

        RuleUtil rules = RuleUtil.getInstance();
        String ruleSymbol = rules.getId2RuleMap().get(symbolIndex);
        List<RuleEntity> ruleList = rules.getRulesRepository().get(ruleSymbol);

        PathEntity bestPath = new PathEntity(-1, NO_SYMBOL, NO_SYMBOL);
        double maxProb = 0.0;
        if (ruleList != null) {
            for (RuleEntity entity : ruleList) {
                int leftId = rules.getRule2IdMap().get(entity.getLeftConstituent());
                if (entity.hasRightConstituent()) {
                    // Binary rule: try every split; left child covers [start, cut],
                    // right child covers [cut + 1, end].
                    int rightId = rules.getRule2IdMap().get(entity.getRightConstituent());
                    for (int cutPosition = start; cutPosition < end; cutPosition++) {
                        double curProb = entity.getProbability()
                                * analyze(start, cutPosition, leftId)
                                * analyze(cutPosition + 1, end, rightId);
                        if (curProb > maxProb) {
                            maxProb = curProb;
                            bestPath = new PathEntity(cutPosition, leftId, rightId);
                        }
                    }
                } else {
                    // Unary rule: the single child must cover the whole span.
                    double curProb = entity.getProbability() * analyze(start, end, leftId);
                    if (curProb > maxProb) {
                        maxProb = curProb;
                        bestPath = new PathEntity(end, leftId, NO_SYMBOL);
                    }
                }
            }
        }

        this.scoreMatrix[start][end][symbolIndex] = maxProb;
        this.pathEntityMatrices[start][end][symbolIndex] = bestPath;
        return maxProb;
    }

    /**
     * Renders the best derivation recorded by {@link #analyze} as a bracketed string.
     *
     * <p>A single-word span is rendered directly as {@code (SYMBOL word)}; any unary chain
     * below that symbol is not expanded.
     *
     * @param start       first word index of the span (inclusive)
     * @param end         last word index of the span (inclusive)
     * @param symbolIndex id of the symbol heading this subtree, or -1 for "no subtree"
     * @param wordList    the words being parsed
     * @return the bracketed subtree, or an empty string when there is no subtree
     */
    private String getBestPath(int start, int end, int symbolIndex, List<WordEntity> wordList) {
        if (symbolIndex == NO_SYMBOL || start > end) {
            return "";
        }
        StringBuilder result = new StringBuilder();
        result.append("(").append(RuleUtil.getInstance().getId2RuleMap().get(symbolIndex)).append(" ");
        if (start == end) {
            result.append(wordList.get(start).getWord()).append(")");
            return result.toString();
        }
        PathEntity pathEntity = this.pathEntityMatrices[start][end][symbolIndex];
        int cut = pathEntity.getCurPosition();
        result.append(getBestPath(start, cut, pathEntity.getLeftConstituentId(), wordList));
        result.append(getBestPath(cut + 1, end, pathEntity.getRightConstituentId(), wordList));
        result.append(")");
        return result.toString();
    }

    /**
     * Probability of a word carrying its attribute.
     *
     * <p>Every tag is currently treated as certain, so this always returns 1.0; a lexicon
     * probability could be plugged in here to help disambiguation.
     *
     * @return the attribute probability (always 1.0)
     */
    private double getSymbolProb() {
        return 1.0;
    }
}
package com.sample;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Grammar store for the parser.
 *
 * <p>Holds the rules grouped by their left-hand symbol plus a two-way mapping between
 * symbol names and integer ids. Ids are handed out sequentially from 0 in the order in
 * which symbols are first seen while reading the rules.
 */
class RuleUtil {
    // Created once during class initialisation, so the accessor needs no locking.
    private static final RuleUtil INSTANCE = new RuleUtil();

    /** All rules, keyed by the rule's left-hand symbol. */
    private final Map<String, List<RuleEntity>> rulesRepository = new HashMap<>();
    /** Symbol name to id. */
    private final Map<String, Integer> rule2IdMap = new HashMap<>();
    /** Id to symbol name. */
    private final Map<Integer, String> id2RuleMap = new HashMap<>();

    private RuleUtil() {
        initializeRuleData();
    }

    /**
     * @return the shared grammar instance
     */
    static RuleUtil getInstance() {
        return INSTANCE;
    }

    /**
     * Parses every rule expression and fills the repository and both id maps.
     *
     * <p>A rule expression looks like {@code "LHS->LEFT RIGHT PROB"} or
     * {@code "LHS->LEFT PROB"}. Expressions that do not match either shape are skipped.
     */
    private void initializeRuleData() {
        String arrow = CommonEnum.ARRAY_SPLIT_SYMBOL.getValue();
        String space = CommonEnum.SPACE_SPLIT_SYMBOL.getValue();
        for (String rule : getAllRule()) {
            String[] ruleSeg = rule.split(arrow);
            if (ruleSeg.length < 2) {
                // No right-hand side at all: skip it like any other malformed rule.
                continue;
            }
            String[] parts = ruleSeg[1].split(space);
            RuleEntity ruleEntity;
            if (parts.length == 3) {
                ruleEntity = new RuleEntity(parts[0], parts[1], Double.parseDouble(parts[2]));
            } else if (parts.length == 2) {
                ruleEntity = new RuleEntity(parts[0], null, Double.parseDouble(parts[1]));
            } else {
                continue;
            }

            String ruleSymbol = ruleSeg[0];
            this.rulesRepository.computeIfAbsent(ruleSymbol, key -> new ArrayList<>()).add(ruleEntity);

            // Registration order (LHS, left, right) determines the ids.
            registerSymbol(ruleSymbol);
            registerSymbol(ruleEntity.getLeftConstituent());
            if (ruleEntity.getRightConstituent() != null) {
                registerSymbol(ruleEntity.getRightConstituent());
            }
        }
    }

    /** Assigns the next free id to {@code symbol} unless it already has one. */
    private void registerSymbol(String symbol) {
        if (!this.rule2IdMap.containsKey(symbol)) {
            // Ids are dense and start at 0, so the next free id equals the current size.
            int id = this.rule2IdMap.size();
            this.rule2IdMap.put(symbol, id);
            this.id2RuleMap.put(id, symbol);
        }
    }

    /**
     * Returns the rule expressions of the grammar.
     *
     * <p>They are hard-coded here for simplicity; a real system would load them from a
     * file or a database.
     *
     * <p>NOTE(review): the four {@code S} rules sum to 1.1 rather than 1.0 — confirm the
     * intended weights.
     *
     * @return all rule expressions
     */
    private List<String> getAllRule() {
        List<String> rules = new ArrayList<>();
        rules.add("S->NP VP 0.7");
        rules.add("S->VP 0.2");
        rules.add("S->NP 0.1");
        rules.add("S->VC 0.1");
        rules.add("NP->noun 0.3");
        rules.add("NP->adj noun 0.2");
        rules.add("NP->DJ 0.2");
        rules.add("NP->DJ NP 0.3");
        rules.add("DJ->VP de 0.4");
        rules.add("DJ->NP de 0.6");
        rules.add("VP->VC NP 1.0");
        rules.add("VC->vt adj 0.3");
        rules.add("VC->VC utl 0.5");
        rules.add("VC->vt 0.2");
        return rules;
    }

    /**
     * @return rules grouped by left-hand symbol
     */
    Map<String, List<RuleEntity>> getRulesRepository() {
        return rulesRepository;
    }

    /**
     * @return mapping from symbol name to id
     */
    Map<String, Integer> getRule2IdMap() {
        return rule2IdMap;
    }

    /**
     * @return mapping from id to symbol name
     */
    Map<Integer, String> getId2RuleMap() {
        return id2RuleMap;
    }
}
package com.sample;

import java.util.ArrayList;
import java.util.List;

/**
 * Demo entry point: parses one hand-tagged sentence and prints the result.
 */
public class Test {
    public static void main(String[] args) {
        // Each entry is {word, attribute}.
        String[][] tokens = {
            {"咬", "vt"},
            {"死", "adj"},
            {"了", "utl"},
            {"猎人", "noun"},
            {"的", "de"},
            {"狗", "noun"},
        };
        List<WordEntity> sentence = new ArrayList<>();
        for (String[] token : tokens) {
            sentence.add(new WordEntity(token[0], token[1]));
        }
        String parsed = new Parser().parse(sentence);
        System.out.println(parsed);
    }
}
package com.sample;

/**
 * Separator tokens used when splitting textual rule expressions.
 */
public enum CommonEnum {
    /** Arrow separating a rule's left-hand symbol from its right-hand side. */
    ARRAY_SPLIT_SYMBOL("->"),
    /** Single space separating the tokens of the right-hand side. */
    SPACE_SPLIT_SYMBOL(" ");

    // Enum constants are shared singletons, so their state must not be reassignable.
    private final String value;

    CommonEnum(String value) {
        this.value = value;
    }

    /**
     * @return the separator text of this constant
     */
    public String getValue() {
        return value;
    }
}
package com.sample;

/**
 * Immutable back-pointer describing how a span was derived: where it was split and which
 * child symbols cover the two parts.
 */
class PathEntity {
    /** Last word index covered by the left child (the split position). */
    private final int curPosition;
    /** Id of the left child symbol. */
    private final int leftConstituentId;
    /** Id of the right child symbol. */
    private final int rightConstituentId;

    /**
     * @param curPosition        split position within the word list
     * @param leftConstituentId  id of the left child symbol
     * @param rightConstituentId id of the right child symbol
     */
    PathEntity(int curPosition, int leftConstituentId, int rightConstituentId) {
        this.curPosition = curPosition;
        this.leftConstituentId = leftConstituentId;
        this.rightConstituentId = rightConstituentId;
    }

    /**
     * @return the split position
     */
    int getCurPosition() {
        return curPosition;
    }

    /**
     * @return id of the left child symbol
     */
    int getLeftConstituentId() {
        return leftConstituentId;
    }

    /**
     * @return id of the right child symbol
     */
    int getRightConstituentId() {
        return rightConstituentId;
    }
}
package com.sample;

/**
 * Immutable right-hand side of one grammar rule together with its probability.
 *
 * <p>Rule shape: {@code ruleSymbol -> leftConstituent [rightConstituent]}. The rule symbol
 * itself is not stored here.
 */
class RuleEntity {
    /** First (or only) symbol of the right-hand side. */
    private final String leftConstituent;
    /** Second symbol of the right-hand side; {@code null} when the rule is unary. */
    private final String rightConstituent;
    /** Probability attached to the rule. */
    private final double probability;

    /**
     * @param leftConstituent  first symbol of the right-hand side
     * @param rightConstituent second symbol, or {@code null} for a unary rule
     * @param probability      probability of the rule
     */
    RuleEntity(String leftConstituent, String rightConstituent, double probability) {
        this.leftConstituent = leftConstituent;
        this.rightConstituent = rightConstituent;
        this.probability = probability;
    }

    /**
     * @return the first symbol of the right-hand side
     */
    String getLeftConstituent() {
        return this.leftConstituent;
    }

    /**
     * @return the second symbol of the right-hand side, or {@code null} if there is none
     */
    String getRightConstituent() {
        return this.rightConstituent;
    }

    /**
     * @return the probability of the rule
     */
    double getProbability() {
        return this.probability;
    }

    /**
     * @return {@code true} when the rule has a second right-hand symbol
     */
    boolean hasRightConstituent() {
        return this.rightConstituent != null;
    }
}
package com.sample;

/**
 * Immutable pair of a word and its attribute (the tag the parser matches against
 * grammar symbols).
 */
class WordEntity {
    /** Surface text of the word. */
    private final String word;
    /** Attribute (tag) of the word. */
    private final String attr;

    /**
     * @param word surface text of the word
     * @param attr attribute (tag) of the word
     */
    WordEntity(String word, String attr) {
        this.word = word;
        this.attr = attr;
    }

    /**
     * @return the surface text of the word
     */
    String getWord() {
        return this.word;
    }

    /**
     * @return the attribute (tag) of the word
     */
    String getAttr() {
        return this.attr;
    }
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值