命名实体识别实践(词典匹配)

任务场景:

实体识别任务中,如果有一份可靠的词典,并且词典中的词条和普通文本之间差异比较大,其实可以用词典匹配的方式进行实体识别。

本文实现了一种基于词典匹配的实体识别方式,采用正向最大匹配+检索树+树尾标签列表来实现。也就是说,它支持单个实体对应多个标签的情形。



    /**
     * Demo entry point: loads the dictionary from a hard-coded path, runs one
     * lookup and prints the matches together with the elapsed lookup time.
     *
     * @param args unused
     * @throws Exception if the dictionary file cannot be read
     */
    public static void main(String[] args) throws Exception{
        DictWithMultiLable tree = new DictWithMultiLable();
        // Build the lookup tree from the dictionary file (labels such as song, crosstalk, ...).
        tree.initSearchParm("F:\\dict.txt");

        // Time only the lookup itself, not the printing of the result.
        final long startMillis = System.currentTimeMillis();
        final Map<String, Set<String>> matches = tree.dicMatch("沙漠骆驼");
        final long elapsedMillis = System.currentTimeMillis() - startMillis;

        System.out.println(matches);
        System.out.println("search cose time : " + elapsedMillis);
        // Disabled in the original: gene_line_entity_pair(tree);
    }

    static TreeNode head = new TreeNode(); // root node of the lookup tree
    static int maxWordLen = 0; // length of the longest dictionary word seen so far
    static Set<String> allWord = new HashSet<String>();  // cache of every word passed to getOneWordList
    static TreeNode endNode = new TreeNode("<eos>"); // end-of-word marker node

    //词典进行匹配
    /**
     * Scans {@code line} from left to right and collects every dictionary entry found,
     * trying the longest candidate first at each position (forward maximum matching).
     *
     * <p>At each position the candidate window is at most {@code maxWordLen} characters.
     * When a candidate matches, the scan resumes right after it; otherwise it advances
     * by one character.
     *
     * @param line text to scan; {@code null} is treated as empty
     * @return map from matched text to its label set; empty when nothing matches.
     *         A text matched several times appears once.
     */
    public static Map<String, Set<String>> dicMatch(String line){
        Map<String, Set<String>> res_dic = new HashMap<String, Set<String>>();
        if(line == null){
            return res_dic;
        }
        int lineLen = line.length();
        int index = 0;
        while(index < lineLen){
            int end = Math.min(lineLen, index + maxWordLen);
            boolean find = false;
            // Stop at length 1: probing the empty string is wasted work, fails on an
            // unloaded dictionary, and a zero-length match would never advance index.
            for(int i = end - index; i >= 1; i--){
                String tw = line.substring(index, index + i);
                Set<String> labels = isTreeContain(tw);
                if(labels != null && labels.size() > 0){
                    res_dic.put(tw, labels);
                    index += i;
                    find = true;
                    break;
                }
            }
            if(!find){
                index++;
            }
        }
        return res_dic;
    }

    //是否包含词典
    /**
     * Looks up {@code word} in the lookup tree rooted at {@code head}.
     *
     * <p>The word is walked one character at a time; it counts as a dictionary entry
     * only if the node reached after the last character has an end-marker child.
     *
     * @param word candidate text
     * @return the label set stored on the end-marker node, or {@code null} when the
     *         word is not a complete dictionary entry
     */
    private static Set<String> isTreeContain(String word){
        TreeNode curr = head;
        for(int i = 0; i < word.length(); i++){
            List<TreeNode> child = curr.getChild();
            if(child == null){
                return null;
            }
            // One indexOf scan instead of contains() followed by indexOf().
            int pos = child.indexOf(new TreeNode(word.charAt(i) + ""));
            if(pos < 0){
                return null;
            }
            curr = child.get(pos);
        }
        // The child list is null when the tree is empty; guard before looking for the marker.
        List<TreeNode> tail = curr.getChild();
        if(tail == null){
            return null;
        }
        int endPos = tail.indexOf(endNode);
        return endPos < 0 ? null : tail.get(endPos).getLableList();
    }


    //词典构建和初始化
    /**
     * Reads the dictionary file, splits it into lines and builds the lookup tree,
     * printing the line count and the build time.
     *
     * @param filePath path of the dictionary file
     * @throws Exception if the file cannot be read
     */
    public static void initSearchParm(String filePath) throws Exception{
        String[] words = readAllFile(filePath).split("\n");
        System.out.println("file line count :" + words.length);

        final long buildStart = System.currentTimeMillis();
        buildTree(words);
        final long buildMillis = System.currentTimeMillis() - buildStart;
        System.out.println("build dic time : " + buildMillis);
    }

    /**
     * Reads an entire file into memory and decodes it as UTF-8.
     *
     * @param filename path of the file to read
     * @return the decoded file content
     * @throws IllegalArgumentException if the file is too large for a single byte array
     * @throws Exception if the file cannot be opened or read
     */
    static String readAllFile(String filename) throws Exception {
        File file = new File(filename);
        long len = file.length();
        if(len > Integer.MAX_VALUE){
            // The old Long.intValue() silently truncated the size here.
            throw new IllegalArgumentException("file too large to load into memory: " + filename);
        }
        byte[] filecontent = new byte[(int) len];
        FileInputStream in = new FileInputStream(file);
        int filled = 0;
        try{
            // A single read() may return fewer bytes than requested; loop until full or EOF.
            while(filled < filecontent.length){
                int n = in.read(filecontent, filled, filecontent.length - filled);
                if(n < 0){
                    break;
                }
                filled += n;
            }
        }finally{
            // Close even when read() throws, so the file handle is not leaked.
            in.close();
        }
        // Decode only the bytes actually read, in case the file shrank after length().
        return new String(filecontent, 0, filled, "utf-8");
    }

    /**
     * Inserts every dictionary line into the lookup tree rooted at {@code head}.
     *
     * <p>Each line has the form {@code name<TAB>label}. The name is stored one character
     * per tree level, followed by an end-marker node that carries the label set. Inserting
     * the same name with another label adds to that set.
     *
     * @param words dictionary lines; blank lines are skipped
     * @throws IllegalArgumentException if a non-blank line has no tab-separated label
     */
    static void buildTree(String[] words){
        for(String s : words){
            s = s.trim();
            if(s.length() == 0){
                // Blank lines used to fail with ArrayIndexOutOfBoundsException.
                continue;
            }
            String[] fields = s.split("\t");
            if(fields.length < 2){
                throw new IllegalArgumentException(
                        "dictionary line must be \"name<TAB>label\": " + s);
            }
            String name = fields[0];
            String lable = fields[1];

            TreeNode curr = head;
            for(String w : getOneWordList(name)){
                TreeNode node = new TreeNode(w);
                List<TreeNode> child = curr.getChild();
                if(child == null){
                    // First child of this node: create the list, then attach it.
                    child = new ArrayList<TreeNode>();
                    child.add(node);
                    curr.setChild(child);
                    curr = node;
                }else{
                    int pos = child.indexOf(node);
                    if(pos < 0){
                        child.add(node);
                        curr = node;
                    }else{
                        curr = child.get(pos);
                    }
                }
                if(node.equals(endNode)){
                    // curr is now the end-marker node stored in the tree; add this label to it.
                    Set<String> lableSet = curr.getLableList();
                    if(lableSet == null){
                        lableSet = new HashSet<String>();
                        curr.setLableList(lableSet);
                    }
                    lableSet.add(lable);
                }
            }
        }
    }

    /**
     * Splits a dictionary word into one-character tokens followed by the
     * {@code "<eos>"} end marker.
     *
     * <p>Side effects: adds {@code line} to {@code allWord} and passes the trimmed
     * length to {@code updataMaxLen}.
     *
     * @param line dictionary word; surrounding whitespace is ignored when tokenizing
     * @return the characters of the trimmed word, in order, plus {@code "<eos>"}
     */
    static List<String> getOneWordList(String line){
        allWord.add(line);
        // Take length and characters from the same trimmed string. Mixing the trimmed
        // length with the untrimmed text shifted the word when it had leading whitespace.
        String word = line.trim();
        int wordlen = word.length();
        updataMaxLen(wordlen);
        List<String> res = new ArrayList<String>(wordlen + 1);
        for(int j = 0; j < wordlen; j++){
            res.add(String.valueOf(word.charAt(j)));
        }
        res.add("<eos>");
        return res;
    }

    /**
     * Raises {@code maxWordLen} to {@code len} when {@code len} is larger.
     *
     * @param len length of a newly seen word
     */
    static void updataMaxLen(int len){
        maxWordLen = Math.max(maxWordLen, len);
    }

    /**
     * Returns the current value of {@code maxWordLen}.
     *
     * @return the largest word length recorded so far
     */
    static int getMaxWordLen(){
        return maxWordLen;
    }




  • 1
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

花咪

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值