搜索引擎之分词器学习

分词器实现代码:
package com.zd.tokenizer;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Dictionary-based tokenizer that segments text by forward maximum matching.
 *
 * <p>The dictionary is a plain text file with one word per line. The words are
 * loaded into a character trie. Segmentation scans the text from left to right
 * and, at each position, emits the longest dictionary word that starts there;
 * if no word starts there, the single character at that position is emitted as
 * its own token.
 *
 * <p>The trie is built once in the constructor and {@link #participie(String)}
 * only reads it.
 */
public class Tokenizer {

    /** One trie node: outgoing edges keyed by character, plus an end-of-word flag. */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        /** True when the path from the root to this node spells a dictionary word. */
        private boolean wordEnd;
    }

    /** Root of the dictionary trie; it corresponds to the empty prefix. */
    private final TrieNode root = new TrieNode();

    /**
     * Creates a tokenizer backed by the dictionary stored at the given path.
     *
     * @param dictionaryFilePath path of a UTF-8 text file with one word per line
     * @throws IOException if the file cannot be opened or read
     */
    public Tokenizer(String dictionaryFilePath) throws IOException {
        this.loadDictionary(dictionaryFilePath);
    }

    /**
     * Reads the dictionary file and inserts every non-blank line into the trie.
     *
     * <p>Lines are trimmed and blank lines are skipped. The file is decoded as
     * UTF-8 so the result does not depend on the platform default charset, and
     * the reader is closed even when reading fails.
     *
     * @param dictionaryFilePath path of the dictionary file
     * @throws IOException if the file cannot be opened or read
     */
    private void loadDictionary(String dictionaryFilePath) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(dictionaryFilePath), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (word.isEmpty()) {
                    continue;
                }
                // Walk down the trie, creating the missing nodes for this word.
                TrieNode node = this.root;
                for (int i = 0; i < word.length(); i++) {
                    char c = word.charAt(i);
                    TrieNode next = node.children.get(c);
                    if (next == null) {
                        next = new TrieNode();
                        node.children.put(c, next);
                    }
                    node = next;
                }
                // A flag rather than a sentinel key, so a word may itself contain
                // a space and a shorter word never clobbers a longer one's subtree.
                node.wordEnd = true;
            }
        }
    }

    /**
     * Splits the text into tokens using forward maximum matching.
     *
     * <p>The text is trimmed first. At each position the longest dictionary word
     * starting there becomes a token (for example, with both "张三" and "张三丰"
     * in the dictionary, "张三丰" wins when the text allows it). When no
     * dictionary word starts at the position, the single character (one whole
     * Unicode code point) is emitted as a token.
     *
     * @param text the text to segment; may be {@code null}
     * @return the tokens in text order, or {@code null} when {@code text} is
     *         {@code null} or blank after trimming
     */
    public List<String> participie(String text) {
        if (text == null) {
            return null;
        }
        text = text.trim();
        if (text.isEmpty()) {
            return null;
        }
        List<String> tokens = new ArrayList<>();
        int start = 0;
        while (start < text.length()) {
            int end = longestMatchEnd(text, start);
            if (end == start) {
                // No dictionary word starts here: emit one code point so a
                // surrogate pair is never split into two broken tokens.
                end = start + Character.charCount(text.codePointAt(start));
            }
            tokens.add(text.substring(start, end));
            start = end;
        }
        return tokens;
    }

    /**
     * Finds the end (exclusive) of the longest dictionary word starting at {@code start}.
     *
     * <p>The walk continues past shorter words looking for a longer one, but
     * remembers the last position at which a complete word ended, so a failed
     * attempt at a longer word falls back to the longest word actually seen.
     *
     * @param text  the text being segmented
     * @param start index at which the match must begin
     * @return the exclusive end index of the longest match, or {@code start} if
     *         no dictionary word begins at {@code start}
     */
    private int longestMatchEnd(String text, int start) {
        TrieNode node = this.root;
        int bestEnd = start;
        for (int j = start; j < text.length(); j++) {
            node = node.children.get(text.charAt(j));
            if (node == null) {
                break;
            }
            if (node.wordEnd) {
                bestEnd = j + 1;
            }
        }
        return bestEnd;
    }

    /**
     * Demo entry point: loads {@code /dictionary.txt} from the classpath,
     * segments a sample sentence and prints one token per line.
     *
     * @param args ignored
     * @throws IOException if the dictionary cannot be read
     */
    public static void main(String[] args) throws IOException {
        Tokenizer tk = new Tokenizer(Tokenizer.class.getResource("/dictionary.txt").getPath());
        List<String> tokens = tk.participie("想过离开,以这种方式存在,是因为那些旁白那些姿态那些伤害");
        for (String s : tokens) {
            System.out.println(s);
        }
    }

}
在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值