


1. 基础分词方法

1.1 基于规则的分词


package cn.juwatech.example;

import java.util.Arrays;
import java.util.List;

/**
 * Minimal dictionary-driven tokenizer demo.
 *
 * <p>Splits a fixed sample sentence on single spaces and reports, for each piece, whether it
 * is contained in {@link #DICTIONARY}.
 */
public class RuleBasedTokenizer {
    /** Words that are reported as known tokens; anything else is reported as unknown. */
    private static final List<String> DICTIONARY = Arrays.asList("hello", "world", "java", "tokenizer");

    /**
     * Tokenizes the sample sentence and prints one line per token to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String text = "hello world java tokenizer";
        // A single space is the only delimiter recognized by this demo.
        String[] tokens = text.split(" ");

        for (String token : tokens) {
            if (DICTIONARY.contains(token)) {
                System.out.println("Token: " + token);
            } else {
                System.out.println("Unknown token: " + token);
            }
        }
    }
}
1.2 基于正则表达式的分词


package cn.juwatech.example;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Regular-expression tokenizer demo.
 *
 * <p>Extracts every run of word characters ({@code \w+}) from a fixed sample sentence and
 * prints each match on its own line.
 */
public class RegexTokenizer {
    /** Matches one token: a maximal run of word characters. Compiled once and reused. */
    private static final Pattern WORD = Pattern.compile("\\w+");

    /**
     * Prints every token found in the sample sentence to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String text = "This is a simple example.";
        Matcher matcher = WORD.matcher(text);

        // find() advances to the next match, so punctuation and spaces are skipped.
        while (matcher.find()) {
            System.out.println("Token: " + matcher.group());
        }
    }
}
2. 基于统计的分词方法

2.1 N-gram模型


package cn.juwatech.example;

import java.util.HashMap;
import java.util.Map;

/**
 * N-gram counting demo.
 *
 * <p>Splits a fixed sample sentence on single spaces, builds every window of {@code n}
 * consecutive tokens, and prints how often each window occurs.
 */
public class NGramTokenizer {
    /**
     * Counts the n-grams of the sample sentence and prints them to standard output.
     * The counts are kept in a {@link HashMap}, so the print order is unspecified.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String text = "this is a test";
        int n = 2; // bigram

        Map<String, Integer> ngrams = new HashMap<>();
        String[] tokens = text.split(" ");

        // The last valid window starts at tokens.length - n; with fewer than n tokens the
        // loop body never runs.
        for (int i = 0; i < tokens.length - n + 1; i++) {
            StringBuilder sb = new StringBuilder();
            for (int j = 0; j < n; j++) {
                sb.append(tokens[i + j]).append(" ");
            }
            // trim() drops the separator appended after the final token of the window.
            String ngram = sb.toString().trim();
            ngrams.merge(ngram, 1, Integer::sum);
        }

        for (Map.Entry<String, Integer> entry : ngrams.entrySet()) {
            System.out.println("N-gram: " + entry.getKey() + ", Count: " + entry.getValue());
        }
    }
}
2.2 隐马尔可夫模型(HMM)


package cn.juwatech.example;

import java.util.HashMap;
import java.util.Map;

/**
 * Skeleton of an HMM-style tokenizer demo.
 *
 * <p>Holds two hard-coded example probability tables and prints, for every character of a
 * fixed sample sentence, the emission probability stored for that character (or
 * {@code 0.0} when none is stored). No decoding is performed.
 */
public class SimpleHMMTokenizer {
    /** Example transition probabilities keyed as {@code "<from>-<to>"}; not read by {@link #main}. */
    private static final Map<String, Double> transitionProbabilities = new HashMap<>();
    /** Example emission probabilities keyed by the emitted string. */
    private static final Map<String, Double> emissionProbabilities = new HashMap<>();

    static {
        // Initialize with some example probabilities
        transitionProbabilities.put("B-B", 0.4);
        transitionProbabilities.put("B-I", 0.6);
        transitionProbabilities.put("I-B", 0.3);
        transitionProbabilities.put("I-I", 0.7);
        emissionProbabilities.put("我", 0.5);
        emissionProbabilities.put("喜欢", 0.5);
        emissionProbabilities.put("学习", 0.5);
    }

    /**
     * Prints the stored emission probability for each character of the sample sentence.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String text = "我喜欢学习";
        // Example of processing text with probabilities
        // In real application, you'd use Viterbi algorithm or similar

        // NOTE(review): the lookup key is a single character, so the two-character entries
        // in emissionProbabilities never match and those characters print 0.0.
        for (char c : text.toCharArray()) {
            System.out.println("Character: " + c + ", Probability: " + emissionProbabilities.getOrDefault(String.valueOf(c), 0.0));
        }
    }
}
3. 基于深度学习的分词方法

3.1 词嵌入(Word Embeddings)


package cn.juwatech.example;

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.word2vec.Word2Vec.Builder;
import org.deeplearning4j.models.word2vec.Word2Vec.Builder;

public class Word2VecTokenizer {
    public static void main(String[] args) {
        // Initialize Word2Vec model (example code, in practice, you would load a pre-trained model)
        Word2Vec vec = new Word2Vec.Builder().build();

        // Example of using the model
        String word = "example";
        double[] vector = vec.getWordVector(word);

        System.out.println("Vector for '" + word + "':");
        for (double v : vector) {
            System.out.print(v + " ");
4. 实际应用中的分词策略

4.1 中文分词


# Segment a Chinese sentence into words with the jieba library.
import jieba

sentence = "我喜欢学习自然语言处理"
# jieba.cut's result is materialized into a list before it is printed.
segments = list(jieba.cut(sentence))
print("Tokens:", segments)
4.2 分词在实际项目中的应用
