字符串共同前缀分析

lair-sun

于 2023-03-27 11:27:50 发布

阅读量270

点赞数

文章标签： java 深度优先 开发语言

本文链接：https://blog.csdn.net/Redolently/article/details/129791524

版权

现在有 N 个字符串，想分析它们的前缀是否存在共同之处。举个例子：有 80% 的字符串前缀是 msg，有 20% 的字符串前缀是 ext，等等。

1.设计思路

使用字典树（Trie）来解决这个问题。将所有的字符串插入到字典树中，每个节点代表一个前缀。当一个字符串插入到字典树中时，可以将它的所有前缀都插入到字典树中。

在字典树中，可以使用一个计数器来记录每个前缀的出现次数。当所有的字符串都被插入到字典树中后，可以遍历字典树并记录所有出现次数超过一定阈值的前缀。这些前缀就是所有字符串共同的前缀。

2.代码实现(Java)

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * A single node of the prefix tree.
 *
 * <p>The path from the root to a node spells out one prefix; the node stores
 * bookkeeping for that prefix.
 */
class TrieNode {
    /** Child nodes, keyed by the character that extends this node's prefix. */
    Map<Character, TrieNode> children;
    /** {@code true} when at least one inserted string ends exactly at this node. */
    boolean isWord;
    /** Number of inserted strings whose path passes through (or ends at) this node. */
    int count;

    public TrieNode() {
        children = new HashMap<>();
        isWord = false;
        count = 0;
    }
}

/**
 * Prefix tree that counts, for every prefix, how many inserted strings start with it.
 */
class Trie {
    /** Root node; it represents the empty prefix and its {@code count} is never incremented. */
    TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    /**
     * Inserts {@code word} and increments the counter of every node on its path,
     * i.e. of every non-empty prefix of {@code word}.
     *
     * <p>Insert each string exactly once: inserting the prefixes of a string
     * separately would count the shorter prefixes several times.
     *
     * @param word the string to insert; an empty string only marks the root as a word
     */
    public void insert(String word) {
        TrieNode node = root;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            TrieNode next = node.children.get(c);
            if (next == null) {
                next = new TrieNode();
                node.children.put(c, next);
            }
            next.count++;
            node = next;
        }
        node.isWord = true;
    }

    /**
     * Returns every non-empty prefix shared by at least {@code threshold} inserted strings.
     *
     * <p>A prefix is reported whether or not some inserted string ends exactly there.
     * Within one branch, shorter prefixes come before longer ones; the order between
     * sibling branches follows the iteration order of the children map.
     *
     * @param threshold minimum number of strings that must start with a prefix
     * @return the qualifying prefixes, possibly empty
     */
    public List<String> findCommonPrefixes(int threshold) {
        List<String> result = new ArrayList<>();
        dfs(root, new StringBuilder(), threshold, result);
        return result;
    }

    /**
     * Visits the children of {@code node}, collecting every child prefix that reaches
     * the threshold. The threshold is deliberately never applied to {@code node} itself,
     * so the search can start at the root (whose count stays 0).
     */
    private void dfs(TrieNode node, StringBuilder prefix, int threshold, List<String> result) {
        for (Map.Entry<Character, TrieNode> entry : node.children.entrySet()) {
            TrieNode child = entry.getValue();
            // Counts never grow along a path, so a failing child rules out its whole subtree.
            if (child.count < threshold) {
                continue;
            }
            prefix.append(entry.getKey().charValue());
            result.add(prefix.toString());
            dfs(child, prefix, threshold, result);
            prefix.setLength(prefix.length() - 1); // backtrack before trying the next sibling
        }
    }
}

/**
 * Finds the prefixes that a given share of the input strings have in common.
 */
public class PrefixAnalyzer {
    /**
     * Tolerance used when rounding the threshold up, so that floating-point noise
     * (e.g. {@code 100 * 0.07 == 7.000000000000001}) does not raise it by one.
     */
    private static final double ROUNDING_TOLERANCE = 1e-9;

    /**
     * Returns every non-empty prefix shared by at least {@code thresholdRatio} of the strings.
     *
     * @param strings        the strings to analyze
     * @param thresholdRatio required share of strings, e.g. {@code 0.5} for 50%
     * @return the qualifying prefixes; empty when there are none or {@code strings} is empty
     */
    public static List<String> analyzePrefixes(String[] strings, double thresholdRatio) {
        Trie trie = new Trie();
        for (String s : strings) {
            // One insert per string: Trie.insert already counts every prefix on the path.
            trie.insert(s);
        }
        int required = (int) Math.ceil(strings.length * thresholdRatio - ROUNDING_TOLERANCE);
        // A prefix must occur at least once to be meaningful.
        int threshold = Math.max(1, required);
        return trie.findCommonPrefixes(threshold);
    }

    public static void main(String[] args) {
        String[] strings = {"msg1", "msg2", "msg3", "msg4", "msg5", "ext1", "ext2", "ext3"};
        double thresholdRatio = 0.5;
        List<String> commonPrefixes = analyzePrefixes(strings, thresholdRatio);
        System.out.println(commonPrefixes); // output: [m, ms, msg]
    }
}
//TrieNode类代表字典树中的一个节点，它包含子节点、一个布尔变量 isWord（用于标记是否有字符串恰好在该节点结束）和一个整数 count（用于记录这个前缀在所有字符串中出现的次数）
//Trie类代表整个字典树，它包含根节点，并提供了插入和查找共同前缀的方法：先将所有字符串插入到字典树中，再使用 findCommonPrefixes 方法查找所有出现次数达到或超过阈值的共同前缀
//thresholdRatio 是设置的阈值，我们这里将阈值设为总字符串的50%