java字符串内容查找工具（连续字符查找）

置顶 IT_CREATE
已于 2023-04-02 13:20:26 修改
阅读量504
点赞数
分类专栏：其他　文章标签：java、查找连续性字符串、查找子字符串、查找子字符串出现次数
于 2023-04-02 13:19:14 首次发布
本文链接：https://blog.csdn.net/IT_CREATE/article/details/129909744
版权
其他专栏收录该内容
14 篇文章 3 订阅
订阅专栏
package com.de.util;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;

/**
 * Utility for finding repeated fixed-length substrings ("windows") in a text.
 *
 * <p>Every count produced by this class is a non-overlapping count taken from left to
 * right: after a match, scanning resumes directly behind it. For example {@code "aa"}
 * occurs twice in {@code "aaaa"} and once in {@code "aaa"}.
 *
 * @author IT_CREATE
 * @date 2023-04-02 12:03:10
 */
public class SearchRepetitionUtil {
    /**
     * Sample text analysed by {@link #main(String[])}.
     */
    public static String testTtr = "oneofthecentralresultsofairesearchinthe1970swasthattoachievegoodper" +
            "formanceaisystemsmusthavelargeamountsofknowledgeknowledgeispowertheslogangoeshumansclearly" +
            "usevastamountsofknowledgeandifaiistoachieveitslongtermgoalsaisystemsmustalsousevastamounts" +
            "sincehandcodinglargeamountsofknowledgeintoasystemisslowtediousanderrorpronemachinelearning" +
            "techniqueshavebeendevelopedtoautomaticallyacquireknowledgeoftenintheformofifthenrulesprodu" +
            "ctionsunfortunatelythishasoftenledtoautilityproblemminton1988bthelearninghascausedanoveral" +
            "lslowdowninthesystemforexampleinmanysystemslearnedrulesareusedtoreducethenumberofbasicstep" +
            "sthesystemtakesinordertosolveproblemsbypruningthesystemssearchspaceforinstancebutinorderto" +
            "determineateachstepwhichrulesareapplicablethesystemmustmatchthemagainstitscurrentsituation" +
            "usingcurrenttechniquesthematcherslowsdownasmoreandmorerulesareacquiredsoeachsteptakeslonge" +
            "randlongerthisectcanoutweighthereductioninthenumberofstepstakensothatthenetresultisaslowdo" +
            "wnthishasbeenobservedinseveralrecentsystemsminton1988aetzioni1990tambeetal1990cohen1990ofc" +
            "oursetheproblemofslowdownfromincreasingmatchcostisnotrestrictedtosystemsinwhichthepurposeo" +
            "frulesistoreducethenumberofproblemsolvingstepsasystemacquiringnewrulesforanypurposecanslow" +
            "downiftherulessignicantlyincreasethematchcostandintuitivelyoneexpectsthatthemoreproduction" +
            "sthereareinasystemthehigherthetotalmatchcostwillbethethesisofthisresearchisthatwecansolvet" +
            "hisprobleminabroadclassofsystemsbyimprovingthematchalgorithmtheyuseinessenceouraimistoenab" +
            "lethescalingupofthenumberofrulesinproductionsystemsweadvancethestateoftheartinproductionma" +
            "tchalgorithmsdevelopinganimprovedmatchalgorithmwhoseperformancescaleswellonasignicantlybro" +
            "aderclassofsystemsthanexistingalgorithmsfurthermorewedemonstratethatbyusingthisimprovedmat" +
            "chalgorithmwecanreduceoravoidtheutilityprobleminalargeclassofmachinelearningsystems";

    /**
     * Keys of the result maps returned by the search methods.
     */
    public enum ReturnKey {
        /**
         * Number of occurrences (an {@code Integer}).
         */
        COUNT,
        /**
         * The substring(s) the count belongs to: a {@code List<String>} in the
         * "most frequent" results, a single {@code String} in the per-substring results.
         */
        SUBSTRINGS
    }

    /**
     * Reads a text file completely and finds the most frequent substrings of a given length.
     *
     * <p>The file is decoded with {@link FileReader}'s default charset. A blank path, a
     * path that does not exist, or an I/O failure yields the empty result (count 0, empty
     * list); for an I/O failure the exception message is printed to standard output.
     *
     * @param chainNumber length of the substrings to look for
     * @param filePath    path of the file to read
     * @return map with {@link ReturnKey#COUNT} (highest occurrence count) and
     *         {@link ReturnKey#SUBSTRINGS} (all substrings reaching that count)
     */
    public static Map<ReturnKey, Object> searchMostSubstringsByFile(int chainNumber, String filePath) {
        if (strIsEmpty(filePath)) {
            return buildResult(0, new ArrayList<String>());
        }
        File file = new File(filePath);
        if (!file.exists()) {
            return buildResult(0, new ArrayList<String>());
        }
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader on every path, including read failures.
        try (FileReader reader = new FileReader(file)) {
            char[] buffer = new char[1024];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return buildResult(0, new ArrayList<String>());
        }
        return searchMostSubstrings(chainNumber, content.toString());
    }

    /**
     * Finds the substrings of length {@code chainNumber} that occur most often in a text.
     *
     * <p>Ties are all reported, ordered by their first position in the text. A null or
     * empty text, a non-positive {@code chainNumber}, or a {@code chainNumber} longer than
     * the text yields count 0 and an empty list.
     *
     * @param chainNumber    length of the substrings to look for
     * @param waitParsingStr text to analyse
     * @return map with {@link ReturnKey#COUNT} (highest occurrence count) and
     *         {@link ReturnKey#SUBSTRINGS} (all substrings reaching that count)
     */
    public static Map<ReturnKey, Object> searchMostSubstrings(int chainNumber, String waitParsingStr) {
        List<String> mostSubstrings = new ArrayList<>();
        int mostSubstringCount = 0;
        for (Map.Entry<String, Integer> entry : countWindows(chainNumber, waitParsingStr).entrySet()) {
            int count = entry.getValue();
            if (count > mostSubstringCount) {
                // A new maximum invalidates everything collected so far.
                mostSubstrings.clear();
                mostSubstringCount = count;
            }
            if (count == mostSubstringCount) {
                mostSubstrings.add(entry.getKey());
            }
        }
        return buildResult(mostSubstringCount, mostSubstrings);
    }

    /**
     * Lists every distinct substring of length {@code chainNumber} together with its
     * occurrence count, ordered by first position in the text.
     *
     * @param chainNumber    length of the substrings to look for
     * @param waitParsingStr text to analyse
     * @return one map per distinct substring holding {@link ReturnKey#COUNT} and
     *         {@link ReturnKey#SUBSTRINGS} (the substring itself); an empty list when the
     *         text is null or empty, or {@code chainNumber} is non-positive or longer
     *         than the text
     */
    public static List<Map<ReturnKey, Object>> searchSubstrings(int chainNumber, String waitParsingStr) {
        Map<String, Integer> counts = countWindows(chainNumber, waitParsingStr);
        if (counts.isEmpty()) {
            return Collections.emptyList();
        }
        List<Map<ReturnKey, Object>> result = new ArrayList<>(counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            Map<ReturnKey, Object> resultMap = new LinkedHashMap<>(2);
            resultMap.put(ReturnKey.COUNT, entry.getValue());
            resultMap.put(ReturnKey.SUBSTRINGS, entry.getKey());
            result.add(resultMap);
        }
        return result;
    }

    /**
     * Counts the non-overlapping occurrences of {@code sToFind} in {@code str}, scanning
     * from left to right.
     *
     * @param str     text to search in
     * @param sToFind substring to look for
     * @return number of occurrences; 0 when either argument is null or empty
     */
    public static int countStr(String str, String sToFind) {
        if (strIsEmpty(str) || strIsEmpty(sToFind)) {
            return 0;
        }
        int num = 0;
        int from = str.indexOf(sToFind);
        while (from != -1) {
            num++;
            // Resume directly behind the match so occurrences never overlap.
            from = str.indexOf(sToFind, from + sToFind.length());
        }
        return num;
    }

    /**
     * Checks whether a string is null or has length zero.
     *
     * @param str string to check
     * @return {@code true} when {@code str} is null or empty, {@code false} otherwise
     */
    private static boolean strIsEmpty(String str) {
        return str == null || str.isEmpty();
    }

    /**
     * Builds the two-entry result map shared by the "most frequent" searches.
     *
     * @param count      highest occurrence count
     * @param substrings substrings reaching that count
     * @return map holding {@code count} and {@code substrings}, in that order
     */
    private static Map<ReturnKey, Object> buildResult(int count, List<String> substrings) {
        Map<ReturnKey, Object> returnMap = new LinkedHashMap<>(2);
        returnMap.put(ReturnKey.COUNT, count);
        returnMap.put(ReturnKey.SUBSTRINGS, substrings);
        return returnMap;
    }

    /**
     * Counts every distinct window of {@code chainNumber} characters in a single pass.
     *
     * <p>Prints the size and content of the text to standard output before validating it.
     * The counts equal what {@link #countStr(String, String)} returns for each window.
     *
     * @param chainNumber    window length
     * @param waitParsingStr text to analyse, may be null
     * @return counts keyed by window in order of first occurrence; empty when the text is
     *         null or empty, or {@code chainNumber} is non-positive or longer than the text
     */
    private static Map<String, Integer> countWindows(int chainNumber, String waitParsingStr) {
        int waitParsingStrSize = waitParsingStr == null ? 0 : waitParsingStr.length();
        System.out.println("待解析字符串大小 : " + waitParsingStrSize + " , 待解析字符串内容 : " + waitParsingStr);
        Map<String, Integer> counts = new LinkedHashMap<>();
        if (waitParsingStrSize == 0 || chainNumber <= 0 || chainNumber > waitParsingStrSize) {
            return counts;
        }
        // For each window: the first index at which its next non-overlapping match may start.
        Map<String, Integer> nextFreeIndex = new HashMap<>();
        int lastStart = waitParsingStrSize - chainNumber;
        for (int i = 0; i <= lastStart; i++) {
            String window = waitParsingStr.substring(i, i + chainNumber);
            Integer free = nextFreeIndex.get(window);
            if (free == null || i >= free) {
                Integer seen = counts.get(window);
                counts.put(window, seen == null ? 1 : seen + 1);
                nextFreeIndex.put(window, i + chainNumber);
            }
        }
        return counts;
    }

    /**
     * Demo entry point: analyses {@link #testTtr} and a hard-coded sample file for
     * substrings of length 2 and prints the results.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Map<ReturnKey, Object> returnKeyObjectMap1 = searchMostSubstrings(2, testTtr);
        System.out.println("字符串中出现子串出现最多的次数是 ： " + returnKeyObjectMap1.get(ReturnKey.COUNT));
        System.out.println("字符串中出现最多的子串集合是 : " + returnKeyObjectMap1.get(ReturnKey.SUBSTRINGS));
        Map<ReturnKey, Object> returnKeyObjectMap2 = searchMostSubstringsByFile(2, "C:\\Users\\Administrator\\Desktop\\test\\src\\com\\company\\test.txt");
        System.out.println("字符串中出现子串出现最多的次数是 ： " + returnKeyObjectMap2.get(ReturnKey.COUNT));
        System.out.println("字符串中出现最多的子串集合是 : " + returnKeyObjectMap2.get(ReturnKey.SUBSTRINGS));
    }
}