- /**
- * 以句比文,比较语句重复比例
- * @author: slzs
- * Dec 21, 2012 3:13:57 PM
- * @param source 源数据
- * @param compareStr 比较数据
- * @param mark 是否标记
- * @param separatorArray 分离符,注意特殊字符的转义,如{ '.', '?' }
- * @return Object[] [0]语句重复占自身总句比例 [1]标记源内容 (mark==true时)[2]标记比较内容 (mark==true时)
- * each engineer has a duty to keep the code elegant
- */
- public static Object[] compareForSentence(String source, String compareStr, boolean mark, char... separatorArray) {
- // 语句重复比例
- float sameRate = 0;
- if (!StringUtil.isEmptyTrim(source) && !StringUtil.isEmptyTrim(compareStr)) {
- // 拆句子
- List<String> sentenceArray;
- if (separatorArray != null && separatorArray.length > 0) {
- sentenceArray = separator(compareStr, true, separatorArray);
- } else {
- sentenceArray = separator(compareStr, true, ' ', ' ', '、', ',', ',', '.', '。', '?', '?', '!', '!');
- }
- if (sentenceArray != null) {
- // 总句数
- int sumCount = sentenceArray.size();
- if (sumCount > 0) {
- // 找到相同句数
- int sameCount = 0;
- // 是否连续
- boolean continuation = false;
- // 多样式
- int styleIndex = 0;
- String className = "same_sentence";
- for (String sentence : sentenceArray) {
- if (source.indexOf(sentence) > -1) {
- // 找到相同
- sameCount++;
- if (mark) {
- // 标记源字符串中相同语句
- source = source.replace(sentence, "<span class=\"" + className + "_" + styleIndex + "\">" + sentence + "</span>");
- // 标记比较字符串中相同语句
- compareStr = compareStr.replace(sentence, "<span class=\"" + className + "_" + styleIndex + "\">" + sentence + "</span>");
- if (!continuation) {
- // 非连续相同,5个样式区分标记,连续相同则用相同样式
- styleIndex = styleIndex < 5 ? ++styleIndex : 0;
- }
- }
- continuation = true;
- } else {
- continuation = false;
- }
- }
- // 计算比例
- sameRate = sameCount / (float) sumCount;
- }
- }
- }
- return new Object[] { sameRate, source, compareStr };
- }
- /**
- * 字符串拆分器
- * @author: slzs
- * Dec 21, 2012 3:26:55 PM
- * @param sourceStr 待拆分源字符串
- * @param noRepeat 是否去重
- * @param separatorArray 分离符,以每个字符进行分割
- * @return List<String> 拆出的语句集
- * each engineer has a duty to keep the code elegant
- */
- public static List<String> separator(String sourceStr, boolean noRepeat, char... separatorArray) {
- List<String> strList = null;
- if (separatorArray != null && separatorArray.length > 0 && !StringUtil.isEmptyTrim(sourceStr)) {
- // 拼接分离符号
- String separatorStr = "";
- for (char joinChar : separatorArray) {
- separatorStr += joinChar;
- }
- // 拆分临时字符串数组
- String[] splitArrayTemp = sourceStr.split("[" + separatorStr + "]");
- if (splitArrayTemp != null && splitArrayTemp.length > 0) {
- strList = new ArrayList<String>();
- // 存储拆分信息
- for (String str : splitArrayTemp) {
- // 排除重复语句
- if (!StringUtil.isEmptyTrim(str) && (!noRepeat || !strList.contains(str))) {
- strList.add(str);
- }
- }
- }
- }
- return strList;
- }
转载于:https://blog.51cto.com/lya041/1177873