比较两段文本内容的差异,网上没有找到合适的算法,只能自己写了,效果如下图:
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title></title>
</head>
<body>
文本1<br>
<textarea id="text1" cols="100" rows="10">张三李四</textarea><br>
文本2<br>
<textarea id="text2" cols="100" rows="10">李四张三张三</textarea><br>
<button type="button" onclick="compare()">比对</button><br><br>
结果1<br>
<pre id="result1"></pre>
结果2<br>
<pre id="result2"></pre>
<script type="text/javascript">
/**
 * Click handler of the compare button: reads both textareas, diffs them with
 * getHighlight and shows the two highlighted results in #result1 / #result2.
 *
 * The results are assigned through innerHTML, so the input text must be
 * HTML-escaped first; otherwise any '<', '>' or '&' typed into the textareas
 * would be parsed as markup instead of being displayed. Because getHighlight
 * returns plain strings, the highlight positions are carried through the
 * escaping step with two placeholder characters that occur in neither input.
 */
function compare() {
    const text1 = document.getElementById('text1').value;
    const text2 = document.getElementById('text2').value;
    const output1 = document.getElementById('result1');
    const output2 = document.getElementById('result2');

    // Find two private-use characters (U+E000..U+F8FF) that are absent from both inputs.
    const inputs = text1 + text2;
    const placeholders = [];
    for (let code = 0xE000; code <= 0xF8FF && placeholders.length < 2; code++) {
        const candidate = String.fromCharCode(code);
        if (!inputs.includes(candidate)) {
            placeholders.push(candidate);
        }
    }
    if (placeholders.length < 2) {
        // No free placeholder available: show the texts without highlighting
        // rather than risk misplacing the highlight tags.
        output1.textContent = text1;
        output2.textContent = text2;
        return;
    }
    const [open, close] = placeholders;

    const {result1, result2} = getHighlight(text1, text2, open, close);

    // Escape the text first, then turn the placeholders into the real tags.
    const toHtml = marked => marked
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .split(open).join('<span style="color:red;">')
        .split(close).join('</span>');

    output1.innerHTML = toHtml(result1);
    output2.innerHTML = toHtml(result2);
}
/**
 * Compares two texts character by character and wraps every differing
 * character in a highlight prefix/suffix.
 *
 * Both texts are walked in parallel. When the current characters differ, the
 * next `granularity` characters of text1 are searched for further ahead in
 * text2. If that run is found, the part of text2 that was skipped over is
 * highlighted and the walk resumes at the match. If it is not found, the
 * current character of text1 is highlighted and text2 stays where it is.
 * Whatever remains of either text at the end is highlighted as well.
 *
 * Texts are split by Unicode code point, so a character outside the BMP
 * (e.g. an emoji) is always highlighted as a whole. A null/undefined text is
 * treated as empty. The characters are emitted as-is; nothing is escaped here.
 *
 * @param text1 first text
 * @param text2 second text
 * @param highlightBefore string put in front of each highlighted character;
 *        falls back to '<span style="color:red;">' when omitted or empty
 * @param highlightAfter string put behind each highlighted character;
 *        falls back to '</span>' when omitted or empty
 * @return {{result1: string, result2: string}} both texts with highlighting applied
 */
function getHighlight(text1, text2, highlightBefore, highlightAfter) {
    const before = highlightBefore ? highlightBefore : '<span style="color:red;">';
    const after = highlightAfter ? highlightAfter : '</span>';

    // Array.from iterates a string by code point, keeping surrogate pairs together.
    const toChars = text => Array.from(text == null ? '' : text, value => ({value, highlight: false}));
    const char1s = toChars(text1);
    const char2s = toChars(text2);

    /*
     * Number of consecutive equal characters required to accept a match.
     * Too large and small common parts are missed; too small and repetitive
     * text produces false matches. It must never start below 2, otherwise a
     * single equal character anywhere ahead would count as a match.
     */
    const length1 = char1s.length;
    let baseGranularity;
    if (length1 < 10) {
        baseGranularity = 2;
    } else if (length1 < 1000) {
        baseGranularity = 3;
    } else if (length1 < 100000) {
        baseGranularity = 4;
    } else {
        baseGranularity = 5;
    }

    // Returns the first index >= from where char2s holds the `size` characters
    // char1s[start .. start + size), or -1 when there is no such position.
    const findRun = (start, from, size) => {
        for (let j = from; j <= char2s.length - size; j++) {
            let offset = 0;
            while (offset < size && char1s[start + offset].value === char2s[j + offset].value) {
                offset++;
            }
            if (offset === size) {
                return j;
            }
        }
        return -1;
    };

    // Position in char2s that is currently aligned with char1s[i].
    let char2Index = 0;
    for (let i = 0; i < length1; i++) {
        // text2 is exhausted: the rest of text1 has no counterpart.
        if (char2Index >= char2s.length) {
            char1s[i].highlight = true;
            continue;
        }
        if (char1s[i].value === char2s[char2Index].value) {
            char2Index++;
            continue;
        }
        // Near the end of text1 fewer characters are left than the base
        // granularity asks for, so compare only what remains.
        const granularity = Math.min(baseGranularity, length1 - i);
        const end = findRun(i, char2Index + 1, granularity);
        if (end === -1) {
            // No counterpart ahead in text2: only this character of text1 differs.
            char1s[i].highlight = true;
            continue;
        }
        // Everything text2 has before the match is extra content.
        for (let k = char2Index; k < end; k++) {
            char2s[k].highlight = true;
        }
        // char1s[i] pairs with char2s[end]; continue right after it.
        char2Index = end + 1;
    }
    // Whatever is left of text2 has no counterpart in text1.
    for (let k = char2Index; k < char2s.length; k++) {
        char2s[k].highlight = true;
    }

    const render = chars => chars
        .map(char => char.highlight ? before + char.value + after : char.value)
        .join('');
    return {result1: render(char1s), result2: render(char2s)};
}
</script>
</body>
</html>
另外提供了Java版本,算法是一样的,翻译过来的而已
Char.java
/**
 * One character of a compared text together with the flag that says whether
 * it should be highlighted in the output.
 */
class Char {

    /** The character itself. */
    private Character value;

    /** Highlight flag; {@code false} for a freshly created instance. */
    private Boolean highlight;

    /**
     * Creates an entry for the given character that is not highlighted.
     *
     * @param value the character to hold
     */
    public Char(Character value) {
        this.highlight = Boolean.FALSE;
        this.value = value;
    }

    /** @return the character held by this entry */
    public Character getValue() {
        return this.value;
    }

    /** @param value the character to hold from now on */
    public void setValue(Character value) {
        this.value = value;
    }

    /** @return the current highlight flag */
    public Boolean getHighlight() {
        return this.highlight;
    }

    /** @param highlight the new highlight flag */
    public void setHighlight(Boolean highlight) {
        this.highlight = highlight;
    }

    /** @return a debug representation of the form {@code Char{value=x, highlight=false}} */
    @Override
    public String toString() {
        return String.format("Char{value=%s, highlight=%s}", value, highlight);
    }
}
CompareResult.java
/**
 * Holds the two highlighted strings produced by a text comparison.
 *
 * @author 猴哥
 */
public class CompareResult {

    /** Highlighted form of the first text. */
    private String result1;

    /** Highlighted form of the second text. */
    private String result2;

    /**
     * Creates a result from the two highlighted strings.
     *
     * @param result1 highlighted form of the first text
     * @param result2 highlighted form of the second text
     */
    public CompareResult(String result1, String result2) {
        this.result2 = result2;
        this.result1 = result1;
    }

    /** @return the highlighted form of the first text */
    public String getResult1() {
        return this.result1;
    }

    /** @param result1 the highlighted form of the first text */
    public void setResult1(String result1) {
        this.result1 = result1;
    }

    /** @return the highlighted form of the second text */
    public String getResult2() {
        return this.result2;
    }

    /** @param result2 the highlighted form of the second text */
    public void setResult2(String result2) {
        this.result2 = result2;
    }

    /** @return a debug representation showing both strings in single quotes */
    @Override
    public String toString() {
        return String.format("CompareResult{result1='%s', result2='%s'}", result1, result2);
    }
}
CompareUtil.java
import java.util.Arrays;
import java.util.Optional;
import java.util.stream.Collectors;
/**
 * Utility for comparing two strings and highlighting where they differ.
 *
 * @author 猴哥
 */
public class CompareUtil {

    /** Prefix used when the caller passes {@code null} as highlight prefix. */
    private static final String DEFAULT_HIGHLIGHT_BEFORE = "<span style=\"color:red;\">";

    /** Suffix used when the caller passes {@code null} as highlight suffix. */
    private static final String DEFAULT_HIGHLIGHT_AFTER = "</span>";

    private CompareUtil() {}

    /**
     * Compares two texts character by character and wraps every differing
     * character in a highlight prefix/suffix.
     *
     * <p>Both texts are walked in parallel. When the current characters differ,
     * the next few characters of {@code text1} are searched for further ahead in
     * {@code text2}. If that run is found, the part of {@code text2} that was
     * skipped over is highlighted and the walk resumes at the match. If it is
     * not found, the current character of {@code text1} is highlighted and
     * {@code text2} stays where it is. Whatever remains of either text at the
     * end is highlighted as well.
     *
     * <p>Texts are processed by Unicode code point, so a supplementary
     * character (surrogate pair) is always highlighted as a whole. The
     * characters are emitted as-is; nothing is escaped here.
     *
     * @param text1 first text; {@code null} is treated as empty
     * @param text2 second text; {@code null} is treated as empty
     * @param highlightBefore string put in front of each highlighted character;
     *        {@code null} selects {@code <span style="color:red;">}
     * @param highlightAfter string put behind each highlighted character;
     *        {@code null} selects {@code </span>}
     * @return both texts with highlighting applied
     */
    public static CompareResult compare(String text1, String text2, String highlightBefore, String highlightAfter) {
        int[] points1 = (text1 == null ? "" : text1).codePoints().toArray();
        int[] points2 = (text2 == null ? "" : text2).codePoints().toArray();
        boolean[] marked1 = new boolean[points1.length];
        boolean[] marked2 = new boolean[points2.length];

        markDifferences(points1, points2, marked1, marked2);

        String before = highlightBefore == null ? DEFAULT_HIGHLIGHT_BEFORE : highlightBefore;
        String after = highlightAfter == null ? DEFAULT_HIGHLIGHT_AFTER : highlightAfter;
        return new CompareResult(
                render(points1, marked1, before, after),
                render(points2, marked2, before, after));
    }

    /** Sets the flag of every code point that has no counterpart in the other text. */
    private static void markDifferences(int[] points1, int[] points2, boolean[] marked1, boolean[] marked2) {
        int baseGranularity = baseGranularity(points1.length);
        // Position in points2 that is currently aligned with points1[i].
        int index2 = 0;
        for (int i = 0; i < points1.length; i++) {
            // text2 is exhausted: the rest of text1 has no counterpart.
            if (index2 >= points2.length) {
                marked1[i] = true;
                continue;
            }
            if (points1[i] == points2[index2]) {
                index2++;
                continue;
            }
            // Near the end of text1 fewer characters are left than the base
            // granularity asks for, so compare only what remains.
            int granularity = Math.min(baseGranularity, points1.length - i);
            int end = findRun(points1, i, points2, index2 + 1, granularity);
            if (end < 0) {
                // No counterpart ahead in text2: only this character of text1 differs.
                marked1[i] = true;
                continue;
            }
            // Everything text2 has before the match is extra content.
            Arrays.fill(marked2, index2, end, true);
            // points1[i] pairs with points2[end]; continue right after it.
            index2 = end + 1;
        }
        // Whatever is left of text2 has no counterpart in text1.
        Arrays.fill(marked2, index2, points2.length, true);
    }

    /**
     * Chooses how many consecutive equal characters are required to accept a
     * match, based on the length of the first text.
     *
     * <p>Too large and small common parts are missed; too small and repetitive
     * text produces false matches. The value never starts below 2, otherwise a
     * single equal character anywhere ahead would count as a match.
     */
    private static int baseGranularity(int length) {
        if (length < 10) {
            return 2;
        }
        if (length < 1000) {
            return 3;
        }
        if (length < 100000) {
            return 4;
        }
        return 5;
    }

    /**
     * Returns the first index {@code >= from} at which {@code haystack} holds
     * the {@code size} values {@code needle[start .. start + size)}, or -1.
     */
    private static int findRun(int[] needle, int start, int[] haystack, int from, int size) {
        for (int j = from; j <= haystack.length - size; j++) {
            int offset = 0;
            while (offset < size && needle[start + offset] == haystack[j + offset]) {
                offset++;
            }
            if (offset == size) {
                return j;
            }
        }
        return -1;
    }

    /** Builds the output string, wrapping each flagged code point in before/after. */
    private static String render(int[] points, boolean[] marked, String before, String after) {
        StringBuilder out = new StringBuilder(points.length);
        for (int i = 0; i < points.length; i++) {
            if (marked[i]) {
                out.append(before).appendCodePoint(points[i]).append(after);
            } else {
                out.appendCodePoint(points[i]);
            }
        }
        return out.toString();
    }
}