下面是对比两段文字差异的方法:使用 com.github.difflib.DiffUtils 计算差异,分词可以用 HanLP(我这里用的是自己写的分词方法)。
先对比段落之间的差异,再把段落分词后对比文字的差异。
分词方法是我自己写的,按句子切分;如果需要更细的颗粒度,可以改用分词器,按需求选择,分词后再用比较器比较。
最后在输出中拼接 HTML 标签来显示差异。
/**
 * Compares two texts token by token and renders each side as HTML with the differences marked.
 *
 * <p>Both inputs are tokenized with {@code myParticiple} and diffed with {@code DiffUtils}.
 * Unchanged runs are appended verbatim to both sides; deleted, changed and inserted runs are
 * wrapped in a {@code span} element whose class is {@code delete-class}, {@code change-class}
 * or {@code insert-class} respectively.
 *
 * @param originalStr the original text
 * @param revisedStr the revised text
 * @return a map with three entries: {@code sourceHtml} (markup for the original text),
 *         {@code targetHtml} (markup for the revised text) and {@code deltaType} (name of the
 *         dominant delta type; precedence is CHANGE, then DELETE, then INSERT, then EQUAL)
 */
private HashMap<String, String> contrastDiff(String originalStr, String revisedStr) {
    // Sentence-level tokens come from the local splitter. For finer granularity a HanLP
    // segmenter can be used instead, for example:
    //   Segment segment = HanLP.newSegment()
    //           .enableAllNamedEntityRecognize(true)
    //           .enableNumberQuantifierRecognize(true)
    //           .enableOffset(true);
    //   List<String> tokens = segment.seg(text).stream()
    //           .map(term -> term.word).collect(Collectors.toList());
    List<String> originalTokens = myParticiple(originalStr);
    List<String> revisedTokens = myParticiple(revisedStr);

    // Third argument is includeEqualParts: unchanged runs are reported as EQUAL deltas too,
    // which is what lets the loop below rebuild the complete text on both sides.
    Patch<String> patch = DiffUtils.diff(originalTokens, revisedTokens, true);

    StringBuilder originalMarkup = new StringBuilder();
    StringBuilder revisedMarkup = new StringBuilder();
    boolean sawChange = false;
    boolean sawDelete = false;
    boolean sawInsert = false;

    // NOTE(review): token text is appended to the markup without HTML escaping; confirm the
    // inputs can never contain characters such as '<' or '&' that would need it.
    for (AbstractDelta<String> delta : patch.getDeltas()) {
        String addedText = String.join("", delta.getTarget().getLines());
        String removedText = String.join("", delta.getSource().getLines());
        switch (delta.getType()) {
            case DELETE:
                sawDelete = true;
                originalMarkup.append("<span class=\"delete-class\">").append(removedText).append("</span>");
                break;
            case CHANGE:
                sawChange = true;
                originalMarkup.append("<span class=\"change-class\">").append(removedText).append("</span>");
                revisedMarkup.append("<span class=\"change-class\">").append(addedText).append("</span>");
                break;
            case INSERT:
                sawInsert = true;
                revisedMarkup.append("<span class=\"insert-class\">").append(addedText).append("</span>");
                break;
            case EQUAL:
                revisedMarkup.append(removedText);
                originalMarkup.append(removedText);
                break;
            default:
                break;
        }
    }

    // Overall classification: a single CHANGE outranks DELETE, which outranks INSERT;
    // EQUAL is reported only when no other delta type occurred.
    DeltaType dominantType = sawChange ? DeltaType.CHANGE
            : sawDelete ? DeltaType.DELETE
            : sawInsert ? DeltaType.INSERT
            : DeltaType.EQUAL;

    HashMap<String, String> resultMap = new HashMap<>();
    resultMap.put("sourceHtml", originalMarkup.toString()); // old regulation
    resultMap.put("targetHtml", revisedMarkup.toString()); // new regulation
    // Delta type; the equality check was replaced by a similarity comparison, so the logic
    // that decides the change type had to be rewritten here.
    resultMap.put("deltaType", dominantType.name());
    return resultMap;
}
/**
 * Delimiter runs used by {@link #myParticiple(String)}: one or more consecutive ASCII or
 * full-width punctuation marks, whitespace characters, or ideographic spaces (U+3000).
 * Compiled once because {@code Pattern} instances are immutable and reusable.
 */
private static final Pattern SENTENCE_DELIMITERS = Pattern.compile("[,,.。!!;:;:??、\\s\\u3000]+");

/**
 * Custom sentence-level tokenizer; HanLP segmentation is too fine-grained for this use case.
 *
 * <p>The text is cut at every run of delimiter characters. Both the text between delimiters
 * (trimmed) and the delimiter runs themselves are returned, in their original order, so the
 * punctuation takes part in the diff as well.
 *
 * @param text the text to split; may be {@code null}
 * @return the tokens in order of appearance; an empty list when {@code text} is {@code null}
 *         or empty
 */
private List<String> myParticiple(String text) {
    List<String> result = new ArrayList<>();
    if (text == null || text.isEmpty()) {
        return result;
    }
    Matcher matcher = SENTENCE_DELIMITERS.matcher(text);
    int start = 0;
    while (matcher.find()) {
        // Emit the text preceding this delimiter run, if there is any.
        if (matcher.start() > start) {
            result.add(text.substring(start, matcher.start()).trim());
        }
        // Keep the delimiter run as its own token.
        result.add(matcher.group());
        start = matcher.end();
    }
    // Trailing text after the last delimiter run.
    if (start < text.length()) {
        result.add(text.substring(start).trim());
    }
    return result;
}
效果展示
下面是比较两篇文章的代码,也就是 diff 的基本用法。注意这里比较的元素是对象,而不是 String。
// Custom equality predicate handed to the diff algorithm, so list elements are matched by
// MyBiPredicate.test instead of by equals().
MyBiPredicate myBiPredicate = new MyBiPredicate();
// Diff the two regulations.
// The last two arguments are passed as null and true; per the java-diff-utils API these are
// presumably the progress listener (none) and includeEqualParts (also report unchanged runs).
// TODO confirm against the library version in use.
Patch<LawXmlStructureDetailSearchDto> patch =
DiffUtils.diff(contrastContentList, currentLawContentList, MeyersDiff.factory().create(myBiPredicate), null, true);
因为比较的元素是对象,所以需要自己实现比较方法(BiPredicate);这里使用相似度算法,并结合了自己的业务规则。
import com.stock.ir.articleSplit.dto.LawXmlStructureDetailSearchDto;
import com.stock.ir.common.utils.CompareUtil;
import org.apache.commons.lang3.StringUtils;
import java.util.function.BiPredicate;
/**
 * Equality predicate used when diffing lists of {@link LawXmlStructureDetailSearchDto}.
 *
 * <p>Two entries match only if they are on the same level. Entries on the content level are
 * matched by text similarity of their titles; entries on any other level are treated as
 * headings and matched by exact equality of their parent titles.
 */
public class MyBiPredicate implements BiPredicate<LawXmlStructureDetailSearchDto, LawXmlStructureDetailSearchDto> {

    /** Level value that marks an entry as body content rather than a heading. */
    private static final String CONTENT_LEVEL = "4";

    /** Similarity ratio that two content entries must exceed to be considered the same. */
    private static final double SIMILARITY_THRESHOLD = 0.75;

    /**
     * Decides whether two entries should be treated as equal by the diff.
     *
     * @param o the entry from the first list
     * @param o2 the entry from the second list
     * @return {@code true} if the entries are on the same level and either their content is
     *         similar enough (content level) or their parent titles are equal (other levels)
     */
    @Override
    public boolean test(LawXmlStructureDetailSearchDto o, LawXmlStructureDetailSearchDto o2) {
        // Entries on different levels never match.
        if (!StringUtils.equals(o.getLevel(), o2.getLevel())) {
            return false;
        }
        if (StringUtils.equals(o.getLevel(), CONTENT_LEVEL)) {
            // Content: compare by similarity rather than strict equality.
            return CompareUtil.getSimilarityRatio(o.getTitle(), o2.getTitle()) > SIMILARITY_THRESHOLD;
        }
        // Heading: parent titles must be equal. The null-safe comparison avoids a
        // NullPointerException when an entry has no parent title.
        return StringUtils.equals(o.getPTitle(), o2.getPTitle());
    }
}
maven包
<!--差异对比-->
<dependency>
<groupId>io.github.java-diff-utils</groupId>
<artifactId>java-diff-utils</artifactId>
<version>4.12</version>
</dependency>
<!--分词-->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.4</version>
</dependency>