生成的文本会出现语义包含现象,下面的代码用于过滤掉存在语义包含的语句。
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import java.util.ArrayList;
import java.util.List;
/**
 * Filters a list of sentences so that, of two neighbouring sentences whose
 * token overlap reaches a threshold, only the longer one is kept.
 *
 * <p>Created by liangpengfei on 2020/10/15.
 */
public class FilterInclusion {

    /** Overlap ratio (inclusive) at or above which two neighbours count as duplicates. */
    private static final double DEFAULT_THRESHOLD = 0.4;

    /**
     * Segments a sentence with {@link StandardTokenizer#segment}.
     *
     * @param sen the sentence to segment
     * @return the terms produced by the tokenizer
     */
    public List<Term> getSegList(String sen) {
        return StandardTokenizer.segment(sen);
    }

    /**
     * Computes the token-overlap ratio of two sentences.
     *
     * <p>Let {@code len} be the smaller of the two token counts. The result is
     * the fraction of the first {@code len} tokens of {@code s1} that also occur
     * among the first {@code len} tokens of {@code s2}.
     *
     * @param s1 the first sentence
     * @param s2 the second sentence
     * @return a value in {@code [0, 1]}; {@code 0} when either sentence yields no tokens
     */
    public double getSimiliarity(String s1, String s2) {
        List<Term> firstTerms = getSegList(s1);
        List<Term> secondTerms = getSegList(s2);
        int len = Math.min(firstTerms.size(), secondTerms.size());
        if (len == 0) {
            // Nothing to compare; avoids the 0/0 = NaN the division below would produce.
            return 0.0;
        }

        // The comparison window over s2 does not change per iteration, so build it once.
        List<Term> window = secondTerms.subList(0, len);
        int matches = 0;
        for (int i = 0; i < len; i++) {
            // Membership uses Term.equals.
            // NOTE(review): confirm Term equality is value-based (not identity)
            // in the HanLP version in use.
            if (window.contains(firstTerms.get(i))) {
                matches++;
            }
        }
        return (double) matches / len;
    }

    /**
     * Removes near-duplicate neighbours using the default threshold of 0.4.
     *
     * <p>The list is modified in place and the same instance is returned.
     *
     * @param stringList a mutable list of {@code String} sentences (kept as a raw
     *                   type so existing callers continue to compile)
     * @return {@code stringList} after filtering
     */
    public List<String> drop_duplicate(List stringList) {
        return drop_duplicate(stringList, DEFAULT_THRESHOLD);
    }

    /**
     * Removes near-duplicate neighbours using a caller-supplied threshold.
     *
     * <p>Adjacent sentences are compared with {@link #getSimiliarity}. When the
     * similarity is at least {@code threshold}, the shorter sentence is removed
     * (on equal length the later one is removed). After a removal the survivor
     * is compared against its new neighbour, so no adjacent pair is skipped.
     *
     * <p>The list is modified in place and the same instance is returned.
     *
     * @param stringList a mutable list of {@code String} sentences
     * @param threshold  inclusive similarity bound for treating two sentences as duplicates
     * @return {@code stringList} after filtering
     */
    public List<String> drop_duplicate(List stringList, double threshold) {
        @SuppressWarnings("unchecked") // the raw parameter is documented to hold Strings
        List<String> sentences = (List<String>) stringList;

        int i = 0;
        while (i + 1 < sentences.size()) {
            String current = sentences.get(i);
            String next = sentences.get(i + 1);
            if (getSimiliarity(current, next) >= threshold) {
                // Keep the longer sentence; the int-typed ternary selects remove(int index).
                sentences.remove(current.length() >= next.length() ? i + 1 : i);
                // Do not advance: the element now at i has a new neighbour to check.
            } else {
                i++;
            }
        }
        return sentences;
    }

    /** Small demo: prints a sample list before and after filtering. */
    public static void main(String[] args) {
        FilterInclusion test = new FilterInclusion();
        List<String> stringList = new ArrayList<String>();
        stringList.add("风电装机大幅下降");
        stringList.add("产品价格竞争加剧");
        stringList.add("行业周期导致公司经营业绩的波动性风险、应收账款及应收票据余额较大的风险,较高利润率不能维持的风险,新产品拓展速度不及预期的风险");
        stringList.add("行业竞争加剧");
        stringList.add("下游需求不及预期");
        stringList.add("下游需求低于预期、价格战恶化、核心器件自产率提升进度不及预期");
        stringList.add("产品国产化替代进度不及预期");
        System.out.println(stringList);
        System.out.println(test.drop_duplicate(stringList));
    }
}
07-17
1452
![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)