java similarity_java 字符串相似度算法

package com.miaodi.api.util;

public class SimilarityRatioUtils {

/**

* 莱茵斯坦距离

*

* @param str

* @param target

* @return

*/

public static int levenshiteinDistance(String str, String target) {

int s = str.length();

int t = target.length();

if (s == 0) {

return t;

}

if (t == 0) {

return s;

}

int d[][];

int si;

int tj;

char strChar;

char tagetChar;

int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1

d = new int[s + 1][t + 1];

for (si = 0; si <= s; si++) {// 初始化第一列

d[si][0] = si;

}

for (si = 1; si <= s; si++) {// 遍历str

strChar = str.charAt(si - 1);

for (tj = 1; tj <= t; tj++) {

tagetChar = target.charAt(tj - 1);

/*

* 忽略大小写 if (strChar == tagetChar || strChar == tagetChar + 32 || strChar + 32

* == tagetChar) {

*/

if (strChar == tagetChar) {

temp = 0;

} else {

temp = 1;

}

d[si][tj] = min(d[si - 1][tj] + 1, d[si][tj - 1] + 1, d[si - 1][tj - 1] + temp);

}

}

return d[s][t];

}

private static int min(int one, int two, int three) {

return (one = one < two ? one : two) < three ? one : three;

}

/**

* 获取两字符串的相似度

*/

public static float getSimilarityRatio(String str, String target) {

// 1-莱茵斯坦距离/最大字符串长度

return 1 - (float) levenshiteinDistance(str, target) / Math.max(str.length(), target.length());

}

}

package com.miaodi.api.handler;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.concurrent.Callable;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Future;

/**

* 模板提取

* @author Administrator

*

*/

public class TemplateDistill implements Callable{

public TemplateDistill(String source, String target, List> futures, ExecutorService pool, int startSourceIDX,int startTargetIDX, boolean lastIsPlaceholder,String templatePrefix,int variableNum) {

super();

this.source = source;

this.target = target;

this.futures = futures;

this.pool = pool;

this.startSourceIDX = startSourceIDX;

this.startTargetIDX = startTargetIDX;

this.lastIsPlaceholder = lastIsPlaceholder;

this.templatePrefix = templatePrefix == null ? "":templatePrefix;

this.variableNum = variableNum;

}

private String templatePrefix;

private final int maxVariableNum = 3;

private int variableNum;

private int startSourceIDX;

private int startTargetIDX;

private boolean lastIsPlaceholder;

private final String source;

private final String target;

private final List> futures;

private final ExecutorService pool;

@SuppressWarnings("unchecked")

public String call() throws Exception {

StringBuilder template = new StringBuilder(templatePrefix);

while (true) {

// 获取最大连击数

Map result = carom(source, target, startSourceIDX, startTargetIDX);

int maxCaromNum = (Integer) result.get("caromNum");

if (maxCaromNum == 0) {

if(!lastIsPlaceholder) {

variableNum += 1;

if (variableNum > maxVariableNum) {

return null;

}

template.append("|");

lastIsPlaceholder = true;

}

if (result.get("otherTargetIndexs") != null) {

// 开启多线程找模板

List otherTargetIndexs = (List) result.get("otherTargetIndexs");

for (Integer otherTargetIndex : otherTargetIndexs) {

futures.add(pool.submit(new TemplateDistill(source, target, futures, pool, startSourceIDX, otherTargetIndex, lastIsPlaceholder,template.toString(), variableNum)));

}

}

startSourceIDX += 1;

}else{

template.append(source.substring(startSourceIDX, startSourceIDX + maxCaromNum));

startSourceIDX += maxCaromNum;

startTargetIDX += maxCaromNum;

lastIsPlaceholder = false;;

}

if (source.length() <= startSourceIDX) {

if (!lastIsPlaceholder && target.length() > startTargetIDX) {

variableNum += 1;

if (variableNum > maxVariableNum) {

return null;

}

template.append("|");

}

break;

}

if (target.length() <= startTargetIDX) {

if(!lastIsPlaceholder) {

variableNum += 1;

if (variableNum > maxVariableNum) {

return null;

}

template.append("|");

}

break;

}

}

return template.toString();

}

/**

* 获取连击数

* @param source 字符串s

* @param target 字符串t

* @param startSourceIDX 字符串s从什么下标开始匹配

* @param startTargetIDX 字符串t从什么下标开始匹配

* @return

*/

@SuppressWarnings({ "rawtypes", "unchecked" })

private static Map carom(String source,String target,Integer startSourceIDX,Integer startTargetIDX){

// 连击次数

int attemptCaromNum = 0;

while(true) {

// 每次累加1,最后减1

attemptCaromNum += 1;

// 本次匹配的子串的结束下标

int endSourceIDX = startSourceIDX + attemptCaromNum;

if (endSourceIDX > source.length()) {

break;

}

// 本次匹配的子串

String attemptStr = source.substring(startSourceIDX, endSourceIDX);

int idx = target.indexOf(attemptStr, startTargetIDX);

// 匹配失败

if (idx != startTargetIDX) {

break;

}

}

attemptCaromNum -= 1;

// 结果

Map result = new HashMap();

result.put("caromNum", attemptCaromNum);

// 匹配失败时

if (attemptCaromNum == 0) {

// 单纯的一个字符

int endStrIDX = startSourceIDX + 1;

if (endStrIDX <= source.length()) {

// 匹配他失败后的字符

List otherTargetIndexs = new ArrayList();

char c = source.charAt(startSourceIDX);

char[] cs = target.substring(startTargetIDX+1).toCharArray();

for (int i = 0; i < cs.length; i++) {

if (c == cs[i]) {

otherTargetIndexs.add(startTargetIDX+1+i);

}

}

result.put("otherTargetIndexs", otherTargetIndexs.isEmpty() ? null : otherTargetIndexs);

}

}

return result;

}

}

package com.miaodi.api.handler;

import java.util.HashSet;

import java.util.List;

import java.util.Set;

import java.util.Vector;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Future;

import java.util.concurrent.LinkedBlockingQueue;

import java.util.concurrent.ThreadPoolExecutor;

import java.util.concurrent.TimeUnit;

import com.miaodi.api.util.SimilarityRatioUtils;

public class TemplateDistillUtil {

private static final ExecutorService executorPool = new ThreadPoolExecutor(10, 30, 30, TimeUnit.SECONDS, new LinkedBlockingQueue(50),new ThreadPoolExecutor.CallerRunsPolicy());

public static String distillTemplates(List targets){

if (targets.isEmpty()) {

return null;

}else if(targets.size() == 1) {

return targets.get(0);

}

String source = targets.get(0);

String template = null;

for (int i = 1; i < targets.size(); i++) {

if (i==1) {

template = distillTemplates(source, targets.get(i));

}else {

if (template == null) {

return null;

}

template = distillTemplates(template, targets.get(i));

}

}

return format(template == null ? "|": template);

}

public static String distillTemplates(String source, String target) {

// 符合这两字符串的全部模板

List> futures = new Vector>();

futures.add(executorPool.submit(new TemplateDistill(source,target,futures,executorPool,0,0,false,null,0)));

Set templates = new HashSet();

while (true) {

int y = 0;

for (int i = 0; i < futures.size(); i++) {

try {

y += 1;

String template = futures.get(i).get();

if (template != null) {

templates.add(futures.get(i).get());

}

} catch (Exception e) {

e.printStackTrace();

}

}

if (y == futures.size()) {

break;

}

}

float maxRatio = 0;

String maxRatioTemplate = null;

for (String template : templates) {

float ratio = SimilarityRatioUtils.getSimilarityRatio(source, template);

if (ratio > maxRatio) {

maxRatio = ratio;

maxRatioTemplate = template;

}

}

return maxRatioTemplate;

}

public static String format(String template) {

if (template == null) {

return null;

}

StringBuilder sb = new StringBuilder();

String[] vs = template.split("\\|");

for (int i = 1; i <= vs.length; i++) {

sb.append(vs[i-1]);

if (i != vs.length) {

sb.append("{"+i+"}");

}

}

if (template.lastIndexOf("|") == template.length()-1) {

sb.append("{"+((vs.length-1)<0?0:(vs.length-1))+"}");

}

return sb.toString();

}

//    public static void main(String[] args) {

//

//

System.out.println(format("123|123|"));

//

//        long start = System.currentTimeMillis();

//

//        String[] strs = new String[] {

"廖兴攀-有线,在2017-10-01 11:53:22",

"鲍冬梅-有线,在2017-10-01 11:53:11"

"【兴业银行】尊敬的用户,于10月8日前点 https://rsp.mobi/N3eOs4 申请信用卡,可获批10万额度。退订回T",

"【兴业银行】尊敬的用户,于10月8日前点 https://rsp.mobi/A5jk6Z 申请信用卡,可获批10万额度。退订回T"

"【随你花】尊敬的王建辉,您的账单现在到期了,逾期将取消再借资格,请及时处理!需要链接找客服。回T退订",

"【随你花】尊敬的项俊,您的账单现在到期了,逾期将取消再借资格,请及时处理!需要链接找客服。回T退订",

"廖兴攀-有线,在2017-10-01 11:53:37触发了上线提醒,请留意。",

"鲍冬梅-000有线,在2017-10-01 11:53:45触发了上线提醒,请留意。",

//                "123xxx12",

//                "123xxz12",

//                "123xxz221211",

,"【银汉科技】尊贵的易美达分期客户:郭文娟,你总12期的第2期1866.67元,2017-12-31为本期还款日,到目前未还款,现已经逾期,请珍视个人信用,维护良好的账户记录和信誉。如有疑问,请致电4000-512-012,退订回T。"

//        };

System.out.println(SimilarityRatioUtils.getSimilarityRatio(strs[0], strs[1]));

//        String templates = distillTemplates(strs);

float maxRatio = 0;

String maxRatioTemplate = null;

for (String template : templates) {

float ratio = SimilarityRatioUtils.getSimilarityRatio(strs[0], template);

if (ratio > maxRatio) {

maxRatio = ratio;

maxRatioTemplate = template;

}

}

//        System.out.println("模板:"+templates);

Matcher matcher = Pattern.compile("\\{\\d+\\}").matcher(maxRatioTemplate);

String templete = matcher.replaceAll(".*");

String regex = String.format("^%s$", templete); // 构建新的pattern字符串

for (String string : strs) {

System.out.println(Pattern.compile(regex).matcher(string).find());

}

//        System.out.println("共计耗时:"+ (System.currentTimeMillis()-start) +"毫秒");

//    }

}

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值