java 计算字符串的相似度_Java:计算字符串相似度的度量

这个方法是我程序的瓶颈。(或者说,至少大部分运行时间都花在了它上面。)

其目的是根据字符的出现情况计算两个字符串之间的相似度:某个字符是同时出现在两个字符串中,还是只出现在其中一个字符串而不出现在另一个字符串中。

代码是正确的,但我想优化它。您是否看到任何低效率,不良做法或不必要的复杂性?

(不要担心Gene类。这里真正关注的是Genes携带的字符串。)

/**
 * Computes a distance score between two genes from how often each character of
 * {@code Constants.DNA_CHARS} occurs in their DNA strings.
 *
 * <p>For every character of the alphabet:
 * <ul>
 *   <li>if it occurs in both strings, the smaller count is added to the
 *       {@code Constants.SCOST} multiplier and the absolute difference of the counts to the
 *       {@code Constants.DCOST} multiplier;</li>
 *   <li>if it occurs in exactly one string, the sum of both counts is added to the
 *       {@code Constants.DDCOST} multiplier;</li>
 *   <li>if it occurs in neither string, it contributes nothing.</li>
 * </ul>
 *
 * @param g0 the first gene
 * @param g1 the second gene
 * @return {@code 0} when {@code g0.equals(g1)}; otherwise the weighted sum
 *         {@code SCOST * shared + DDCOST * exclusive + DCOST * difference}
 */
public static int simpleScore(Gene g0, Gene g1) {
    if (g0.equals(g1)) { // identical genes have distance 0
        return 0;
    }

    String frag0 = g0.getDNA();
    String frag1 = g1.getDNA();

    int multSCOST = 0;
    int multDCOST = 0;
    int multDDCOST = 0;

    for (char c : Constants.DNA_CHARS) { // look through all acceptable DNA chars
        // Count once per fragment; presence is derived from the count, so no additional
        // indexOf scans over the strings are needed.
        // NOTE(review): a null DNA string used to fail in indexOf; here the outcome depends
        // on how StringUtils.countMatches treats null - confirm against the library in use.
        String symbol = String.valueOf(c);
        int count0 = StringUtils.countMatches(frag0, symbol);
        int count1 = StringUtils.countMatches(frag1, symbol);

        boolean inFrag0 = count0 > 0;
        boolean inFrag1 = count1 > 0;

        if (inFrag0 && inFrag1) {
            // the char appears in both frags
            multDCOST += Math.abs(count0 - count1);
            multSCOST += Math.min(count0, count1);
        } else if (inFrag0 || inFrag1) {
            // the char appears in one frag but not the other
            multDDCOST += count0 + count1;
        }
    }

    return Constants.SCOST * multSCOST + Constants.DDCOST * multDDCOST + Constants.DCOST * multDCOST;
}

根据回复更新了代码

这导致了相当大的加速。感谢所有评论的人。还有其他人有什么想法吗?

/**
 * Computes a distance score between two genes from the per-character occurrence counts of
 * their DNA strings, as produced by {@code countOccurrences} over {@code Constants.DNA_CHARS}.
 *
 * <p>Characters present in both strings contribute their smaller count to the
 * {@code Constants.SCOST} multiplier and the absolute count difference to the
 * {@code Constants.DCOST} multiplier; characters present in only one string contribute their
 * count to the {@code Constants.DDCOST} multiplier.
 *
 * @param g0 the first gene
 * @param g1 the second gene
 * @return {@code 0} when {@code g0.equals(g1)}; otherwise the weighted sum of the three
 *         multipliers
 */
public static int simpleScore(Gene g0, Gene g1) {
    // Identical genes are at distance 0 by definition.
    if (g0.equals(g1)) {
        return 0;
    }

    final int[] countsA = countOccurrences(g0.getDNA(), Constants.DNA_CHARS);
    final int[] countsB = countOccurrences(g1.getDNA(), Constants.DNA_CHARS);

    int sharedTotal = 0;     // multiplier for SCOST
    int differenceTotal = 0; // multiplier for DCOST
    int exclusiveTotal = 0;  // multiplier for DDCOST

    final int alphabetSize = Constants.DNA_CHARS.length;
    int slot = 0;
    while (slot < alphabetSize) {
        final int a = countsA[slot];
        final int b = countsB[slot];

        if (a != 0 && b != 0) {
            // Present in both fragments.
            differenceTotal += Math.abs(a - b);
            sharedTotal += Math.min(a, b);
        } else if (a > 0 || b > 0) {
            // Present in exactly one fragment (the other count is zero here).
            exclusiveTotal += a + b;
        }

        slot++;
    }

    return Constants.SCOST * sharedTotal
            + Constants.DDCOST * exclusiveTotal
            + Constants.DCOST * differenceTotal;
}

// from MAK on SO

private static int[] countOccurrences(String x, char[] validDNAChars){

int[] count=new int[validDNAChars.length];

for(int i=0;i

int index=-1;

for(int j=0;j

if (x.charAt(i)==validDNAChars[j]){

index=j;

}

}

if (index>=0) count[index]++;

}

return count;

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值