java人物跳舞的算法_最佳字符串匹配算法(Damerau-Levenshtein距离算法)的Java实现...

原文:http://www.javacodegeeks.com/2013/11/java-implementation-of-optimal-string-alignment.html

---------------------------------------------------------------------------------------------------------------------------------

For a while, I’ve used the Apache Commons lang StringUtils implementation ofLevenshtein distance. ?It implements a few well known tricks to use less memory by only hanging on to two arrays instead of allocating a huge n x m table for the memoisation table. ?It also only checks a “stripe” of width 2 * k?+1 where k is the maximum number of edits.

?

In most practical usages of levenshtein you just care if a string is within some small number (1, 2, 3) of edits from another string. ?This avoid much of the n * m computation that makes levenstein “expensive”. ?We found that with a k <= 3, levenshtein with these tricks was faster than?Jaro-Winkler distance, which is an approximate edit distance calculation that was created to be a faster approximate (well there were many reasons).

?

Unfortunately, the Apache Commons Lang implementation only calculates Levenshtein and not the possible more useful?Damerau-Levenshtein distance. ?Levenshtein defines the edit operations insert, delete, and substitute. ?The Damerau variant adds *transposition* to the list, which is pretty useful for most of the places I use edit distance. ?Unfortunately DL distance is not a true metric in that it doesn’t respect the triangle inequality, but there are plenty of applications that are unaffected by this. ?As you can see from that wikipedia page, there is often confusion between Optimal String Alignment and DL distance. ?In practice OSA is a simpler algorithm and requires less book-keeping so the runtime is probably marginally faster.

?

I could not find any implementations of OSA or DL that used the memory tricks and “stripe” tricks that I saw in Apache Commons Lang. ?So I implemented my own OSA using those tricks. ?At some point I’ll also implement DL with the tricks and see what the performance differences are:

?

Here’s OSA in Java. ?It’s public domain; feel free to use as you like. The unit tests are below. Only dependency is on Guava- but its just the preconditions class and an annotation for documentation so easy to remove that dependency if you like:

?

package com.github.steveash.util;

import static com.google.common.base.Preconditions.checkArgument;

import static com.google.common.base.Preconditions.checkNotNull;

import static com.google.common.primitives.Shorts.checkedCast;

import static java.lang.Math.abs;

import static java.lang.Math.max;

import java.util.Arrays;

import com.google.common.annotations.VisibleForTesting;

/**

* Implementation of the OSA which is similar to the Damerau-Levenshtein in that it allows for transpositions to

* count as a single edit distance, but is not a true metric and can over-estimate the cost because it disallows

* substrings to edited more than once. See wikipedia for more discussion on OSA vs DL

*

* See Algorithms on Strings, Trees and Sequences by Dan Gusfield for more information.

*

* This also has a set of local buffer implementations to avoid allocating new buffers each time, which might be

* a premature optimization

*

* @author Steve Ash

*/

public class OptimalStringAlignment {

private static final int threadLocalBufferSize = 64;

private static final ThreadLocal costLocal = new ThreadLocal() {

@Override

protected short[] initialValue() {

return new short[threadLocalBufferSize];

}

};

private static final ThreadLocal back1Local = new ThreadLocal() {

@Override

protected short[] initialValue() {

return new short[threadLocalBufferSize];

}

};

private static final ThreadLocal back2Local = new ThreadLocal() {

@Override

protected short[] initialValue() {

return new short[threadLocalBufferSize];

}

};

public static int editDistance(CharSequence s, CharSequence t, int threshold) {

checkNotNull(s, "cannot measure null strings");

checkNotNull(t, "cannot measure null strings");

checkArgument(threshold >= 0, "Threshold must not be negative");

checkArgument(s.length() < Short.MAX_VALUE, "Cannot take edit distance of strings longer than 32k chars");

checkArgument(t.length() < Short.MAX_VALUE, "Cannot take edit distance of strings longer than 32k chars");

if (s.length() + 1 > threadLocalBufferSize || t.length() + 1 > threadLocalBufferSize)

return editDistanceWithNewBuffers(s, t, checkedCast(threshold));

short[] cost = costLocal.get();

short[] back1 = back1Local.get();

short[] back2 = back2Local.get();

return editDistanceWithBuffers(s, t, checkedCast(threshold), back2, back1, cost);

}

@VisibleForTesting

static int editDistanceWithNewBuffers(CharSequence s, CharSequence t, short threshold) {

int slen = s.length();

short[] back1 = new short[slen + 1]; // "up 1" row in table

short[] back2 = new short[slen + 1]; // "up 2" row in table

short[] cost = new short[slen + 1]; // "current cost"

return editDistanceWithBuffers(s, t, threshold, back2, back1, cost);

}

private static int editDistanceWithBuffers(CharSequence s, CharSequence t, short threshold,

short[] back2, short[] back1, short[] cost) {

short slen = (short) s.length();

short tlen = (short) t.length();

// if one string is empty, the edit distance is necessarily the length of the other

if (slen == 0) {

return tlen <= threshold ? tlen : -1;

} else if (tlen == 0) {

return slen <= threshold ? slen : -1;

}

// if lengths are different > k, then can't be within edit distance

if (abs(slen - tlen) > threshold)

return -1;

if (slen > tlen) {

// swap the two strings to consume less memory

CharSequence tmp = s;

s = t;

t = tmp;

slen = tlen;

tlen = (short) t.length();

}

initMemoiseTables(threshold, back2, back1, cost, slen);

for (short j = 1; j <= tlen; j++) {

cost[0] = j; // j is the cost of inserting this many characters

// stripe bounds

int min = max(1, j - threshold);

int max = min(slen, (short) (j + threshold));

// at this iteration the left most entry is "too much" so reset it

if (min > 1) {

cost[min - 1] = Short.MAX_VALUE;

}

iterateOverStripe(s, t, j, cost, back1, back2, min, max);

// swap our cost arrays to move on to the next "row"

short[] tempCost = back2;

back2 = back1;

back1 = cost;

cost = tempCost;

}

// after exit, the current cost is in back1

// if back1[slen] > k then we exceeded, so return -1

if (back1[slen] > threshold) {

return -1;

}

return back1[slen];

}

private static void iterateOverStripe(CharSequence s, CharSequence t, short j,

short[] cost, short[] back1, short[] back2, int min, int max) {

// iterates over the stripe

for (int i = min; i <= max; i++) {

if (s.charAt(i - 1) == t.charAt(j - 1)) {

cost[i] = back1[i - 1];

} else {

cost[i] = (short) (1 + min(cost[i - 1], back1[i], back1[i - 1]));

}

if (i >= 2 && j >= 2) {

// possible transposition to check for

if ((s.charAt(i - 2) == t.charAt(j - 1)) &&

s.charAt(i - 1) == t.charAt(j - 2)) {

cost[i] = min(cost[i], (short) (back2[i - 2] + 1));

}

}

}

}

private static void initMemoiseTables(short threshold, short[] back2, short[] back1, short[] cost, short slen) {

// initial "starting" values for inserting all the letters

short boundary = (short) (min(slen, threshold) + 1);

for (short i = 0; i < boundary; i++) {

back1[i] = i;

back2[i] = i;

}

// need to make sure that we don't read a default value when looking "up"

Arrays.fill(back1, boundary, slen + 1, Short.MAX_VALUE);

Arrays.fill(back2, boundary, slen + 1, Short.MAX_VALUE);

Arrays.fill(cost, 0, slen + 1, Short.MAX_VALUE);

}

private static short min(short a, short b) {

return (a <= b ? a : b);

}

private static short min(short a, short b, short c) {

return min(a, min(b, c));

}

}

?

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值