数据集成字符串匹配算法:Edit Distance,Needleman-Wunsch,Soundex,Jaccard

出品人:孙林,乔嘉林

String matching

EditDistance

计算两个字符串之间的编辑距离,即把一个字符串变成另一个字符串最少需要的单字符操作(插入、删除、替换)次数。距离越小越相近。适用于任意两个字符串的比较,尤其是长度相近的字符串。
“abc”
“abb”
结果为1

/**
 * Levenshtein edit distance between two strings, computed with a full
 * dynamic-programming table.
 */
public class EditDistance {

    /**
     * Demo entry point: prints a greeting followed by the distance between
     * two sample names.
     */
    public static void main(String[] args) {
        System.out.println("helloworld");
        int result = minDistance("David Smiths", "Davidd Simth");
        System.out.println("distance = " + result);
    }

    /**
     * Returns the minimum number of single-character insertions, deletions
     * and substitutions needed to turn {@code word1} into {@code word2}.
     *
     * @param word1 the source string
     * @param word2 the target string
     * @return the edit distance; 0 when the strings are equal
     */
    public static int minDistance(String word1, String word2) {
        final int rows = word1.length() + 1;
        final int cols = word2.length() + 1;

        // cost[r][c] = distance between the first r chars of word1
        // and the first c chars of word2.
        final int[][] cost = new int[rows][cols];

        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                if (r == 0) {
                    // Empty prefix of word1: c insertions.
                    cost[r][c] = c;
                } else if (c == 0) {
                    // Empty prefix of word2: r deletions.
                    cost[r][c] = r;
                } else if (word1.charAt(r - 1) == word2.charAt(c - 1)) {
                    // Last characters agree: nothing to pay for this position.
                    cost[r][c] = cost[r - 1][c - 1];
                } else {
                    int cheapestNeighbour = Math.min(
                            cost[r - 1][c - 1],
                            Math.min(cost[r - 1][c], cost[r][c - 1]));
                    cost[r][c] = cheapestNeighbour + 1;
                }
            }
        }
        return cost[rows - 1][cols - 1];
    }
}

Needleman-Wunsch

基于全局序列比对(允许插入空位)的文本比较,适用于两个字符串长度差距比较大的情况。比对得分越大越相近。需要提前给定一个字符匹配打分表和空位罚分。
“dva”,“deeve”
结果为1
“dva”,“dva”
结果为6

import java.util.StringJoiner;

/**
 * Created by forestneo on 2016/12/25.
 */

/**
 * Needleman-Wunsch global alignment score for two lowercase strings.
 *
 * <p>Scoring uses a 26x26 substitution table indexed by {@code ch - 'a'} and a
 * single linear gap penalty. Both are static state configured through
 * {@link #initilizeTable(int)}; until that method is called the table is all
 * zeros and the gap penalty is 1.
 */
public class NeedlemanWunch {

    /** Number of letters covered by the score table ('a'..'z'). */
    private static final int ALPHABET_SIZE = 26;

    /** Substitution scores, indexed by {@link #index(char)} of each character. */
    private static final int[][] scoreTable = new int[ALPHABET_SIZE][ALPHABET_SIZE];

    /** Penalty subtracted from the score for every gap position. */
    private static int gap = 1;

    /** Demo entry point: aligns "dva" with "deeve" using a gap penalty of 1. */
    public static void main(String[] args) {
        String str1 = "dva";
        String str2 = "deeve";

        initilizeTable(1);
        System.out.println("length = " + needleman(str1, str2));
    }

    /**
     * Sets the gap penalty and (re)builds the demo score table: every pair
     * scores -1 except the exact matches d/d, a/a, v/v and e/e, which score 2.
     *
     * @param blankGap the value of gap for blank
     */
    public static void initilizeTable(int blankGap) {
        gap = blankGap;

        /*Table for test "dave"*/
        for (int i = 0; i < ALPHABET_SIZE; i++) {
            for (int j = 0; j < ALPHABET_SIZE; j++) {
                scoreTable[i][j] = -1;
            }
        }
        scoreTable[index('d')][index('d')] = 2;
        scoreTable[index('a')][index('a')] = 2;
        scoreTable[index('v')][index('v')] = 2;
        scoreTable[index('e')][index('e')] = 2;
    }

    /**
     * Maps a lowercase letter to its row/column in the score table.
     *
     * @param ch a character; only 'a'..'z' yield a valid table index
     * @return {@code ch - 'a'}
     */
    public static int index(char ch) {
        return ch - 'a';
    }

    /**
     * Computes the global alignment score of two strings with the current
     * score table and gap penalty.
     *
     * @param string1 first string; characters must be in 'a'..'z'
     * @param string2 second string; characters must be in 'a'..'z'
     * @return the best alignment score (higher means more similar)
     * @throws ArrayIndexOutOfBoundsException if a character outside 'a'..'z'
     *         is looked up in the score table
     */
    public static int needleman(String string1, String string2) {
        int rows = string1.length() + 1;
        int cols = string2.length() + 1;
        int[][] dpTable = new int[rows][cols];

        // Border cells align a prefix against nothing, i.e. a run of gaps.
        // Each gap must cost the configured penalty, not a fixed 1, otherwise
        // the border disagrees with the recurrence whenever gap != 1.
        for (int j = 0; j < cols; j++) {
            dpTable[0][j] = -j * gap;
        }
        for (int i = 0; i < rows; i++) {
            dpTable[i][0] = -i * gap;
        }

        for (int i = 0; i < string1.length(); i++) {
            char a = string1.charAt(i);
            for (int j = 0; j < string2.length(); j++) {
                char b = string2.charAt(j);

                // Filling dpTable[i+1][j+1]: gap in string2, gap in string1,
                // or align a with b.
                int gapValue1 = dpTable[i][j + 1] - gap;
                int gapValue2 = dpTable[i + 1][j] - gap;
                int matchValue = dpTable[i][j] + scoreTable[index(a)][index(b)];

                dpTable[i + 1][j + 1] = Math.max(Math.max(gapValue1, gapValue2), matchValue);
            }
        }

        return dpTable[string1.length()][string2.length()];
    }
}

Soundex

将一个不含空格的人名转化为一个长度为4的字符串(首字母加发音编码),这个字符串代表发音。编码不足4位时用字符o补齐。两个人名的编码相同则视为同一个人名,不同则视为不同的人名。
如果有空格则按空格切分分别计算,最后字符串相加。
“Gough”
结果“G2oo”

/*
 * Created by forestneo on 2016/12/22.
 **      0 AEIOUHWY
**      1 BFPV
**      2 CGJKQSXZ
**      3 DT
**      4 L
**      5 MN
**      6 R
*/

    import java.io.IOException;

/**
 * Soundex phonetic encoding of a single name (no spaces).
 *
 * <p>The result is always {@link #CODE_LENGTH} characters: the upper-cased
 * first character followed by up to three digit codes. Short codes are padded
 * with the letter {@code 'o'} (e.g. "Gough" gives "G2oo"), which is this
 * file's convention; classic Soundex pads with the digit zero instead.
 */
public class Soundex {

    /** Digit code for each letter 'A'..'Z'; '0' marks letters that are never emitted. */
    private static final char[] mapping = {
            //a  b   c   d   e   f   g   h   i   j   k   l   m   n
            '0','1','2','3','0','1','2','0','0','2','2','4','5','5',
            //o  p   q   r   s   t   u   v   w   x   y   z
            '0','1','2','6','2','3','0','1','0','2','0','2'
    };

    /** Length of every code returned by {@link #getSoundex(String)}. */
    private static final int CODE_LENGTH = 4;

    /** Character used to fill codes that have fewer than three digits. */
    private static final char PAD = 'o';

    /** Returns true when {@code c} can be looked up in {@link #mapping}. */
    private static boolean isAsciiUpper(char c) {
        return c >= 'A' && c <= 'Z';
    }

    /** Returns the digit code of an upper-case ASCII letter. */
    private static char codeOf(char c) {
        return mapping[c - 'A'];
    }

    /*for Test use*/
    public static void main(String[] args) {
        String inputStr = "Gough";

        String soundex = getSoundex(inputStr);
        System.out.println("soundex = " + soundex);
    }

    /**
     * Encodes a name as a four-character Soundex code.
     *
     * <p>Rules applied after the first character:
     * <ul>
     *   <li>H, W and any character outside A-Z are skipped and do not separate
     *       repeated codes;</li>
     *   <li>vowels (and Y) are not emitted but do separate repeated codes, so
     *       the same digit may appear again after a vowel;</li>
     *   <li>a letter whose code equals the previous letter's code (including
     *       the first letter's own code) is not emitted.</li>
     * </ul>
     *
     * @param inputStr the name to encode; must be non-null and non-empty
     * @return the four-character code, padded with {@code 'o'} when short
     * @throws IllegalArgumentException if {@code inputStr} is null or empty
     */
    public static String getSoundex(String inputStr) {
        if (inputStr == null || inputStr.isEmpty()) {
            throw new IllegalArgumentException("inputStr must be a non-empty name");
        }

        // Locale.ROOT keeps the A-Z mapping stable under e.g. a Turkish default locale.
        char[] charArray = inputStr.toUpperCase(java.util.Locale.ROOT).toCharArray();

        char[] retChar = new char[CODE_LENGTH];
        //step 1: keep the first character, upper-cased like the rest of the input
        retChar[0] = charArray[0];
        int index = 1;

        // Seed with the first letter's code so an equal-coded neighbour
        // (e.g. the 'f' in "Pfister") is not emitted again.
        char pre = isAsciiUpper(charArray[0]) ? codeOf(charArray[0]) : '?';

        for (int i = 1; i < charArray.length && index < CODE_LENGTH; i++) {
            char ch = charArray[i];
            //Step 2: get over 'W' and 'H' (and anything that is not a letter)
            if (ch == 'W' || ch == 'H' || !isAsciiUpper(ch)) {
                continue;
            }
            char c = codeOf(ch);
            //Step 3: vowels are dropped but break a run of equal codes
            if (c == '0') {
                pre = c;
                continue;
            }
            //Step 4: collapse adjacent letters with the same code
            if (c == pre) {
                continue;
            }
            retChar[index++] = c;
            pre = c;
        }
        //length is less than 4, pad with 'o'
        while (index < CODE_LENGTH) {
            retChar[index++] = PAD;
        }
        return new String(retChar);
    }
}

Jaccard

计算两个字符串的Jaccard相似度。每个字符串先转化为一个二元组(bigram)集合,首尾用#补位,如“abc”->{#a,ab,bc,c#};结果为两个集合的交集大小除以并集大小。值越大越相似。适用于任意两个字符串。
“dav”,“dave”
交集大小o为3,即{#d,da,av}
并集大小为6,Jaccard为3/6=0.5

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
* Created by qiaojialin on 2016/12/25.
*/

/**
 * Jaccard similarity of two strings over their character bigrams.
 *
 * <p>Each string is padded with a {@code '#'} on both sides and split into
 * overlapping two-character pieces, e.g. "abc" becomes {#a, ab, bc, c#}.
 */
public class Jaccard {

    /** Marker added before the first and after the last character. */
    private static final char BOUNDARY = '#';

    /** Demo entry point: prints the overlap size and similarity of "dave" with itself. */
    public static void main(String[] args) {
        float j = jaccard("dave", "dave");
        int o = o("dave", "dave");
        System.out.println(o);
        System.out.println(j);
    }

    /**
     * Returns the number of bigrams the two strings have in common.
     *
     * @param a first string, non-null
     * @param b second string, non-null
     * @return size of the intersection of the two bigram sets
     */
    public static int o(String a, String b) {
        return intersection(set(a), set(b)).size();
    }

    /**
     * Returns the Jaccard similarity |A ∩ B| / |A ∪ B| of the two bigram sets.
     *
     * @param a first string, non-null
     * @param b second string, non-null
     * @return a value in [0, 1]; 1 when both strings produce the same set
     */
    public static float jaccard(String a, String b) {
        Set<String> setA = set(a);
        Set<String> setB = set(b);

        Set<String> union = new HashSet<String>(setA);
        union.addAll(setB);

        float interSize = intersection(setA, setB).size();
        // Never zero: every bigram set holds at least one element.
        float unionSize = union.size();

        return interSize / unionSize;
    }

    /**
     * Builds the padded bigram set of a string.
     *
     * <p>The empty string yields the single bigram "##" rather than failing,
     * so two empty strings compare as identical.
     *
     * @param x the string to split, non-null
     * @return a new mutable set of two-character pieces of {@code "#" + x + "#"}
     * @throws NullPointerException if {@code x} is null
     */
    public static Set<String> set(String x) {
        if (x == null) {
            throw new NullPointerException("x");
        }
        String padded = BOUNDARY + x + BOUNDARY;

        Set<String> grams = new HashSet<String>();
        for (int i = 0; i + 1 < padded.length(); i++) {
            grams.add(padded.substring(i, i + 2));
        }
        return grams;
    }

    /** Returns a new set holding the elements present in both arguments. */
    private static Set<String> intersection(Set<String> left, Set<String> right) {
        Set<String> common = new HashSet<String>(left);
        common.retainAll(right);
        return common;
    }
}

Record matching:

默认两个Record的各个属性对应相同的schema,这样每个对应属性可以按String matching方法比较,总相似度为各属性String matching结果的加权和。权重可以结合领域信息来设定,如名字不匹配的比重较高,手机号不匹配的比重较低。

Schema matching:

给定两张表,列数可能不一样,需要自己指定哪些列代表相同的含义,对应关系可能是多对多。先手动合并和转化列,转化成相同schema的record,再做record matching。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值