最近项目中发现客户数据重复的问题:例如"**市A有限公司"与"**A有限公司",名称只差一个"市"字或"有限"之类的字样,导致同一个客户在系统中出现多条记录。现在利用字符串相似度(编辑距离)比较,找出名称相近的客户,交给业务部门确认。
引用原文地址
http://darkmasky.iteye.com/blog/1115039
程序代码如下。由于只是一次性使用,并没有写得很规范。Main.java 中被注释掉的那一段如果启用,准确率会更高,但匹配范围会变小,所以仅作参考。
package com.shine.db;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import a.Customer;
/**
 * One-off data access helper that loads customer rows from the
 * {@code t_bd_customer} table over JDBC.
 */
public class JDBC {
    /**
     * Reads the number, name and name length of every customer, ordered by
     * the name length computed in the query.
     *
     * <p>Failures are not propagated: any exception is reported with
     * {@code printStackTrace()} and the rows read so far (possibly none) are
     * returned. The result set, statement and connection are always closed,
     * including when the query or the row iteration fails.
     *
     * @return the customers that were read; never {@code null}
     */
    public static List<Customer> getData() {
        String sql = "select t.fnumber,t.fname_l2, length(t.fname_l2) s from t_bd_customer t order by s";
        String driver = "oracle.jdbc.driver.OracleDriver";
        String connStr = "jdbc:oracle:thin:@192.168.0.5:1521:test";
        Connection conn = null;
        Statement stmt = null;
        ResultSet rs = null;
        List<Customer> l = new ArrayList<Customer>();
        try {
            Class.forName(driver);
            conn = DriverManager.getConnection(connStr, "user", "***");
            stmt = conn.createStatement();
            rs = stmt.executeQuery(sql);
            while (rs.next()) {
                Customer c = new Customer();
                c.setName(rs.getString("fname_l2"));
                c.setNumber(rs.getString("fnumber"));
                c.setLen(rs.getInt("s"));
                l.add(c);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition. Each close is guarded
            // separately so a failure closing one resource cannot prevent the
            // remaining ones (in particular the connection) from being closed.
            if (rs != null) {
                try {
                    rs.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (conn != null) {
                try {
                    conn.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return l;
    }
}
package a;
/**
 * Mutable value holder for a single customer record: its number, its name
 * and a length value.
 *
 * <p>The class performs no validation and does not derive {@code len} from
 * {@code name}; each property holds exactly what was last passed to its
 * setter.
 */
public class Customer {
    /** Customer number; {@code null} until set. */
    String number;
    /** Customer name; {@code null} until set. */
    String name;
    /** Length value supplied by the caller; {@code 0} until set. */
    int len;

    /**
     * @return the customer name, or {@code null} if none has been set
     */
    public String getName() {
        return this.name;
    }

    /**
     * @param name the customer name to store
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the customer number, or {@code null} if none has been set
     */
    public String getNumber() {
        return this.number;
    }

    /**
     * @param number the customer number to store
     */
    public void setNumber(String number) {
        this.number = number;
    }

    /**
     * @return the stored length value
     */
    public int getLen() {
        return this.len;
    }

    /**
     * @param len the length value to store
     */
    public void setLen(int len) {
        this.len = len;
    }
}
package a;
import java.util.List;
import com.shine.db.JDBC;
/**
 * One-off tool that prints pairs of customers whose names are nearly
 * identical, so that likely duplicates can be reviewed by hand.
 *
 * <p>Similarity is {@code 1 - editDistance / longerLength}, where the edit
 * distance is the Levenshtein distance between the two names.
 */
public class Main {
    /** Minimum similarity ratio at which a pair of names is reported. */
    private static final double SIMILARITY_THRESHOLD = 0.85;

    /**
     * Loads all customers and prints every pair whose name similarity reaches
     * the threshold, one pair per line, as {@code name1(number1),name2(number2)}.
     *
     * <p>Customers whose name is {@code null} or empty are skipped: there is
     * nothing to compare, and a {@code null} name would otherwise abort the
     * whole run with a {@code NullPointerException}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        List<Customer> data = JDBC.getData();
        for (int i = 0; i < data.size(); i++) {
            Customer c1 = data.get(i);
            String s1 = c1.getName();
            if (hasNoName(s1)) {
                continue;
            }
            // Start at i + 1 so each unordered pair is examined exactly once.
            for (int j = i + 1; j < data.size(); j++) {
                Customer c2 = data.get(j);
                String s2 = c2.getName();
                if (hasNoName(s2)) {
                    continue;
                }
                double similarDegree = getSimilarityRatio(s1, s2);
                if (similarDegree >= SIMILARITY_THRESHOLD) {
                    System.out.println(s1 + "(" + c1.getNumber() + "),"
                            + s2 + "(" + c2.getNumber() + ")");
                }
                // Stricter alternative, kept for reference (more precise but
                // reports fewer pairs): only compare names of different
                // length and require a ratio of at least 0.9.
                // if (c1.getLen() != c2.getLen()) {
                //     double similarDegree = getSimilarityRatio(s1, s2);
                //     if (similarDegree >= 0.9) {
                //         System.out.println(s1 + "(" + c1.getNumber() + "),"
                //                 + s2 + "(" + c2.getNumber() + ")");
                //     }
                // }
            }
        }
    }

    /** Returns true when a customer name is null or empty and cannot be compared. */
    private static boolean hasNoName(String name) {
        return name == null || name.length() == 0;
    }

    /**
     * Computes the Levenshtein edit distance between two strings: the minimum
     * number of single-character insertions, deletions and substitutions
     * needed to turn {@code str} into {@code target}. Characters are compared
     * as Java {@code char} values.
     *
     * @param str    first string, not null
     * @param target second string, not null
     * @return the edit distance, between 0 and the longer string's length
     */
    private static int compare(String str, String target) {
        int n = str.length();
        int m = target.length();
        // Against an empty string the distance is the other string's length.
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        // d[i][j] = distance between the first i chars of str and the first
        // j chars of target.
        int[][] d = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) { // first column: delete i characters
            d[i][0] = i;
        }
        for (int j = 0; j <= m; j++) { // first row: insert j characters
            d[0][j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char ch1 = str.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                char ch2 = target.charAt(j - 1);
                // Substitution costs nothing when the characters already match.
                int cost = (ch1 == ch2) ? 0 : 1;
                // Cheapest of: deletion (above), insertion (left),
                // substitution or match (diagonal).
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1,
                        d[i - 1][j - 1] + cost);
            }
        }
        return d[n][m];
    }

    /** Returns the smallest of three ints. */
    private static int min(int one, int two, int three) {
        return Math.min(Math.min(one, two), three);
    }

    /**
     * Returns the similarity of two strings as a ratio in {@code [0, 1]}:
     * {@code 1 - editDistance / max(length)}. Identical strings yield 1 and
     * strings with no matching alignment yield 0.
     *
     * <p>Two empty strings are treated as identical and yield 1; without this
     * guard the formula would evaluate 0/0 and return NaN.
     *
     * @param str    first string, not null
     * @param target second string, not null
     * @return the similarity ratio
     * @throws NullPointerException if either argument is null
     */
    public static float getSimilarityRatio(String str, String target) {
        int longest = Math.max(str.length(), target.length());
        if (longest == 0) {
            return 1f;
        }
        return 1 - (float) compare(str, target) / longest;
    }
}