Java—SimHash原理与实现
SimHash 原理
原理链接
SimHash 实现
package GetSimilar;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.Map;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
/**
 * SimHash fingerprinting of text with TF-IDF style term weights.
 *
 * <p>A text is sanitised, segmented with ansj, and every term that has an entry in the
 * {@link #IDF} table votes on each bit of the fingerprint with a weight derived from its
 * frequency and IDF value. Two texts are compared through the Hamming distance of their
 * fingerprints.
 */
public class SimHash2 {
    /** Fingerprint width used by the no-argument constructor. */
    private static final int DEFAULT_HASH_BITS = 64;

    /** Location of the tab-separated {@code word<TAB>idf} table read by {@link #getIDF()}. */
    private static final String DEFAULT_IDF_PATH = "D:/Document/data/Similar/idf.txt";

    /** Multiplier used by the per-term string hash. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /** Words dropped by {@link #wordAnalyzer(String)} before hashing. */
    private static final String[] STOP_WORDS =
            { "如图所示", "的", "中", "下列", "说法", "正确", "是", "若", "为", "则", "在" };

    /** Number of bits in a fingerprint; always positive. */
    private final int hashbits;

    /** Mask with the low {@link #hashbits} bits set. */
    private final BigInteger mask;

    /** Term to IDF value; terms without an entry are ignored when fingerprinting. */
    public Map<String, Double> IDF;

    /** Creates a 64-bit hasher whose IDF table is loaded via {@link #getIDF()}. */
    public SimHash2() {
        this(DEFAULT_HASH_BITS);
    }

    /**
     * Creates a hasher with the given fingerprint width and loads the IDF table via
     * {@link #getIDF()}.
     *
     * @param hashbits number of fingerprint bits
     * @throws IllegalArgumentException if {@code hashbits} is not positive
     */
    public SimHash2(int hashbits) {
        this.hashbits = requirePositive(hashbits);
        this.mask = lowBitsMask(hashbits);
        // Previously this constructor left IDF null, so any comparison failed with a
        // NullPointerException.
        this.IDF = getIDF();
    }

    /**
     * Creates a hasher with the given fingerprint width and a caller-supplied IDF table,
     * without touching the file system.
     *
     * @param hashbits number of fingerprint bits
     * @param idf term to IDF value; copied, must not be {@code null}
     * @throws IllegalArgumentException if {@code hashbits} is not positive or {@code idf} is null
     */
    public SimHash2(int hashbits, Map<String, Double> idf) {
        if (idf == null) {
            throw new IllegalArgumentException("idf must not be null");
        }
        this.hashbits = requirePositive(hashbits);
        this.mask = lowBitsMask(hashbits);
        this.IDF = new HashMap<String, Double>(idf);
    }

    private static int requirePositive(int hashbits) {
        if (hashbits <= 0) {
            throw new IllegalArgumentException("hashbits must be positive: " + hashbits);
        }
        return hashbits;
    }

    private static BigInteger lowBitsMask(int bits) {
        return BigInteger.ONE.shiftLeft(bits).subtract(BigInteger.ONE);
    }

    /**
     * Reads the IDF table from the default path.
     *
     * @return the entries that could be read; empty if the file cannot be opened. An I/O
     *         failure is reported on stderr and whatever was read so far is returned.
     */
    public Map<String, Double> getIDF() {
        return loadIdf(DEFAULT_IDF_PATH);
    }

    /** Reads a UTF-8, tab-separated {@code word<TAB>idf} file, skipping malformed lines. */
    private static Map<String, Double> loadIdf(String path) {
        Map<String, Double> result = new HashMap<String, Double>();
        FileInputStream fis = null;
        BufferedReader br = null;
        try {
            fis = new FileInputStream(path);
            br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
            String line;
            while ((line = br.readLine()) != null) {
                String[] split = line.split("\t");
                if (split.length < 2) {
                    continue; // no value column: skip instead of aborting the whole load
                }
                try {
                    result.put(split[0], Double.valueOf(split[1]));
                } catch (NumberFormatException ignored) {
                    // Non-numeric value: skip this line and keep reading the rest.
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the reader also closes the underlying stream; the stream is closed
            // directly only when the reader was never created.
            closeQuietly(br != null ? br : fis);
        }
        return result;
    }

    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close of a read-only stream.
        }
    }

    /**
     * Normalises raw text before segmentation: sanitises it through jsoup with an
     * allow-nothing whitelist, lower-cases it, and removes whitespace as well as the literal
     * two-character sequences backslash-r, backslash-n and backslash-t.
     */
    private String cleanResume(String content) {
        content = Jsoup.clean(content, Whitelist.none());
        content = StringUtils.lowerCase(content);
        // NOTE(review): the last entry looks identical to the first; it may originally have
        // been a different space character (e.g. non-breaking) - confirm.
        String[] strings = { " ", "\n", "\r", "\t", "\\r", "\\n", "\\t", " " };
        for (String s : strings) {
            // Literal replace: with replaceAll the "\\r"/"\\n"/"\\t" entries were parsed as
            // regex escapes and matched the control characters again, never the literal
            // backslash sequences.
            content = content.replace(s, "");
        }
        return content;
    }

    /**
     * Computes the fingerprint of a text.
     *
     * <p>Each term occurrence with a known IDF adds its weight to every bit position where
     * the term hash has a 1 and subtracts it where the hash has a 0; a fingerprint bit is set
     * when the accumulated value is not negative.
     */
    private BigInteger simHash(String tokens) {
        tokens = cleanResume(tokens);
        // double, not int: accumulating into an int[] truncated every fractional weight.
        double[] v = new double[this.hashbits];
        Result terms = wordAnalyzer(tokens);

        Map<String, Integer> wordCount = new HashMap<String, Integer>();
        for (Term term : terms) {
            String name = term.getName();
            Integer seen = wordCount.get(name);
            wordCount.put(name, seen == null ? 1 : seen + 1);
        }

        // NOTE(review): the denominator is the number of distinct words, and the loop below
        // visits every occurrence, so repeated words are weighted more than once. Kept as in
        // the original; confirm this is the intended weighting.
        int len = wordCount.size();
        for (Term term : terms) {
            String word = term.getName();
            Double idf = IDF.get(word);
            if (idf == null) {
                continue; // unknown term: contributes nothing
            }
            double tf = (double) wordCount.get(word) / len;
            double weight = 100 * tf * idf;
            BigInteger t = this.hash(word);
            for (int i = 0; i < this.hashbits; i++) {
                if (t.testBit(i)) {
                    v[i] += weight;
                } else {
                    v[i] -= weight;
                }
            }
        }

        BigInteger fingerprint = BigInteger.ZERO;
        for (int i = 0; i < this.hashbits; i++) {
            if (v[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes one term to a non-negative integer. Empty or null input hashes to zero; terms
     * shorter than three characters are padded by repeating their first character.
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        while (source.length() < 3) {
            source = source + source.charAt(0);
        }
        char[] chars = source.toCharArray();
        BigInteger x = BigInteger.valueOf(((long) chars[0]) << 7);
        for (char c : chars) {
            x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf((long) c)).and(mask);
        }
        // x is non-negative after masking, so the former "x == -1" special case could never
        // trigger and has been dropped.
        return x.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Returns the number of differing bits between the fingerprints of the two texts.
     *
     * @param s1 first text
     * @param s2 second text
     * @return Hamming distance within the low {@code hashbits} bits
     */
    public int hammingDistance(String s1, String s2) {
        BigInteger one = this.simHash(s1);
        BigInteger two = this.simHash(s2);
        return one.xor(two).and(mask).bitCount();
    }

    /**
     * Returns the similarity of two texts as {@code 1 - hammingDistance / hashbits}.
     *
     * @param s1 first text
     * @param s2 second text
     * @return the fraction of fingerprint bits on which the two texts agree
     */
    public double getSemblance(String s1, String s2) {
        double distance = (double) this.hammingDistance(s1, s2);
        return 1 - distance / this.hashbits;
    }

    /**
     * Segments a line with ansj and filters the result through a stop-word recogniser.
     *
     * @param line text to segment
     * @return the filtered segmentation result
     */
    public static Result wordAnalyzer(String line) {
        StopRecognition filter = new StopRecognition();
        // Part-of-speech tags to drop; presumably punctuation ("w"), untagged ("null") and
        // numerals ("m") in the ansj tag set - confirm against the ansj documentation.
        filter.insertStopNatures("w");
        filter.insertStopNatures("null");
        filter.insertStopNatures("m");
        filter.insertStopWords(STOP_WORDS);
        Result filteredContent = ToAnalysis.parse(line).recognition(filter);
        return filteredContent;
    }

    /** Demo: prints the similarity of two sample sentences. */
    public static void main(String[] args) {
        String s1 = "如图所示,电源电压保持不变,闭合开关 S 0,滑动变阻器R的滑片向右移动的过程中,下列说法正确的是 A.闭合开关S,若甲、乙均为电压表,则两表示数均变小 B.断开开关S,若甲、乙均为电流表,则两表示数均变大 C.闭合开关S,若甲、乙均为电压表,则甲示数不变,乙示数变大 D.断开开关S,若甲、乙均为电流表,则乙示数不变,甲示数变小";
        String s2 = "小明今天去吃肯德基";
        // The class defined in this file is SimHash2; "SimHash" is not declared here.
        SimHash2 ob = new SimHash2();
        System.out.println(ob.getSemblance(s1, s2));
    }
}