();
+
+ StringBuffer buffer = new StringBuffer();
- //Step Two: Shrink.
- for(int i=0;i0 ? 1 : 0;
- if(bit == 1){
- result |= 1 << (HASH_LENGTH-1-i);
+ for (int i = 0; i < this.intSimHash.bitLength(); i++) {
+ // 褰撲笖浠呭綋璁剧疆浜嗘寚瀹氱殑浣嶆椂锛岃繑鍥� true
+ boolean sr = simHash.intSimHash.testBit(i);
+
+ if (sr) {
+ buffer.append("1");
+ } else {
+ buffer.append("0");
+ }
+
+ if ((i + 1) % numEach == 0) {
+ // 灏嗕簩杩涘埗杞负BigInteger
+ BigInteger eachValue = new BigInteger(buffer.toString(), 2);
+ System.out.println("----" + eachValue);
+ buffer.delete(0, buffer.length());
+ characters.add(eachValue);
}
}
- System.out.println("String \""+str+ "\" hashcode is:"+result
- +". Binary format is: "+Integer.toBinaryString(result));
- return result;
+
+ return characters;
}
-}
+    /**
+     * Demo entry point: builds 64-bit SimHash objects for three sample texts, prints each
+     * fingerprint, and finally prints two distance measures between the first text and each
+     * of the other two.
+     *
+     * @param args ignored
+     * @throws IOException declared on the signature; presumably raised while a SimHash is
+     *         built -- TODO confirm against the constructor
+     */
+    public static void main(String[] args) throws IOException {
+        final int hashBits = 64;
+        final int maxDistance = 3;
+
+        // Baseline sample text.
+        String baseText = "浼犵粺鐨� hash 绠楁硶鍙礋璐e皢鍘熷鍐呭灏介噺鍧囧寑闅忔満鍦版槧灏勪负涓�涓鍚嶅�硷紝" + "鍘熺悊涓婄浉褰撲簬浼殢鏈烘暟浜х敓绠楁硶銆備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱"
+                + "濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝" + "鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝瑕佽璁′竴涓� hash 绠楁硶锛�"
+                + "瀵圭浉浼肩殑鍐呭浜х敓鐨勭鍚嶄篃鐩歌繎锛屾槸鏇翠负鑹伴毦鐨勪换鍔★紝鍥犱负瀹冪殑绛惧悕鍊奸櫎浜嗘彁渚涘師濮嬪唴瀹规槸鍚︾浉绛夌殑淇℃伅澶栵紝" + "杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";
+        SimHash baseHash = new SimHash(baseText, hashBits);
+        System.out.println(baseHash.intSimHash + " " + baseHash.intSimHash.bitLength());
+        // Author's note: compute the hash value of each signature block, for Hamming
+        // distances of 3 or less.
+        baseHash.subByDistance(baseHash, maxDistance);
+
+        // Author's note: the first sentence is removed and two noise strings are inserted.
+        String trimmedText = "鍘熺悊涓婄浉褰撲簬浼殢鏈烘暟浜х敓绠楁硶銆備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱"
+                + "濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝" + "鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝瑕佽璁′竴涓� hash 绠楁硶锛�"
+                + "瀵圭浉浼肩殑鍐呭浜х敓鐨勭鍚嶄篃鐩歌繎锛屾槸鏇翠负鑹伴毦鐨勪换鍔★紝鍥犱负瀹冪殑绛惧悕鍊奸櫎浜嗘彁渚涘師濮嬪唴瀹规槸鍚︾浉绛夌殑淇℃伅澶栵紝" + "骞叉壈1杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";
+        SimHash trimmedHash = new SimHash(trimmedText, hashBits);
+        System.out.println(trimmedHash.intSimHash + " " + trimmedHash.intSimHash.bitCount());
+        baseHash.subByDistance(trimmedHash, maxDistance);
+
+        // Author's note: a sentence is prepended and four noise strings are inserted.
+        String extendedText = "imhash绠楁硶鐨勮緭鍏ユ槸涓�涓悜閲忥紝杈撳嚭鏄竴涓� f 浣嶇殑绛惧悕鍊笺�備负浜嗛檲杩版柟渚匡紝" + "鍋囪杈撳叆鐨勬槸涓�涓枃妗g殑鐗瑰緛闆嗗悎锛屾瘡涓壒寰佹湁涓�瀹氱殑鏉冮噸銆�"
+                + "浼犵粺骞叉壈4鐨� hash 绠楁硶鍙礋璐e皢鍘熷鍐呭灏介噺鍧囧寑闅忔満鍦版槧灏勪负涓�涓鍚嶅�硷紝" + "鍘熺悊涓婅繖娆″樊寮傛湁澶氬ぇ鍛�3鐩稿綋浜庝吉闅忔満鏁颁骇鐢熺畻娉曘�備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝"
+                + "璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝" + "鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝"
+                + "瑕佽璁′竴涓� hash 绠楁硶锛屽鐩镐技鐨勫唴瀹逛骇鐢熺殑绛惧悕涔熺浉杩戯紝鏄洿涓鸿壈闅剧殑浠诲姟锛屽洜涓哄畠鐨勭鍚嶅�奸櫎浜嗘彁渚涘師濮�" + "鍐呭鏄惁鐩哥瓑鐨勪俊鎭锛屽共鎵�1杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐嶆潵骞叉壈2鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";
+        SimHash extendedHash = new SimHash(extendedText, hashBits);
+        System.out.println(extendedHash.intSimHash + " " + extendedHash.intSimHash.bitCount());
+        baseHash.subByDistance(extendedHash, maxDistance);
+
+        System.out.println("============================");
+
+        // The string-based distance is evaluated before hammingDistance(), as in the
+        // original statement order.
+        int strDistanceToTrimmed = baseHash.getDistance(baseHash.strSimHash, trimmedHash.strSimHash);
+        System.out.println(baseHash.hammingDistance(trimmedHash) + " " + strDistanceToTrimmed);
+        // Author's note: by the pigeonhole principle (see combinatorics), two signatures
+        // within Hamming distance 3 must share at least one identical block produced by
+        // subByDistance().
+        int strDistanceToExtended = baseHash.getDistance(baseHash.strSimHash, extendedHash.strSimHash);
+        System.out.println(baseHash.hammingDistance(extendedHash) + " " + strDistanceToExtended);
+    }
+}
\ No newline at end of file
diff --git a/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java b/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java
new file mode 100644
index 0000000..de6812b
--- /dev/null
+++ b/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java
@@ -0,0 +1,241 @@
+package com.quick.simhash;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+
+/**
+ * SimHash-based similarity tool for word-segmented articles.
+ *
+ * <p>An article is reduced to a fingerprint of {@code hashBitNum} bits. Two articles are
+ * reported as similar when the number of matching bit positions exceeds
+ * {@code hashBitNum * minSupportValue}.
+ *
+ * @author lyq
+ */
+public class SimHashTool {
+    /** Number of bits in a fingerprint. */
+    private final int hashBitNum;
+    /** Threshold applied to the fraction of matching fingerprint bits. */
+    private final double minSupportValue;
+
+    /**
+     * Creates a tool with the given fingerprint width and similarity threshold.
+     *
+     * @param hashBitNum
+     *            number of fingerprint bits; must be positive
+     * @param minSupportValue
+     *            minimum fraction of matching bits for two articles to count as similar
+     * @throws IllegalArgumentException
+     *             if {@code hashBitNum} is not positive
+     */
+    public SimHashTool(int hashBitNum, double minSupportValue) {
+        if (hashBitNum <= 0) {
+            throw new IllegalArgumentException("hashBitNum must be positive: " + hashBitNum);
+        }
+        this.hashBitNum = hashBitNum;
+        this.minSupportValue = minSupportValue;
+    }
+
+    /**
+     * Compares two articles and prints whether they are similar to standard output.
+     *
+     * @param newsPath1
+     *            path of the first article file
+     * @param newsPath2
+     *            path of the second article file
+     * @throws IllegalStateException
+     *             if either file cannot be read
+     */
+    public void compareArticals(String newsPath1, String newsPath2) {
+        // Load both files, then fingerprint them.
+        String content1 = readDataFile(newsPath1);
+        String content2 = readDataFile(newsPath2);
+        int[] hashArray1 = calSimHashValue(content1);
+        int[] hashArray2 = calSimHashValue(content2);
+
+        // Count the bit positions on which the two fingerprints agree.
+        int sameNum = 0;
+        for (int i = 0; i < hashBitNum; i++) {
+            if (hashArray1[i] == hashArray2[i]) {
+                sameNum++;
+            }
+        }
+
+        // Compare against the minimum threshold and report.
+        double similarity = sameNum * 1.0 / hashBitNum;
+        if (sameNum > this.hashBitNum * this.minSupportValue) {
+            System.out.println(String.format("鐩镐技搴︿负%s,瓒呰繃闃堝��%s,鎵�浠ユ柊闂�1涓庢柊闂�2鏄浉浼肩殑",
+                    similarity, minSupportValue));
+        } else {
+            System.out.println(String.format("鐩镐技搴︿负%s,灏忎簬闃堝��%s,鎵�浠ユ柊闂�1涓庢柊闂�2涓嶆槸鐩镐技鐨�",
+                    similarity, minSupportValue));
+        }
+    }
+
+    /**
+     * Computes the fingerprint of a text.
+     *
+     * <p>The text is split on single spaces. Every token containing a {@code '/'} contributes
+     * the part before the slash as a word; tokens without a slash are skipped. Each word adds
+     * its weight to the positions where its hash has a 1 bit and subtracts it elsewhere; the
+     * sign of each accumulated position gives the final bit.
+     *
+     * @param content
+     *            article text to fingerprint
+     * @return array of {@code hashBitNum} entries, each 0 or 1
+     */
+    private int[] calSimHashValue(String content) {
+        News news = new News(content);
+        news.statWords();
+
+        double[] hashArray = new double[hashBitNum];
+        // Reused for every word; numToBinaryArray overwrites every entry.
+        int[] binaryArray = new int[hashBitNum];
+
+        for (String str : content.split(" ")) {
+            int index = str.indexOf('/');
+            if (index == -1) {
+                continue;
+            }
+            String w = str.substring(0, index);
+
+            // Weight supplied by News; a value of -1 means the word is skipped.
+            double weight = news.getWordFrequentValue(w);
+            if (weight == -1) {
+                continue;
+            }
+
+            // Take the low hashBitNum bits directly from the long. Reducing through
+            // Math.pow (a double) or an int cast would round or truncate the hash.
+            numToBinaryArray(binaryArray, BKDRHash(w));
+
+            for (int i = 0; i < binaryArray.length; i++) {
+                if (binaryArray[i] == 1) {
+                    // A 1 bit adds the weight ...
+                    hashArray[i] += weight;
+                } else {
+                    // ... a 0 bit subtracts it.
+                    hashArray[i] -= weight;
+                }
+            }
+        }
+
+        // Collapse the accumulated values back to bits by sign.
+        int[] resultValue = new int[hashBitNum];
+        for (int i = 0; i < hashArray.length; i++) {
+            resultValue[i] = hashArray[i] > 0 ? 1 : 0;
+        }
+        return resultValue;
+    }
+
+    /**
+     * Writes the low {@code binaryArray.length} bits of a number into an array, most
+     * significant bit first.
+     *
+     * @param binaryArray
+     *            destination array; every entry is overwritten with 0 or 1
+     * @param num
+     *            number to convert
+     */
+    private void numToBinaryArray(int[] binaryArray, long num) {
+        int width = binaryArray.length;
+        for (int i = 0; i < width; i++) {
+            int bitIndex = width - 1 - i;
+            // Java masks long shift distances to 6 bits, so positions past 63 need a guard.
+            binaryArray[i] = bitIndex < Long.SIZE ? (int) ((num >>> bitIndex) & 1L) : 0;
+        }
+    }
+
+    /**
+     * BKDR string hash with seed 31.
+     *
+     * @param str
+     *            string to hash
+     * @return non-negative hash value
+     */
+    public static long BKDRHash(String str) {
+        final int seed = 31; /* 31 131 1313 13131 131313 etc.. */
+        long hash = 0;
+
+        for (int i = 0; i < str.length(); i++) {
+            hash = (hash * seed) + str.charAt(i);
+        }
+
+        // Math.abs(Long.MIN_VALUE) is still negative; map that one value to 0.
+        return hash == Long.MIN_VALUE ? 0L : Math.abs(hash);
+    }
+
+    /**
+     * Reads a whole file into one string. Lines are concatenated without any separator.
+     *
+     * @param filePath
+     *            path of the file to read
+     * @return file content with line terminators removed
+     * @throws IllegalStateException
+     *             if the file cannot be opened or read
+     */
+    private String readDataFile(String filePath) {
+        StringBuilder strBuilder = new StringBuilder();
+        BufferedReader in = null;
+
+        try {
+            // FileReader decodes with the default charset.
+            in = new BufferedReader(new FileReader(new File(filePath)));
+            String str;
+            while ((str = in.readLine()) != null) {
+                strBuilder.append(str);
+            }
+        } catch (IOException e) {
+            throw new IllegalStateException("Unable to read file: " + filePath, e);
+        } finally {
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException ignored) {
+                    // The content has already been read; a failed close changes nothing.
+                }
+            }
+        }
+
+        return strBuilder.toString();
+    }
+
+    /**
+     * Runs the ICTCLAS word segmenter on an article file. The result is written next to the
+     * source file, with the extension replaced by {@code -split.txt}.
+     *
+     * @param srcPath
+     *            path of the article file
+     */
+    private void parseNewsContent(String srcPath) {
+        String dirApi = System.getProperty("user.dir") + "\\lib";
+
+        // Build the output path. Only a dot inside the file name counts as the start of the
+        // extension; a path without one is used whole.
+        int lastSeparator = Math.max(srcPath.lastIndexOf('/'), srcPath.lastIndexOf('\\'));
+        int dot = srcPath.indexOf('.', lastSeparator + 1);
+        String basePath = dot == -1 ? srcPath : srcPath.substring(0, dot);
+        String desPath = basePath + "-split.txt";
+
+        try {
+            ICTCLAS50 testICTCLAS50 = new ICTCLAS50();
+            // Initialise the segmenter with the directory holding its libraries.
+            if (!testICTCLAS50.ICTCLAS_Init(dirApi.getBytes("GB2312"))) {
+                System.out.println("Init Fail!");
+                return;
+            }
+            try {
+                // The segmenter takes file names as byte arrays.
+                byte[] Inputfilenameb = srcPath.getBytes();
+                byte[] Outputfilenameb = desPath.getBytes();
+
+                // Author's note on the arguments: input file name, file encoding type,
+                // part-of-speech tagging flag (1 yes, 0 no), output file name.
+                testICTCLAS50.ICTCLAS_FileProcess(Inputfilenameb, 0, 1,
+                        Outputfilenameb);
+            } finally {
+                // Shut the segmenter down even when processing fails.
+                testICTCLAS50.ICTCLAS_Exit();
+            }
+        } catch (Exception ex) {
+            ex.printStackTrace();
+        }
+
+    }
+
+}
\ No newline at end of file
diff --git a/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml b/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml
new file mode 100644
index 0000000..c26bc4a
--- /dev/null
+++ b/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml
@@ -0,0 +1,11 @@
+
+
++IK Analyzer 扩展配置
+
+
+stopword.dic;
+
+
\ No newline at end of file
diff --git a/quick-simhash/src/main/resources/stopword.dic b/quick-simhash/src/main/resources/stopword.dic
new file mode 100644
index 0000000..c1b994b
--- /dev/null
+++ b/quick-simhash/src/main/resources/stopword.dic
@@ -0,0 +1,33 @@
+a
+an
+and
+are
+as
+at
+be
+but
+by
+for
+if
+in
+into
+is
+it
+no
+not
+of
+on
+or
+such
+that
+the
+their
+then
+there
+these
+they
+this
+to
+was
+will
+with
\ No newline at end of file