java int数列转字符串,图片转字符串

();

+

+ StringBuffer buffer = new StringBuffer();

- //Step Two: Shrink.

- for(int i=0;i0 ? 1 : 0;

- if(bit == 1){

- result |= 1 << (HASH_LENGTH-1-i);

+ for (int i = 0; i < this.intSimHash.bitLength(); i++) {

+ // testBit returns true if and only if the designated bit is set

+ boolean sr = simHash.intSimHash.testBit(i);

+

+ if (sr) {

+ buffer.append("1");

+ } else {

+ buffer.append("0");

+ }

+

+ if ((i + 1) % numEach == 0) {

+ // Convert the binary string to a BigInteger

+ BigInteger eachValue = new BigInteger(buffer.toString(), 2);

+ System.out.println("----" + eachValue);

+ buffer.delete(0, buffer.length());

+ characters.add(eachValue);

}

}

- System.out.println("String \""+str+ "\" hashcode is:"+result

- +". Binary format is: "+Integer.toBinaryString(result));

- return result;

+

+ return characters;

}

-}

+ public static void main(String[] args) throws IOException {

+ String s = "浼犵粺鐨� hash 绠楁硶鍙礋璐e皢鍘熷鍐呭灏介噺鍧囧寑闅忔満鍦版槧灏勪负涓�涓鍚嶅�硷紝" + "鍘熺悊涓婄浉褰撲簬浼殢鏈烘暟浜х敓绠楁硶銆備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱"

+ + "濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝" + "鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝瑕佽璁′竴涓� hash 绠楁硶锛�"

+ + "瀵圭浉浼肩殑鍐呭浜х敓鐨勭鍚嶄篃鐩歌繎锛屾槸鏇翠负鑹伴毦鐨勪换鍔★紝鍥犱负瀹冪殑绛惧悕鍊奸櫎浜嗘彁渚涘師濮嬪唴瀹规槸鍚︾浉绛夌殑淇℃伅澶栵紝" + "杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";

+ SimHash hash1 = new SimHash(s, 64);

+ System.out.println(hash1.intSimHash + " " + hash1.intSimHash.bitLength());

+ // Compute the hash value of each signature block for a Hamming distance within 3

+ hash1.subByDistance(hash1, 3);

+

+ // Remove the first sentence and add two interference strings

+ s = "鍘熺悊涓婄浉褰撲簬浼殢鏈烘暟浜х敓绠楁硶銆備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱"

+ + "濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝" + "鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝瑕佽璁′竴涓� hash 绠楁硶锛�"

+ + "瀵圭浉浼肩殑鍐呭浜х敓鐨勭鍚嶄篃鐩歌繎锛屾槸鏇翠负鑹伴毦鐨勪换鍔★紝鍥犱负瀹冪殑绛惧悕鍊奸櫎浜嗘彁渚涘師濮嬪唴瀹规槸鍚︾浉绛夌殑淇℃伅澶栵紝" + "骞叉壈1杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";

+ SimHash hash2 = new SimHash(s, 64);

+ System.out.println(hash2.intSimHash + " " + hash2.intSimHash.bitCount());

+ hash1.subByDistance(hash2, 3);

+

+ // Prepend one sentence before the first sentence and add four interference strings

+ s = "imhash绠楁硶鐨勮緭鍏ユ槸涓�涓悜閲忥紝杈撳嚭鏄竴涓� f 浣嶇殑绛惧悕鍊笺�備负浜嗛檲杩版柟渚匡紝" + "鍋囪杈撳叆鐨勬槸涓�涓枃妗g殑鐗瑰緛闆嗗悎锛屾瘡涓壒寰佹湁涓�瀹氱殑鏉冮噸銆�"

+ + "浼犵粺骞叉壈4鐨� hash 绠楁硶鍙礋璐e皢鍘熷鍐呭灏介噺鍧囧寑闅忔満鍦版槧灏勪负涓�涓鍚嶅�硷紝" + "鍘熺悊涓婅繖娆″樊寮傛湁澶氬ぇ鍛�3鐩稿綋浜庝吉闅忔満鏁颁骇鐢熺畻娉曘�備骇鐢熺殑涓や釜绛惧悕锛屽鏋滅浉绛夛紝"

+ + "璇存槑鍘熷鍐呭鍦ㄤ竴瀹氭 鐜� 涓嬫槸鐩哥瓑鐨勶紱濡傛灉涓嶇浉绛夛紝闄や簡璇存槑鍘熷鍐呭涓嶇浉绛夊锛屼笉鍐嶆彁渚涗换浣曚俊鎭紝" + "鍥犱负鍗充娇鍘熷鍐呭鍙浉宸竴涓瓧鑺傦紝鎵�浜х敓鐨勭鍚嶄篃寰堝彲鑳藉樊鍒瀬澶с�備粠杩欎釜鎰忎箟 涓婃潵 璇达紝"

+ + "瑕佽璁′竴涓� hash 绠楁硶锛屽鐩镐技鐨勫唴瀹逛骇鐢熺殑绛惧悕涔熺浉杩戯紝鏄洿涓鸿壈闅剧殑浠诲姟锛屽洜涓哄畠鐨勭鍚嶅�奸櫎浜嗘彁渚涘師濮�" + "鍐呭鏄惁鐩哥瓑鐨勪俊鎭锛屽共鎵�1杩樿兘棰濆鎻愪緵涓嶇浉绛夌殑 鍘熷鍐嶆潵骞叉壈2鍐呭鐨勫樊寮傜▼搴︾殑淇℃伅銆�";

+ SimHash hash3 = new SimHash(s, 64);

+ System.out.println(hash3.intSimHash + " " + hash3.intSimHash.bitCount());

+ hash1.subByDistance(hash3, 3);

+

+ System.out.println("============================");

+

+ int dis = hash1.getDistance(hash1.strSimHash, hash2.strSimHash);

+ System.out.println(hash1.hammingDistance(hash2) + " " + dis);

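+ // By the pigeonhole principle (also called the drawer principle; see combinatorics), if the Hamming distance between two signatures is within 3, at least one of their signature blocks from subByDistance() must be identical.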

+ int dis2 = hash1.getDistance(hash1.strSimHash, hash3.strSimHash);

+ System.out.println(hash1.hammingDistance(hash3) + " " + dis2);

+ }

+}

\ No newline at end of file

diff --git a/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java b/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java

new file mode 100644

index 0000000..de6812b

--- /dev/null

+++ b/quick-simhash/src/main/java/com/quick/simhash/SimHashTool.java

@@ -0,0 +1,241 @@

+package com.quick.simhash;

+

+import java.io.BufferedReader;

+import java.io.File;

+import java.io.FileReader;

+import java.io.IOException;

+

+/**
+ * SimHash (similarity hash) utility.
+ *
+ * Builds a fixed-width binary fingerprint for each of two word-segmented
+ * articles and reports whether the share of identical fingerprint bits
+ * exceeds a configured threshold.
+ *
+ * @author lyq
+ */
+public class SimHashTool {
+    // Number of bits in the binary fingerprint.
+    private final int hashBitNum;
+    // Minimum fraction of identical bits for two articles to be reported as similar.
+    private final double minSupportValue;
+
+    /**
+     * Creates a tool with the given fingerprint width and similarity threshold.
+     *
+     * @param hashBitNum
+     *            number of fingerprint bits; must be at least 1
+     * @param minSupportValue
+     *            fraction of identical bits that must be exceeded for a match
+     * @throws IllegalArgumentException
+     *             if {@code hashBitNum} is not positive
+     */
+    public SimHashTool(int hashBitNum, double minSupportValue) {
+        if (hashBitNum < 1) {
+            throw new IllegalArgumentException("hashBitNum must be positive: " + hashBitNum);
+        }
+        this.hashBitNum = hashBitNum;
+        this.minSupportValue = minSupportValue;
+    }
+
+    /**
+     * Compares the similarity of two articles and prints the verdict to
+     * standard output.
+     *
+     * @param newsPath1
+     *            path of the first article file
+     * @param newsPath2
+     *            path of the second article file
+     */
+    public void compareArticals(String newsPath1, String newsPath2) {
+        // Read the word-segmentation results of both articles.
+        String content1 = readDataFile(newsPath1);
+        String content2 = readDataFile(newsPath2);
+        int[] hashArray1 = calSimHashValue(content1);
+        int[] hashArray2 = calSimHashValue(content2);
+
+        // Count the positions at which both fingerprints hold the same bit.
+        int sameNum = 0;
+        for (int i = 0; i < hashBitNum; i++) {
+            if (hashArray1[i] == hashArray2[i]) {
+                sameNum++;
+            }
+        }
+
+        // Compare against the minimum threshold.
+        // NOTE(review): the two message literals below appear mis-encoded
+        // (they contain U+FFFD); they are kept byte-for-byte - confirm the
+        // intended text against the original source file.
+        if (sameNum > this.hashBitNum * this.minSupportValue) {
+            System.out.println(String.format("鐩镐技搴︿负%s,瓒呰繃闃堝��%s,鎵�浠ユ柊闂�1涓庢柊闂�2鏄浉浼肩殑",
+                    sameNum * 1.0 / hashBitNum, minSupportValue));
+        } else {
+            System.out.println(String.format("鐩镐技搴︿负%s,灏忎簬闃堝��%s,鎵�浠ユ柊闂�1涓庢柊闂�2涓嶆槸鐩镐技鐨�",
+                    sameNum * 1.0 / hashBitNum, minSupportValue));
+        }
+    }
+
+    /**
+     * Computes the SimHash fingerprint of a text.
+     *
+     * The content is split on single spaces; only tokens containing a
+     * {@code '/'} are used, and the part before the first {@code '/'} is taken
+     * as the word.
+     *
+     * @param content
+     *            article content data
+     * @return array of {@code hashBitNum} entries, each 0 or 1, most
+     *         significant bit first
+     */
+    private int[] calSimHashValue(String content) {
+        News news = new News(content);
+        news.statWords();
+
+        double[] hashArray = new double[hashBitNum];
+        int[] binaryArray = new int[hashBitNum];
+
+        for (String str : content.split(" ")) {
+            int index = str.indexOf('/');
+            if (index == -1) {
+                continue;
+            }
+            String w = str.substring(0, index);
+
+            // Weight of the word as reported by News (derived from word
+            // frequency per the original comment); -1 means "skip this word".
+            double weight = news.getWordFrequentValue(w);
+            if (weight == -1) {
+                continue;
+            }
+
+            // Hash the word and keep its low hashBitNum bits. The bits are
+            // extracted with integer shifts: the previous
+            // "hashValue %= Math.pow(2, n)" went through double and dropped
+            // the low bits of hashes above 2^53, and the (int) cast broke
+            // widths of 32 bits and more.
+            numToBinaryArray(binaryArray, BKDRHash(w));
+
+            for (int i = 0; i < binaryArray.length; i++) {
+                if (binaryArray[i] == 1) {
+                    // A 1 at this position adds the weight.
+                    hashArray[i] += weight;
+                } else {
+                    // A 0 at this position subtracts the weight.
+                    hashArray[i] -= weight;
+                }
+            }
+        }
+
+        // Collapse the weighted sums back to bits according to their sign.
+        int[] resultValue = new int[hashBitNum];
+        for (int i = 0; i < hashArray.length; i++) {
+            resultValue[i] = hashArray[i] > 0 ? 1 : 0;
+        }
+
+        return resultValue;
+    }
+
+    /**
+     * Writes the low {@code binaryArray.length} bits of a number into an
+     * array, most significant bit first. Every slot is overwritten, so the
+     * array may be reused between calls.
+     *
+     * @param binaryArray
+     *            receives the binary digits (0 or 1)
+     * @param num
+     *            number to convert
+     */
+    private void numToBinaryArray(int[] binaryArray, long num) {
+        int length = binaryArray.length;
+        for (int i = 0; i < length; i++) {
+            int shift = length - 1 - i;
+            // Java masks long shift counts to 6 bits, so positions beyond
+            // bit 63 must be zero-filled explicitly.
+            binaryArray[i] = shift < Long.SIZE ? (int) ((num >>> shift) & 1L) : 0;
+        }
+    }
+
+    /**
+     * BKDR string hash.
+     *
+     * @param str
+     *            string to hash
+     * @return non-negative hash value
+     */
+    public static long BKDRHash(String str) {
+        final int seed = 31; /* 31 131 1313 13131 131313 etc.. */
+        long hash = 0;
+
+        for (int i = 0; i < str.length(); i++) {
+            hash = (hash * seed) + str.charAt(i);
+        }
+
+        // Math.abs(Long.MIN_VALUE) is still Long.MIN_VALUE; map that single
+        // value to 0 so the result is never negative. All other inputs keep
+        // the value the previous implementation returned.
+        return hash == Long.MIN_VALUE ? 0L : Math.abs(hash);
+    }
+
+    /**
+     * Reads a file and returns its lines concatenated without line
+     * terminators.
+     *
+     * On an I/O error the stack trace is printed and whatever was read so far
+     * is returned (the empty string if the file could not be opened).
+     *
+     * @param filePath
+     *            path of the file to read
+     * @return the file content, never {@code null}
+     */
+    private String readDataFile(String filePath) {
+        File file = new File(filePath);
+        StringBuilder strBuilder = new StringBuilder();
+
+        // try-with-resources closes the reader even when readLine fails.
+        try (BufferedReader in = new BufferedReader(new FileReader(file))) {
+            String str;
+            while ((str = in.readLine()) != null) {
+                strBuilder.append(str);
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+        return strBuilder.toString();
+    }
+
+    /**
+     * Segments the words of an article with the ICTCLAS50 word-segmentation
+     * system, writing the result next to the source file with the extension
+     * replaced by {@code -split.txt}.
+     *
+     * @param srcPath
+     *            path of the article file
+     */
+    private void parseNewsContent(String srcPath) {
+        String dirApi = System.getProperty("user.dir") + "\\lib";
+
+        // Assemble the output path: strip the extension of the file name, if
+        // any. Only a dot after the last path separator counts, so paths such
+        // as "./news.txt" or names without an extension are handled.
+        int separator = Math.max(srcPath.lastIndexOf('/'), srcPath.lastIndexOf('\\'));
+        int dot = srcPath.lastIndexOf('.');
+        String base = dot > separator ? srcPath.substring(0, dot) : srcPath;
+        String desPath = base + "-split.txt";
+
+        try {
+            ICTCLAS50 testICTCLAS50 = new ICTCLAS50();
+            // Initialise the segmenter with the path of its library directory.
+            if (!testICTCLAS50.ICTCLAS_Init(dirApi.getBytes("GB2312"))) {
+                System.out.println("Init Fail!");
+                return;
+            }
+            // Input file name as bytes.
+            byte[] Inputfilenameb = srcPath.getBytes();
+
+            // Output file name (after segmentation) as bytes.
+            byte[] Outputfilenameb = desPath.getBytes();
+
+            // Segment the file. Arguments: input file name, file encoding
+            // type, whether to tag parts of speech (1 yes, 0 no), output
+            // file name.
+            testICTCLAS50.ICTCLAS_FileProcess(Inputfilenameb, 0, 1,
+                    Outputfilenameb);
+            // Shut the segmenter down.
+            testICTCLAS50.ICTCLAS_Exit();
+        } catch (Exception ex) {
+            ex.printStackTrace();
+        }
+    }
+
+}

\ No newline at end of file

diff --git a/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml b/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml

new file mode 100644

index 0000000..c26bc4a

--- /dev/null

+++ b/quick-simhash/src/main/resources/IKAnalyzer.cfg.xml

@@ -0,0 +1,11 @@

+

+

++IK Analyzer 扩展配置

+

+

+stopword.dic;

+

+

\ No newline at end of file

diff --git a/quick-simhash/src/main/resources/stopword.dic b/quick-simhash/src/main/resources/stopword.dic

new file mode 100644

index 0000000..c1b994b

--- /dev/null

+++ b/quick-simhash/src/main/resources/stopword.dic

@@ -0,0 +1,33 @@

+a

+an

+and

+are

+as

+at

+be

+but

+by

+for

+if

+in

+into

+is

+it

+no

+not

+of

+on

+or

+such

+that

+the

+their

+then

+there

+these

+they

+this

+to

+was

+will

+with

\ No newline at end of file

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值