今天,学习了网页去重的一些方法,算法当中SimHash算法,我尤为关注。下面我将详细介绍一下这个算法
首先,介绍一下SimHash:如果两个相似文档的语义指纹只相差几个位或更少,这样的语义指纹叫做SimHash
计算海明距离的两种方法:
1,按位与
2,两个长整型数异或后,计算结果中1的个数
取得每个特征的64位hash值
public static int hamming(long l1, long l2) {
long lxor = l1 ^ l2;
return BitUtil.pop(lxor);
}
long lxor = l1 ^ l2;
return BitUtil.pop(lxor);
}
SimHash计算过程:
初始化长度为64位的向量,该向量的每个维度都是0
循环处理:取每个特征的64位hash值,如果这个hash值的第i位是1,则将向量的第i个数加上该特征的权重;反之,如果为0,则减去相应的权重
完成所有特征的处理,向量中某些数为正,某些数为负,正数对应的位为1,负数为0,得到最终64位的SimHash
在写入文件过程中,可以把SimHash值使用差分编码进行压缩后保存,下面是一个简单实现代码
View Code
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Stores an array of 64-bit SimHash values in a file using delta encoding.
 *
 * <p>File layout: the first value is written as 8 big-endian bytes; every
 * following value is written as the difference to its predecessor, encoded as
 * a variable-length integer (7 payload bits per byte, least significant group
 * first, high bit set on every byte except the last).
 *
 * <p>Differences are computed and re-applied with wrapping {@code long}
 * arithmetic, so any input round-trips exactly; the encoding is only compact
 * when consecutive differences are small non-negative numbers.
 */
public class DetaCompress {

    /**
     * Converts a {@code long} into its 8-byte big-endian representation.
     *
     * @param n the value to convert
     * @return a new 8-element array, most significant byte first
     */
    public static byte[] longToBytes(long n) {
        byte[] buf = new byte[8];
        for (int i = buf.length - 1; i >= 0; i--) {
            buf[i] = (byte) (n & 0xFF); // lowest 8 bits go to the current slot
            n >>>= 8;
        }
        return buf;
    }

    /**
     * Writes a {@code long} in variable-length form: 7 bits per byte, low-order
     * group first, with the high bit of a byte signalling that more bytes follow.
     * Values with the top bit set (negative numbers) occupy 10 bytes.
     *
     * @param i   the value to write
     * @param dos the destination stream
     * @throws IOException if the stream cannot be written
     */
    public static void writeVLong(long i, BufferedOutputStream dos)
            throws IOException {
        while ((i & ~0x7F) != 0) {
            dos.write((byte) ((i & 0x7f) | 0x80)); // low 7 bits + continuation flag
            i >>>= 7; // unsigned shift so negative values terminate as well
        }
        dos.write((byte) i);
    }

    /**
     * Reads a value previously written by {@link #writeVLong}.
     *
     * @param dis the source stream
     * @return the decoded value
     * @throws java.io.EOFException if the stream ends in the middle of a value
     * @throws IOException          if the encoding is longer than 10 bytes or
     *                              the stream cannot be read
     */
    static long readVLong(DataInputStream dis) throws IOException {
        long value = 0L; // accumulate in 64 bits; an int would truncate large values
        int shift = 0;
        byte b;
        do {
            if (shift > 63) {
                // A 64-bit value never needs more than ten 7-bit groups.
                throw new IOException("malformed variable-length long: more than 10 bytes");
            }
            b = dis.readByte(); // throws EOFException on truncated input
            value |= ((long) (b & 0x7F)) << shift;
            shift += 7;
        } while ((b & 0x80) != 0);
        return value;
    }

    /**
     * Writes {@code simHashSet} to {@code fileName} in delta-encoded form.
     * An empty array produces an empty file.
     *
     * @param simHashSet the values to store
     * @param fileName   path of the file to create or overwrite
     * @return the number of values written, or 0 if an I/O error occurred
     *         (the error is printed to standard error)
     */
    static int write(long[] simHashSet, String fileName) {
        int count = simHashSet.length; // read before opening so a null array does not truncate the file
        BufferedOutputStream dos = null;
        try {
            dos = new BufferedOutputStream(new FileOutputStream(fileName));
            if (count > 0) {
                dos.write(longToBytes(simHashSet[0])); // first value is stored verbatim
                for (int i = 1; i < count; i++) {
                    writeVLong(simHashSet[i] - simHashSet[i - 1], dos);
                }
            }
            dos.close(); // flushes; a failure here is reported, not swallowed
            return count;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dos); // no-op after a successful close
        }
        return 0;
    }

    /**
     * Reads {@code len} values from a file produced by {@link #write}.
     *
     * @param len      the number of values stored in the file
     * @param fileName path of the file to read
     * @return the decoded values, an empty array when {@code len} is 0, or
     *         {@code null} if the file is missing, truncated or unreadable
     *         (the error is printed to standard error)
     */
    static long[] read(int len, String fileName) {
        long[] simHashSet = new long[len];
        if (len == 0) {
            return simHashSet;
        }
        DataInputStream dis = null;
        try {
            dis = new DataInputStream(new BufferedInputStream(
                    new FileInputStream(fileName)));
            simHashSet[0] = dis.readLong(); // first value is stored verbatim
            for (int i = 1; i < len; i++) {
                // Each stored number is the difference to the previous value.
                simHashSet[i] = simHashSet[i - 1] + readVLong(dis);
            }
            return simHashSet;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dis);
        }
        return null;
    }

    /** Closes {@code c} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing fails
            }
        }
    }
}
排重的总体思想是:
先把要检索的f位指纹集合缩小:将f位指纹划分为几块,
精确匹配高d'位,候选集合的容量缩小为约 |S|/2^d'(|S| 为指纹集合的大小)
然后在小集合中检索f-d'位的海明距离
下面是实现的例子:
View Code
1 package com.lietu.simhash; 2 3 import java.io.BufferedReader; 4 import java.io.BufferedWriter; 5 import java.io.File; 6 import java.io.FileInputStream; 7 import java.io.FileNotFoundException; 8 import java.io.FileOutputStream; 9 import java.io.FileWriter; 10 import java.io.IOException; 11 import java.io.InputStream; 12 import java.io.InputStreamReader; 13 import java.io.OutputStream; 14 import java.io.OutputStreamWriter; 15 import java.io.UnsupportedEncodingException; 16 import java.util.ArrayList; 17 import java.util.Collections; 18 import java.util.Comparator; 19 import java.util.HashMap; 20 import java.util.HashSet; 21 import java.util.Iterator; 22 import java.util.StringTokenizer; 23 import java.util.Map.Entry; 24 25 /** 26 * 64位分四块,最多找出有3位差别的simhash 27 * 28 * @author lg 29 * 30 */ 31 // TODO: 保存排序后的中间状态 32 public class SimHashSet4 implements Iterable<SimHashData> { 33 ArrayList<SimHashData> t1 = new ArrayList<SimHashData>(); 34 ArrayList<SimHashData> t2 = new ArrayList<SimHashData>(); 35 ArrayList<SimHashData> t3 = new ArrayList<SimHashData>(); 36 ArrayList<SimHashData> t4 = new ArrayList<SimHashData>(); 37 38 public ArrayList<SimHashData> getT1(){ 39 return t1; 40 } 41 static Comparator<SimHashData> comp = new Comparator<SimHashData>() { 42 public int compare(SimHashData o1, SimHashData o2) { 43 if (o1.q == o2.q) 44 return 0; 45 return (isLessThanUnsigned(o1.q, o2.q)) ? 1 : -1; 46 } 47 }; // 比较无符号64位 48 static Comparator<Long> compHigh = new Comparator<Long>() { 49 public int compare(Long o1, Long o2) { 50 o1 |= 0xFFFFFFFFFFFFL; 51 o2 |= 0xFFFFFFFFFFFFL; 52 // System.out.println(Long.toBinaryString(o1)); 53 // System.out.println(Long.toBinaryString(o2)); 54 // System.out.println((o1 == o2)); 55 if (o1.equals(o2)) 56 return 0; 57 return (isLessThanUnsigned(o1, o2)) ? 
1 : -1; 58 } 59 }; // 比较无符号64位中的高16位 60 61 public void load(String fileName) { 62 String line = null; 63 64 try { 65 InputStream is = new FileInputStream(new File(fileName)); 66 67 BufferedReader br = new BufferedReader(new InputStreamReader(is)); 68 69 while ((line = br.readLine()) != null) { 70 addSimHash(line.trim()); 71 } 72 br.close(); 73 74 } catch (FileNotFoundException e) { 75 e.printStackTrace(); 76 } catch (UnsupportedEncodingException e) { 77 e.printStackTrace(); 78 } catch (IOException e) { 79 e.printStackTrace(); 80 } 81 } 82 83 public static boolean isLessThanUnsigned(long n1, long n2) { 84 return (n1 < n2) ^ ((n1 < 0) != (n2 < 0)); 85 } 86 87 public void sort() { 88 t2.clear(); 89 t3.clear(); 90 t4.clear(); 91 for (SimHashData simHash : t1) 92 { 93 long t = Long.rotateLeft(simHash.q, 16); 94 t2.add(new SimHashData(t, simHash.no)); 95 96 t = Long.rotateLeft(t, 16); 97 t3.add(new SimHashData(t, simHash.no)); 98 99 t = Long.rotateLeft(t, 16); 100 t4.add(new SimHashData(t, simHash.no)); 101 } 102 103 Collections.sort(t1, comp); 104 Collections.sort(t2, comp); 105 Collections.sort(t3, comp); 106 Collections.sort(t4, comp); 107 } 108 109 public boolean contains(SimHashData key) { 110 int low = 0; 111 int high = t1.size() - 1; 112 113 while (low <= high) { 114 int mid = (low + high) >>> 1; 115 SimHashData midVal = t1.get(mid); 116 int cmp = comp.compare(midVal, key); 117 118 if (cmp < 0) 119 low = mid + 1; 120 else if (cmp > 0) 121 high = mid - 1; 122 else 123 return true; // key found 124 } 125 return false; // key not found 126 } 127 128 /** 129 * probe exact match 130 * 131 * @param t 132 * @return 133 */ 134 public Span probe(ArrayList<SimHashData> t, long key) { 135 // System.out.println("t:"+t.size()); 136 int low = 0; 137 int high = t.size() - 1; 138 139 while (low <= high) { 140 int mid = (low + high) >>> 1; 141 Long midVal = t.get(mid).q; 142 int cmp = compHigh.compare(midVal, key); 143 144 if (cmp < 0) 145 low = mid + 1; 146 else if (cmp > 0) 147 
high = mid - 1; 148 else { 149 // key found 150 int matchStart = mid; 151 int matchEnd = mid; 152 while (matchStart > 0) { 153 midVal = t.get(matchStart - 1).q; 154 if (compHigh.compare(midVal, key) == 0) { 155 --matchStart; 156 } else { 157 break; 158 } 159 } 160 161 while (matchEnd < (t.size() - 1)) { 162 midVal = t.get(matchEnd + 1).q; 163 if (compHigh.compare(midVal, key) == 0) { 164 ++matchEnd; 165 } else { 166 break; 167 } 168 } 169 return new Span(matchStart, matchEnd); 170 } 171 } 172 return null; // key not found 173 } 174 175 /** 176 * get most 3 bit difference. 177 * 178 * @param fingerPrint 179 * @param k 180 * @return 181 */ 182 public HashSet<SimHashData> getSimSet(long fingerPrint, int k) { 183 184 HashSet<SimHashData> retAll = new HashSet<SimHashData>(); 185 Span s1 = probe(t1, fingerPrint); 186 if (s1 != null) { 187 // System.out.println("s1:"+s1); 188 ArrayList<SimHashData> ret1 = getSim(t1, s1, fingerPrint, k); 189 retAll.addAll(ret1); 190 } 191 long q2 = Long.rotateLeft(fingerPrint, 16); 192 Span s2 = probe(t2, q2); 193 if (s2 != null) { 194 // System.out.println("s2:"+s2); 195 ArrayList<SimHashData> ret2 = getSim(t2, s2, q2, k); 196 // rotateRight(ret2, 16); 197 retAll.addAll(ret2); 198 } 199 200 long q3 = Long.rotateLeft(q2, 16); 201 Span s3 = probe(t3, q3); 202 if (s3 != null) { 203 // System.out.println("s3:"+s3); 204 ArrayList<SimHashData> ret3 = getSim(t3, s3, q3, k); 205 // rotateRight(ret3, 32); 206 retAll.addAll(ret3); 207 } 208 209 long q4 = Long.rotateLeft(q3, 16); 210 Span s4 = probe(t4, q4); 211 if (s4 != null) { 212 // System.out.println("s4:" + s4); 213 ArrayList<SimHashData> ret4 = getSim(t4, s4, q4, k); 214 // rotateRight(ret4, 48); 215 retAll.addAll(ret4); 216 } 217 // System.out.println("o:"+Long.toBinaryString(fingerPrint)); 218 return retAll; 219 } 220 221 /** 222 * 从Span找出部分相等的,取出最多差k位的 223 * 224 * @param t 225 * @param s 226 * @param fingerPrint 227 * @param k 228 * @return 229 */ 230 public ArrayList<SimHashData> 
getSim(ArrayList<SimHashData> t, Span s, 231 long fingerPrint, int k) { 232 ArrayList<SimHashData> result = new ArrayList<SimHashData>(); 233 234 for (int i = s.getStart(); i <= s.getEnd(); ++i) { 235 SimHashData data = t.get(i); 236 long q = data.q; 237 if (BitUtil.diffIn(fingerPrint, q, k)) { 238 result.add(data); 239 } 240 } 241 242 return result; 243 } 244 245 public void addSimHash(String line) { 246 StringTokenizer st = new StringTokenizer(line, ":"); 247 String key = st.nextToken(); 248 long t = BitUtil.decodeLong(key); 249 long no = Long.parseLong(st.nextToken()); 250 // Long.parseLong(key,2); 251 // System.out.println(t); 252 t1.add(new SimHashData(t, no)); 253 } 254 255 public void addSimHash(SimHashData key) { 256 t1.add(key); 257 } 258 259 public void addInc(String key) { 260 long t = BitUtil.decodeLong(key); 261 // Long.parseLong(key,2); 262 // System.out.println(t); 263 SimHashData element = new SimHashData(t); 264 int insertionPoint = findInsertionPoint(t1, element); 265 t1.add(insertionPoint, element); 266 267 long q2 = Long.rotateLeft(t, 16); 268 element = new SimHashData(q2); 269 insertionPoint = findInsertionPoint(t2, element); 270 t2.add(insertionPoint, element); 271 272 long q3 = Long.rotateLeft(q2, 16); 273 element = new SimHashData(q3); 274 insertionPoint = findInsertionPoint(t3, element); 275 t3.add(insertionPoint, element); 276 277 long q4 = Long.rotateLeft(q3, 16); 278 element = new SimHashData(q4); 279 insertionPoint = findInsertionPoint(t4, element); 280 t4.add(insertionPoint, element); 281 } 282 283 /** 284 * Find the insertion point for the argument in a sorted list. 285 * 286 * @param element 287 * find this object's insertion point in the sorted list 288 * @return the index of the insertion point 289 */ 290 int findInsertionPoint(ArrayList<SimHashData> list, SimHashData element) { 291 // Find the new element's insertion point. 
292 int insertionPoint = Collections.binarySearch(list, element, comp); 293 if (insertionPoint < 0) { 294 insertionPoint = -(insertionPoint + 1); 295 } 296 return insertionPoint; 297 } 298 299 public Iterator<SimHashData> iterator() { 300 return t1.iterator(); 301 } 302 303 public void save(String fileName) { 304 BufferedWriter writer; 305 try { 306 writer = new BufferedWriter(new FileWriter(fileName)); 307 for (SimHashData simhash : t1) { 308 //String str=BitUtil.encodeLong(simhash.q).substring(8); 309 String str=BitUtil.encodeLong(simhash.q); 310 writer.write(str); 311 // writer.write(simhash.q+""); 312 writer.write(":"); 313 writer.write(String.valueOf(simhash.no)); 314 writer.write("\r\n"); 315 } 316 writer.flush(); 317 writer.close(); 318 } catch (Exception e) { 319 e.printStackTrace(); 320 } 321 } 322 323 public void save(String fileName, String[] newStr) { 324 BufferedWriter writer; 325 try { 326 OutputStream out = new FileOutputStream(fileName, true); 327 OutputStreamWriter outWriter = new OutputStreamWriter(out); 328 writer = new BufferedWriter(outWriter); 329 for (int i = 0; i < newStr.length; i++) { 330 if (newStr[i] != null) { 331 writer.append(newStr[i]); 332 writer.append("\r\n"); 333 if (i % 10000 == 0) 334 System.out.println(i + ":" + newStr[i]); 335 } else { 336 break; 337 } 338 } 339 writer.flush(); 340 writer.close(); 341 System.out.println("结束!"); 342 } catch (Exception e) { 343 e.printStackTrace(); 344 } 345 } 346 347 // 将数据读成SimHashData对象型集合 348 public ArrayList<SimHashData> readData(String path) { 349 ArrayList<SimHashData> list = new ArrayList<SimHashData>(); 350 351 try { 352 InputStream input = new FileInputStream(new File(path)); 353 BufferedReader br = new BufferedReader(new InputStreamReader(input)); 354 String line = ""; 355 while ((line = br.readLine()) != null) { 356 StringTokenizer st = new StringTokenizer(line, ":"); 357 long key = BitUtil.decodeLong(st.nextToken()); 358 long no = Long.parseLong(st.nextToken()); 359 list.add(new 
SimHashData(key, no)); 360 } 361 br.close(); 362 } catch (FileNotFoundException e) { 363 e.printStackTrace(); 364 } catch (IOException e) { 365 e.printStackTrace(); 366 } 367 return list; 368 } 369 370 371 372 }
介绍一篇论文:Google 的《Detecting Near-Duplicates for Web Crawling》,论文介绍了把SimHash用于爬虫抓取过程中的网页去重。
最后,说一下分布式文档排重:利用分布式系统框架如hadoop等,使用MapReduce进行文档排重,提高了效率和节省了时间,这已经成为了常用的大数据量的排重方式
以上,是我对SimHash的一些总结,请大家指教!大家共勉