根据以下 Java 实现改编:
http://itindex.net/detail/50448-%E7%9B%B8%E4%BC%BC-%E8%AE%A1%E7%AE%97-google
以下为SimHash+汉明距离的C#实现:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;
namespace chx
{
/// <summary>
/// Computes a SimHash fingerprint for a piece of text and compares fingerprints
/// by Hamming distance.
/// </summary>
/// <remarks>
/// The text is split into tokens by <see cref="ChxTokenizer"/>. Every token is hashed to a
/// value of <c>hashbits</c> bits, and each bit position receives +1 when the token hash has
/// that bit set and -1 otherwise. A fingerprint bit is 1 when its running total is
/// greater than or equal to zero, so ties (including text that yields no tokens) set the bit.
/// </remarks>
public class SimHash
{
    /// <summary>Multiplier applied at every step of the per-token hash.</summary>
    private static readonly BigInteger HashMultiplier = new BigInteger(1000003);

    private readonly String tokens;
    private readonly BigInteger strSimHash;
    private readonly int hashbits = 128;

    /// <summary>Gets the fingerprint computed when this instance was constructed.</summary>
    public BigInteger StrSimHash
    {
        get
        {
            return strSimHash;
        }
    }

    /// <summary>
    /// Builds the fingerprint of <paramref name="tokens"/> using a caller-chosen width.
    /// </summary>
    /// <param name="tokens">Text to fingerprint; it is handed unchanged to <see cref="ChxTokenizer"/>.</param>
    /// <param name="hashbits">Number of bits in the fingerprint; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="hashbits"/> is zero or negative.
    /// </exception>
    public SimHash(String tokens, int hashbits)
    {
        // Reject meaningless widths up front; a negative value would otherwise surface
        // as an unrelated failure when the per-bit counter array is allocated.
        if (hashbits <= 0)
        {
            throw new ArgumentOutOfRangeException("hashbits", hashbits, "hashbits must be positive.");
        }

        this.tokens = tokens;
        this.hashbits = hashbits;
        this.strSimHash = ComputeFingerprint();
    }

    /// <summary>
    /// Builds the fingerprint of <paramref name="tokens"/> using the default width of 128 bits.
    /// </summary>
    /// <param name="tokens">Text to fingerprint; it is handed unchanged to <see cref="ChxTokenizer"/>.</param>
    public SimHash(String tokens)
    {
        this.tokens = tokens;
        this.strSimHash = ComputeFingerprint();
    }

    /// <summary>
    /// Tokenizes the stored text, accumulates the per-bit votes of every token hash and
    /// folds the votes into the final fingerprint.
    /// </summary>
    /// <returns>A non-negative value that uses at most <c>hashbits</c> bits.</returns>
    private BigInteger ComputeFingerprint()
    {
        int[] votes = new int[this.hashbits];

        // The mask is the same for every token, so build it once instead of per token.
        BigInteger mask = (BigInteger.One << this.hashbits) - BigInteger.One;

        ChxTokenizer tokenizer = new ChxTokenizer(this.tokens);
        while (tokenizer.hasMoreTokens())
        {
            BigInteger tokenHash = HashToken(tokenizer.nextToken(), mask);
            for (int bit = 0; bit < this.hashbits; bit++)
            {
                if ((tokenHash & (BigInteger.One << bit)).IsZero)
                {
                    votes[bit] -= 1;
                }
                else
                {
                    votes[bit] += 1;
                }
            }
        }

        BigInteger fingerprint = BigInteger.Zero;
        for (int bit = 0; bit < this.hashbits; bit++)
        {
            // A tie counts as a set bit, so text without tokens yields an all-ones fingerprint.
            if (votes[bit] >= 0)
            {
                fingerprint |= BigInteger.One << bit;
            }
        }

        return fingerprint;
    }

    /// <summary>
    /// Hashes one token with a multiply-and-xor scheme, truncating to the mask after every step.
    /// </summary>
    /// <param name="source">Token to hash; null or empty hashes to zero.</param>
    /// <param name="mask">Bit mask with the low <c>hashbits</c> bits set.</param>
    /// <returns>The token hash; always non-negative.</returns>
    private static BigInteger HashToken(string source, BigInteger mask)
    {
        if (string.IsNullOrEmpty(source))
        {
            return BigInteger.Zero;
        }

        BigInteger x = new BigInteger(((long)source[0]) << 7);
        foreach (char item in source)
        {
            x = ((x * HashMultiplier) ^ new BigInteger((long)item)) & mask;
        }

        // The masked value and the length are both non-negative, so x stays non-negative.
        x ^= new BigInteger(source.Length);
        return x;
    }

    /// <summary>
    /// Counts the bit positions in which this fingerprint differs from <paramref name="other"/>.
    /// </summary>
    /// <param name="other">Fingerprint to compare against.</param>
    /// <returns>
    /// The number of differing bits among the low <c>hashbits</c> bits of this instance;
    /// higher bits of a wider <paramref name="other"/> are ignored.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
    public int HammingDistance(SimHash other)
    {
        if (other == null)
        {
            throw new ArgumentNullException("other");
        }

        BigInteger mask = (BigInteger.One << this.hashbits) - BigInteger.One;
        BigInteger diff = (this.strSimHash ^ other.strSimHash) & mask;

        int distance = 0;
        while (!diff.IsZero)
        {
            distance += 1;
            // Clears the lowest set bit, so the loop runs once per differing bit.
            diff &= diff - BigInteger.One;
        }

        return distance;
    }
}
// Simple tokenizer: splits the text into individual characters, which for Chinese
// means one token per Han character. It can be replaced by any other tokenizer.
/// <summary>
/// Forward-only tokenizer that yields the source text one character at a time.
/// </summary>
public class ChxTokenizer
{
    private string source;
    private int index;
    private int length;

    /// <summary>Creates a tokenizer positioned at the start of <paramref name="source"/>.</summary>
    /// <param name="source">Text to tokenize; null is treated as the empty string.</param>
    public ChxTokenizer(string source)
    {
        // Normalise null once so nextToken never dereferences a null string.
        this.source = source ?? "";
        this.index = 0;
        this.length = this.source.Length;
    }

    /// <summary>Reports whether at least one more token can be read.</summary>
    /// <returns><c>true</c> while the read position is before the end of the text.</returns>
    public bool hasMoreTokens()
    {
        return index < length;
    }

    /// <summary>Returns the token at the read position and advances past it.</summary>
    /// <returns>
    /// One UTF-16 code unit, or two when they form a surrogate pair, so that a
    /// supplementary-plane character is never split into two half-character tokens.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">No tokens remain.</exception>
    public string nextToken()
    {
        int width = 1;
        if (index < length - 1 && char.IsSurrogatePair(source, index))
        {
            width = 2;
        }

        String s = source.Substring(index, width);
        index += width;
        return s;
    }
}
}
使用方法示例:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace chx
{
/// <summary>
/// Console demo: fingerprints three similar sentences and prints their pairwise
/// Hamming distances.
/// </summary>
class Program
{
    /// <summary>Entry point; runs the demo.</summary>
    static void Main(string[] args)
    {
        Test();
    }

    /// <summary>
    /// Builds a <see cref="SimHash"/> for each sample sentence, prints each fingerprint,
    /// then prints the distance from the first sentence to the other two.
    /// </summary>
    private static void Test()
    {
        var s1 = "中文分词太麻烦了,也有些中文分词组件也不错";
        var hash1 = new SimHash(s1);
        Console.WriteLine("S1.simhash: {0}", hash1.StrSimHash);

        var s2 = "有些中文分词太麻烦了,也有些中文分词组件也不错";
        var hash2 = new SimHash(s2);
        // Print the fingerprint that belongs to this sentence, not the first one again.
        Console.WriteLine("S2.simhash: {0}", hash2.StrSimHash);

        var s3 = "有些中文分词太麻烦了";
        var hash3 = new SimHash(s3);
        Console.WriteLine("S3.simhash: {0}", hash3.StrSimHash);

        Console.WriteLine("============================");
        Console.WriteLine("s1与s2的汉明距离:{0}", hash1.HammingDistance(hash2));
        Console.WriteLine("s1与s3的汉明距离:{0}", hash1.HammingDistance(hash3));
    }
}
}