SimHash+汉明距离的C#实现方法

根据以下 Java 实现方法改编:
http://itindex.net/detail/50448-%E7%9B%B8%E4%BC%BC-%E8%AE%A1%E7%AE%97-google

以下为SimHash+汉明距离的C#实现:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Text;

namespace chx
{
    /// <summary>
    /// Computes a SimHash fingerprint for a piece of text and measures how far
    /// two fingerprints are apart (Hamming distance).
    /// </summary>
    /// <remarks>
    /// The text is split into tokens by <c>ChxTokenizer</c>. Every token is hashed
    /// to a <c>hashbits</c>-wide integer, each bit position votes +1 (bit set) or
    /// -1 (bit clear), and the fingerprint has a 1 wherever the vote total is
    /// greater than or equal to zero.
    /// </remarks>
    public class SimHash
    {
        /// <summary>Fingerprint width used when the caller does not supply one.</summary>
        private const int DefaultHashBits = 128;

        /// <summary>Multiplier of the per-token multiply/XOR hash.</summary>
        private static readonly BigInteger HashMultiplier = new BigInteger(1000003);

        private readonly String tokens;
        private readonly BigInteger strSimHash;
        private readonly int hashbits;

        /// <summary>
        /// Mask with the low <c>hashbits</c> bits set. Computed once per instance
        /// instead of once per token.
        /// </summary>
        private readonly BigInteger hashMask;

        /// <summary>Gets the fingerprint computed at construction time.</summary>
        public BigInteger StrSimHash
        {
            get
            {
                return strSimHash;
            }
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using
        /// <paramref name="hashbits"/> bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty string.</param>
        /// <param name="hashbits">Fingerprint width in bits; must not be negative.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="hashbits"/> is negative.
        /// </exception>
        public SimHash(String tokens, int hashbits)
        {
            if (hashbits < 0)
            {
                // Fail with a descriptive error instead of the OverflowException
                // that allocating a negative-length array would raise.
                throw new ArgumentOutOfRangeException("hashbits", hashbits, "hashbits must not be negative.");
            }

            this.tokens = tokens;
            this.hashbits = hashbits;
            this.hashMask = (BigInteger.One << hashbits) - BigInteger.One;
            this.strSimHash = simHash();
        }

        /// <summary>
        /// Builds the fingerprint of <paramref name="tokens"/> using the default
        /// width of 128 bits.
        /// </summary>
        /// <param name="tokens">Text to fingerprint. Null is treated like an empty string.</param>
        public SimHash(String tokens)
            : this(tokens, DefaultHashBits)
        {
        }

        /// <summary>
        /// Tallies the per-bit votes of every token hash and folds them into the
        /// fingerprint.
        /// </summary>
        /// <returns>A non-negative value that only uses the low <c>hashbits</c> bits.</returns>
        private BigInteger simHash()
        {
            int[] v = new int[this.hashbits];
            ChxTokenizer stringTokens = new ChxTokenizer(this.tokens);
            while (stringTokens.hasMoreTokens())
            {
                BigInteger t = this.hash(stringTokens.nextToken());
                for (int i = 0; i < this.hashbits; i++)
                {
                    if ((t & (BigInteger.One << i)).IsZero)
                    {
                        v[i] -= 1;
                    }
                    else
                    {
                        v[i] += 1;
                    }
                }
            }

            BigInteger fingerprint = BigInteger.Zero;
            for (int i = 0; i < this.hashbits; i++)
            {
                // A tie counts as "set", so input without tokens produces a
                // fingerprint with every bit set.
                if (v[i] >= 0)
                {
                    fingerprint |= BigInteger.One << i;
                }
            }
            return fingerprint;
        }

        /// <summary>
        /// Hashes one token: starts from the first character shifted left by 7,
        /// then for every character multiplies by 1000003, XORs the character
        /// code and truncates to <c>hashbits</c> bits; finally XORs the length.
        /// </summary>
        /// <param name="source">Token to hash.</param>
        /// <returns>
        /// Zero for a null or empty token, otherwise a non-negative hash value.
        /// </returns>
        private BigInteger hash(string source)
        {
            if (string.IsNullOrEmpty(source))
            {
                return BigInteger.Zero;
            }

            BigInteger x = new BigInteger(((long)source[0]) << 7);
            foreach (char item in source)
            {
                x = ((x * HashMultiplier) ^ new BigInteger((long)item)) & this.hashMask;
            }

            // x is masked to a non-negative value and the length is non-negative,
            // so the result can never be -1; no special case is needed for it.
            return x ^ new BigInteger(source.Length);
        }

        /// <summary>
        /// Counts the bit positions in which this fingerprint differs from
        /// <paramref name="other"/>.
        /// </summary>
        /// <param name="other">Fingerprint to compare against.</param>
        /// <returns>Number of differing bits.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="other"/> is null.</exception>
        /// <remarks>
        /// Both fingerprints are non-negative and use no bits above their own
        /// width, so the XOR needs no mask and the result is the same whichever
        /// instance the method is called on, even when the widths differ.
        /// </remarks>
        public int HammingDistance(SimHash other)
        {
            if (other == null)
            {
                throw new ArgumentNullException("other");
            }

            BigInteger x = this.strSimHash ^ other.strSimHash;
            int tot = 0;
            while (!x.IsZero)
            {
                tot += 1;
                // Clears the lowest set bit, so the loop runs once per differing bit.
                x &= x - BigInteger.One;
            }
            return tot;
        }

    }

    /// <summary>
    /// Minimal tokenizer that yields the source text one character at a time
    /// (for Chinese text: one token per character). It can be replaced by a real
    /// word-segmentation algorithm.
    /// </summary>
    /// <remarks>
    /// A token is one <see cref="char"/>, i.e. one UTF-16 code unit, so a
    /// character stored as a surrogate pair is returned as two tokens.
    /// </remarks>
    public class ChxTokenizer
    {
        private readonly string source;
        private readonly int length;
        private int index;

        /// <summary>Creates a tokenizer over <paramref name="source"/>.</summary>
        /// <param name="source">Text to split. Null is treated as an empty string.</param>
        public ChxTokenizer(string source)
        {
            // Normalize once so nextToken() behaves the same for null and empty
            // input instead of hitting a NullReferenceException.
            this.source = source ?? string.Empty;
            this.index = 0;
            this.length = this.source.Length;
        }

        /// <summary>Tells whether <see cref="nextToken"/> can return another token.</summary>
        /// <returns>True while the read position is before the end of the text.</returns>
        public bool hasMoreTokens()
        {
            return index < length;
        }

        /// <summary>Returns the next one-character token and advances the read position.</summary>
        /// <returns>The character at the current position, as a string.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// No tokens remain; check <see cref="hasMoreTokens"/> first.
        /// </exception>
        public string nextToken()
        {
            String s = source.Substring(index, 1);
            index++;
            return s;
        }
    }
}

使用方法示例:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace chx
{
    /// <summary>
    /// Console demo: fingerprints three similar sentences and prints the
    /// Hamming distances between them.
    /// </summary>
    class Program
    {
        /// <summary>Entry point; runs the demo.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Test();
        }

        /// <summary>
        /// Prints the SimHash of each sample sentence, then the distance from
        /// the first sentence to each of the other two.
        /// </summary>
        private static void Test()
        {
            var s1 = "中文分词太麻烦了,也有些中文分词组件也不错";
            var hash1 = new SimHash(s1);
            Console.WriteLine("S1.simhash: {0}", hash1.StrSimHash);

            var s2 = "有些中文分词太麻烦了,也有些中文分词组件也不错";
            var hash2 = new SimHash(s2);
            // Print the fingerprint of s2 itself (hash1 was printed here before).
            Console.WriteLine("S2.simhash: {0}", hash2.StrSimHash);

            var s3 = "有些中文分词太麻烦了";
            var hash3 = new SimHash(s3);
            // Print the fingerprint of s3 itself (hash1 was printed here before).
            Console.WriteLine("S3.simhash: {0}", hash3.StrSimHash);

            Console.WriteLine("============================");
            Console.WriteLine("s1与s2的汉明距离:{0}", hash1.HammingDistance(hash2));
            Console.WriteLine("s1与s3的汉明距离:{0}", hash1.HammingDistance(hash3));
        }
    }
}
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值