python 实现MinHash和MinHashLSH算法

最近动手实现了一遍 MinHash 和 MinHashLSH 算法,发现实现细节还是挺难的,所以我把 datasketch 的源代码改了一下,去除了很多冗余的代码,只保留了算法实现的主要细节部分。

MinHash算法:

import hashlib
import random
import string
import struct
from collections import defaultdict

import numpy as np
from scipy.integrate import quad as integrate

def sha1_hash32(data):
    """Return a 32-bit unsigned integer hash of ``data`` derived from SHA-1.

    The first four bytes of the SHA-1 digest are interpreted as a
    little-endian unsigned integer.

    Args:
        data: bytes-like object accepted by ``hashlib.sha1``.

    Returns:
        int: a value in ``[0, 2**32)``.
    """
    leading_bytes = hashlib.sha1(data).digest()[:4]
    return int.from_bytes(leading_bytes, byteorder='little', signed=False)
# Modulus applied to the permuted hash in MinHash.add: 2**61 - 1.
_mersenne_prime = 2 ** 61 - 1
# Largest 32-bit unsigned value; used as the initial signature value and as
# the bit mask applied to permuted hashes.
_max_hash = 0xFFFFFFFF
# Number of distinct 32-bit hash values (not referenced by the code shown here).
_hash_range = 0x100000000


class MinHash(object):
    """MinHash signature used to estimate the Jaccard similarity of sets.

    The signature is an array of ``d`` unsigned 64-bit slots.  Every slot is
    tied to one pair of random coefficients ``(a, b)``; adding a value keeps,
    per slot, the minimum of ``((a * hv + b) % _mersenne_prime) & _max_hash``
    seen so far, where ``hv`` is the value's hash.
    """

    def __init__(self, d=128, seed=1,
                 hashfunc=sha1_hash32,
                 hashvalues=None, permutations=None):
        """Create an empty signature or restore one from existing values.

        Args:
            d: number of slots in the signature.  Ignored when
                ``hashvalues`` is given (its length is used instead).
            seed: seed of the random generator producing the ``(a, b)``
                coefficients; two signatures are only comparable when
                their seeds match.
            hashfunc: callable turning the value passed to :meth:`add`
                into an integer hash.
            hashvalues: optional existing signature to start from.
            permutations: optional precomputed coefficients, indexable as
                ``(a_values, b_values)``; generated from ``seed`` when
                omitted.

        Raises:
            ValueError: if ``hashfunc`` is not callable, or the number of
                slots differs from the number of coefficient pairs.
        """
        if hashvalues is not None:
            d = len(hashvalues)
        self.seed = seed
        if not callable(hashfunc):
            raise ValueError("The hashfunc must be a callable.")
        self.hashfunc = hashfunc

        if hashvalues is not None:
            self.hashvalues = self._parse_hashvalues(hashvalues)
        else:
            self.hashvalues = self._init_hashvalues(d)

        if permutations is not None:
            self.permutations = permutations
        else:
            generator = np.random.RandomState(self.seed)
            # Draw order (a first, then b, pair by pair) is kept so that a
            # given seed always yields the same coefficients.
            pairs = [(generator.randint(1, _mersenne_prime, dtype=np.uint64),
                      generator.randint(0, _mersenne_prime, dtype=np.uint64))
                     for _ in range(d)]
            # Transpose to shape (2, d): row 0 holds every a, row 1 every b.
            self.permutations = np.array(pairs, dtype=np.uint64).T
        if len(self) != len(self.permutations[0]):
            raise ValueError("Numbers of hash values and permutations mismatch")

    def _init_hashvalues(self, d):
        """Return ``d`` slots, each initialised to ``_max_hash``."""
        return np.full(d, _max_hash, dtype=np.uint64)

    def _parse_hashvalues(self, hashvalues):
        """Copy ``hashvalues`` into a new uint64 array."""
        return np.array(hashvalues, dtype=np.uint64)

    def add(self, b):
        """Fold one value into the signature.

        Args:
            b: the value to add; it is passed unchanged to ``hashfunc``.
        """
        hv = self.hashfunc(b)
        # Separate names for the coefficients so the argument ``b`` is not
        # rebound inside the method.
        coef_a, coef_b = self.permutations
        phv = np.bitwise_and((coef_a * hv + coef_b) % _mersenne_prime,
                             np.uint64(_max_hash))
        self.hashvalues = np.minimum(phv, self.hashvalues)

    def jaccard(self, other):
        """Estimate the Jaccard similarity with another signature.

        Args:
            other: a signature built with the same seed and length.

        Returns:
            float: fraction of slots holding equal values in both
            signatures.

        Raises:
            ValueError: if the seeds or the signature lengths differ.
        """
        if other.seed != self.seed:
            raise ValueError("different seeds")
        if len(self) != len(other):
            raise ValueError("different numbers of permutation functions")
        # ``np.float`` was only an alias of the builtin ``float`` and no
        # longer exists in current NumPy releases; use the builtin.
        matches = np.count_nonzero(self.hashvalues == other.hashvalues)
        return float(matches) / float(len(self))

    def __len__(self):
        """Return the number of slots in the signature."""
        return len(self.hashvalues)

    def __eq__(self, other):
        """Return True when type, seed and all hash values are equal."""
        return (type(self) is type(other)
                and self.seed == other.seed
                and np.array_equal(self.hashvalues, other.hashvalues))

然后是MinHashLSH:

 

class DictListStorage():
    """In-memory mapping from a key to a list of values.

    Values are appended with :meth:`insert`; the usual container protocol
    (``[]``, ``del``, ``len``, iteration, ``in``) is supported on keys.
    """

    def __init__(self, config, name):
        """Create an empty storage.

        Args:
            config: storage configuration; not used by this in-memory
                implementation.
            name: storage name; not used by this in-memory implementation.
        """
        self._dict = defaultdict(list)

    def __getitem__(self, key):
        """Return the values stored under ``key`` (see :meth:`get`)."""
        return self.get(key)

    def __delitem__(self, key):
        """Delete ``key``; raises ``KeyError`` if it is not stored."""
        return self.remove(key)

    def __len__(self):
        """Return the number of stored keys."""
        return self.size()

    def __iter__(self):
        """Iterate over the stored keys."""
        yield from self.keys()

    def __contains__(self, key):
        """Return True if ``key`` is stored (constant-time lookup)."""
        return self.has_key(key)

    def keys(self):
        """Return a view of the stored keys."""
        return self._dict.keys()

    def get(self, key):
        """Return the list stored under ``key``, or a new empty list.

        A missing key is not created by this call.
        """
        return self._dict.get(key, [])

    def remove(self, *keys):
        """Delete every given key together with its values.

        Raises:
            KeyError: if one of the keys is not stored.
        """
        for key in keys:
            del self._dict[key]

    def insert(self, key, *vals, **kwargs):
        """Append ``vals`` to the list stored under ``key``.

        Extra keyword arguments are accepted and ignored.
        """
        self._dict[key].extend(vals)

    def size(self):
        """Return the number of stored keys."""
        return len(self._dict)

    def itemcounts(self, **kwargs):
        """Return a dict mapping each key to the number of its values."""
        return {k: len(v) for k, v in self._dict.items()}

    def has_key(self, key):
        """Return True if ``key`` is stored."""
        return key in self._dict
class DictSetStorage():
    """In-memory mapping from a key to a set of values."""

    def __init__(self, config, name):
        """Create an empty storage; ``config`` and ``name`` are not used."""
        self._dict = defaultdict(set)

    def get(self, key):
        """Return the set stored under ``key``, or a new empty set.

        A missing key is not created by this call.
        """
        if key in self._dict:
            return self._dict[key]
        return set()

    def insert(self, key, *vals, **kwargs):
        """Add ``vals`` to the set stored under ``key``.

        Extra keyword arguments are accepted and ignored.
        """
        bucket = self._dict[key]
        bucket.update(vals)
def _random_name(length):
    return ''.join(random.choice(string.ascii_lowercase)
                   for _ in range(length)).encode('utf8')

def _false_positive_probability(threshold, b, r):
    _probability = lambda s : 1 - (1 - s**float(r))**float(b)
    a, err = integrate(_probability, 0.0, threshold)
    return a

def _false_negative_probability(threshold, b, r):
    _probability = lambda s : 1 - (1 - (1 - s**float(r))**float(b))
    a, err = integrate(_probability, threshold, 1.0)
    return a

def _optimal_param(threshold, num_perm, false_positive_weight,
        false_negative_weight):
    min_error = float("inf")
    opt = (0, 0)
    for b in range(1, num_perm+1):
        max_r = int(num_perm / b)
        for r in range(1, max_r+1):
            fp = _false_positive_probability(threshold, b, r)
            fn = _false_negative_probability(threshold, b, r)
            error = fp*false_positive_weight + fn*false_negative_weight
            if error < min_error:
                min_error = error
                opt = (b, r,fp,fn)
    return opt
class MinHashLSH(object):
    """Locality-sensitive hashing index over MinHash signatures.

    A signature is cut into ``b`` bands of ``r`` hash values each.  Every
    band has its own hash table; two signatures become candidates of each
    other when at least one of their bands is identical.
    """

    def __init__(self, threshold=0.9, d=128, weights=(0.5, 0.5),
                 params=None, storage_config=None):
        """Build an empty index.

        Args:
            threshold: similarity threshold used to choose ``b`` and ``r``
                when ``params`` is not given.
            d: number of hash values per signature.
            weights: ``(false_positive_weight, false_negative_weight)``
                used when choosing ``b`` and ``r``; must sum to 1.0.
            params: optional explicit ``(b, r)``; skips the parameter
                search.
            storage_config: optional dict; only its ``'basename'`` entry
                (bytes prefix of the storage names) is read here.

        Raises:
            ValueError: if the weights do not sum to 1.0 or ``b * r``
                exceeds ``d``.
        """
        if storage_config is None:
            storage_config = {'type': 'dict'}

        if sum(weights) != 1.0:
            raise ValueError("Weights must sum to 1.0")
        self.h = d
        if params is not None:
            self.b, self.r = params
            if self.b * self.r > d:
                raise ValueError("The product of b and r in params is "
                        "{} * {} = {} -- it must be less than d {}. ".format(self.b, self.r, self.b*self.r, d))
        else:
            false_positive_weight, false_negative_weight = weights
            # fp / fn are only available when the parameters were searched.
            self.b, self.r, self.fp, self.fn = _optimal_param(
                threshold, d, false_positive_weight, false_negative_weight)
            print('the best parameter b={},r={},fp={},fn={}'.format(self.b,self.r,self.fp,self.fn))

        basename = storage_config.get('basename', _random_name(11))
        # One hash table per band, plus the slice of the signature that
        # band covers.
        self.hashtables = []
        self.hashranges = []
        for i in range(self.b):
            name = b''.join([basename, b'_bucket_', struct.pack('>H', i)])
            self.hashtables.append(DictSetStorage(storage_config, name=name))
            self.hashranges.append((i * self.r, (i + 1) * self.r))

        # key -> list of that key's band hashes.
        self.keys = DictListStorage(storage_config, name=b''.join([basename, b'_keys']))

    def insert(self, key, minhash):
        """Index ``minhash`` under ``key``.

        Raises:
            ValueError: if ``key`` is already in the index.
        """
        self._insert(key, minhash, buffer=False)

    def _insert(self, key, minhash, buffer=False):
        """Store the band hashes of ``minhash`` and register ``key``.

        ``buffer`` is forwarded to the storage ``insert`` calls.
        """
        # has_key is a direct dict lookup; ``key in self.keys`` would walk
        # every stored key through __iter__.
        if self.keys.has_key(key):
            raise ValueError("key already exists")
        Hs = [self._H(minhash.hashvalues[start:end])
              for start, end in self.hashranges]

        self.keys.insert(key, *Hs, buffer=buffer)

        for H, hashtable in zip(Hs, self.hashtables):
            hashtable.insert(H, key, buffer=buffer)

    def query(self, minhash):
        """Return the keys sharing at least one band with ``minhash``.

        Returns:
            list: candidate keys without duplicates, in no particular
            order.
        """
        candidates = set()
        for (start, end), hashtable in zip(self.hashranges, self.hashtables):
            H = self._H(minhash.hashvalues[start:end])
            candidates.update(hashtable.get(H))

        return list(candidates)

    def _H(self, hs):
        """Turn one band of hash values into a bytes key (byte-swapped)."""
        return bytes(hs.byteswap().data)

这是实现的全过程了,哪天我能够把这些东西自己手动实现出来,我应该就很牛了,哈哈,现在还在学习模仿阶段。

参考文献

[1]. datasketch. https://github.com/ekzhu/datasketch

MinHash是一种用于近似集合相似度计算的技术。下面是一个用Python实现MinHash的示例代码:

```python
import numpy as np
import hashlib


class MinHash:
    def __init__(self, num_perm):
        self.num_perm = num_perm
        self.permutations = self._generate_permutations()

    def _generate_permutations(self):
        np.random.seed(0)
        minhash_permutations = np.random.randint(low=0, high=np.iinfo(np.int64).max, size=(self.num_perm, 2), dtype=np.int64)
        return minhash_permutations

    def _hash_value(self, value):
        return hashlib.sha1(value.encode()).hexdigest()

    def compute_hash(self, value):
        hash_value = self._hash_value(value)
        hash_code = int(hash_value, 16)
        return hash_code

    def compute_signature(self, document):
        signature = np.inf * np.ones(self.num_perm, dtype=np.int64)
        for word in document.split():
            hash_code = self.compute_hash(word)
            for i in range(self.num_perm):
                a, b = self.permutations[i]
                hash_value = (a * hash_code + b) % np.iinfo(np.int64).max
                signature[i] = min(signature[i], hash_value)
        return signature

    def compute_similarity(self, signature1, signature2):
        return np.mean(signature1 == signature2)


# 示例用法
document1 = "This is a document about cats"
document2 = "This is a document about dogs"
minhash = MinHash(num_perm=128)
signature1 = minhash.compute_signature(document1)
signature2 = minhash.compute_signature(document2)
similarity = minhash.compute_similarity(signature1, signature2)
print(f"Similarity between the documents: {similarity}")
```

在上述示例代码中,我们首先定义了一个MinHash类,它接受参数`num_perm`,表示要使用的哈希函数数量。在初始化时,我们生成了一组随机排列用于哈希计算。

`_hash_value`方法使用SHA1算法对输入值进行哈希计算,并返回哈希值的十六进制表示。`compute_hash`方法将字符串值转换为哈希码。

`compute_signature`方法计算给定文档的MinHash签名。对于文档中的每个词,我们计算其哈希值,并将其与每个哈希函数的参数相乘并取模。然后,我们将每个哈希函数的最小值作为文档的签名。

`compute_similarity`方法计算两个文档的相似度。它简单地计算两个签名之间相等哈希函数的比例。

在示例用法中,我们创建了两个文档,并使用MinHash计算它们的签名。然后,我们计算了两个签名之间的相似度,并打印了结果。

请注意,此处的示例代码是简化版的MinHash实现,并且可能不适用于大规模数据集。在实际应用中,您可能需要使用更高效的数据结构和算法来处理大量数据。
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

农民小飞侠

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值