【机器学习课程-华盛顿大学】:4 聚类和检索 4.2 编程测试:LSH代码实现

一、自己写lsh代码:

def train_lsh(data, num_vector=16, seed=None):
    
    dim = data.shape[1]
    if seed is not None:
        np.random.seed(seed)
    random_vectors = generate_random_vectors(num_vector, dim)
  
    powers_of_two = 1 << np.arange(num_vector-1, -1, -1)
  
    table = {}
    
    # Partition data points into bins
    bin_index_bits = (data.dot(random_vectors) >= 0)
  
    # Encode bin index bits into integers
    bin_indices = bin_index_bits.dot(powers_of_two)
    
    # Update `table` so that `table[i]` is the list of document ids with bin index equal to i.
    for data_index, bin_index in enumerate(bin_indices):
        if bin_index not in table:
            # If no list yet exists for this bin, assign the bin an empty list.
            table[bin_index] = []
        # Fetch the list of document ids associated with the bin and add the document id to the end.
        table[bin_index].append(data_index)

    model = {'data': data,
             'bin_index_bits': bin_index_bits,
             'bin_indices': bin_indices,
             'table': table,
             'random_vectors': random_vectors,
             'num_vector': num_vector}
    
    return model
model = train_lsh(corpus, num_vector=16, seed=143)
table = model['table']
if   0 in table and table[0]   == [39583] and \
   143 in table and table[143] == [19693, 28277, 29776, 30399]:
    print 'Passed!'
else:
    print 'Check your code.'

 

二、学了一个好玩的

 

doc_obama = corpus[35817, :]  # first document
index_bits_obama = (doc_obama.dot(random_vectors) >= 0)
powers_of_two = (1 << np.arange(15, -1, -1))
print index_bits_obama
print powers_of_two
print index_bits_obama.dot(powers_of_two)

 

三、自己写lsh最近邻代码:

def search_nearby_bins(query_bin_bits, table, search_radius=2, initial_candidates=set()):
    """
    For a given query vector and trained LSH model, return all candidate neighbors for
    the query among all bins within the given search radius.
    
    Example usage
    -------------
    >>> model = train_lsh(corpus, num_vector=16, seed=143)
    >>> q = model['bin_index_bits'][0]  # vector for the first document
  
    >>> candidates = search_nearby_bins(q, model['table'])
    """
    num_vector = len(query_bin_bits)
    powers_of_two = 1 << np.arange(num_vector-1, -1, -1)
    
    # Allow the user to provide an initial set of candidates.
    candidate_set = copy(initial_candidates)
    
    for different_bits in combinations(range(num_vector), search_radius):       
        # Flip the bits (n_1,n_2,...,n_r) of the query bin to produce a new bit vector.
        ## Hint: you can iterate over a tuple like a list
        alternate_bits = copy(query_bin_bits)
        for i in different_bits:
            alternate_bits[i] = not query_bin_bits[i]
        
        # Convert the new bit vector to an integer index
        nearby_bin = alternate_bits.dot(powers_of_two)
        
        # Fetch the list of documents belonging to the bin indexed by the new bit vector.
        # Then add those documents to candidate_set
        # Make sure that the bin exists in the table!
        # Hint: update() method for sets lets you add an entire list to the set
        if nearby_bin in table:
            nearby_docs = model['table'][nearby_bin]
            candidate_set.update(nearby_docs)
            
    return candidate_set
obama_bin_index = model['bin_index_bits'][35817] # bin index of Barack Obama
candidate_set = search_nearby_bins(obama_bin_index, model['table'], search_radius=0)
if candidate_set == set([35817, 21426, 53937, 39426, 50261]):
    print 'Passed test'
else:
    print 'Check your code'
print 'List of documents in the same bin as Obama: 35817, 21426, 53937, 39426, 50261'

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
the code from net,share with you: 获取图像库里所有图片文件的名称 Blue hills.jpg Lena.jpg myhand1.jpg myhand2.jpg Sunset.jpg Test2.jpg Test3.JPG Water lilies.jpg Winter.jpg 获取图像特征 0 36 82 115 132 164 200 142 129 119 54 1 198 141 101 64 43 65 124 200 197 186 2 104 40 25 34 81 88 56 108 131 199 3 142 56 43 61 117 121 107 160 120 200 4 46 67 80 104 200 193 90 66 59 43 5 121 110 141 149 200 195 171 144 123 116 6 99 125 116 88 68 82 99 141 200 104 7 200 178 175 151 154 183 156 159 164 177 8 131 161 192 199 181 182 186 174 147 129 获取随机点位 将整数转换为01 特征值MD5压缩处理 1 733c7ccfc61b08a62633fc2356b90478 2 bdc6c147ad15bcf1bd91e343ec15d4f8 3 70a0d1a5d191e0bd3ceb65003ce4e386 4 1d8d003fe0d417eb1d3413773e58f4de 5 b04a5fd1301bf66c192a23e155236dd5 6 0d7afe69ff25e61710e7f63d5a7779ca 7 0b534a69d1eb5edfd7fef486b9270fc7 8 373b04655e21dee1372e374916a7774f 9 68b400bf74bc815844f58e19ead1ceb7 1 733c7ccfc61b08a62633fc2356b90478 2 bdc6c147ad15bcf1bd91e343ec15d4f8 3 70a0d1a5d191e0bd3ceb65003ce4e386 4 1d8d003fe0d417eb1d3413773e58f4de 5 b04a5fd1301bf66c192a23e155236dd5 6 0d7afe69ff25e61710e7f63d5a7779ca 7 0b534a69d1eb5edfd7fef486b9270fc7 8 373b04655e21dee1372e374916a7774f 9 68b400bf74bc815844f58e19ead1ceb7 1 733c7ccfc61b08a62633fc2356b90478 2 bdc6c147ad15bcf1bd91e343ec15d4f8 3 70a0d1a5d191e0bd3ceb65003ce4e386 4 1d8d003fe0d417eb1d3413773e58f4de 5 b04a5fd1301bf66c192a23e155236dd5 6 0d7afe69ff25e61710e7f63d5a7779ca 7 0b534a69d1eb5edfd7fef486b9270fc7 8 373b04655e21dee1372e374916a7774f 9 68b400bf74bc815844f58e19ead1ceb7 1 733c7ccfc61b08a62633fc2356b90478 2 bdc6c147ad15bcf1bd91e343ec15d4f8 3 70a0d1a5d191e0bd3ceb65003ce4e386 4 1d8d003fe0d417eb1d3413773e58f4de 5 b04a5fd1301bf66c192a23e155236dd5 6 0d7afe69ff25e61710e7f63d5a7779ca 7 0b534a69d1eb5edfd7fef486b9270fc7 8 373b04655e21dee1372e374916a7774f 9 68b400bf74bc815844f58e19ead1ceb7 1 733c7ccfc61b08a62633fc2356b90478 2 bdc6c147ad15bcf1bd91e343ec15d4f8 3 70a0d1a5d191e0bd3ceb65003ce4e386 4 1d8d003fe0d417eb1d3413773e58f4de 5 b04a5fd1301bf66c192a23e155236dd5 6 0d7afe69ff25e61710e7
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值