一、自己写lsh代码:
def train_lsh(data, num_vector=16, seed=None):
dim = data.shape[1]
if seed is not None:
np.random.seed(seed)
random_vectors = generate_random_vectors(num_vector, dim)
powers_of_two = 1 << np.arange(num_vector-1, -1, -1)
table = {}
# Partition data points into bins
bin_index_bits = (data.dot(random_vectors) >= 0)
# Encode bin index bits into integers
bin_indices = bin_index_bits.dot(powers_of_two)
# Update `table` so that `table[i]` is the list of document ids with bin index equal to i.
for data_index, bin_index in enumerate(bin_indices):
if bin_index not in table:
# If no list yet exists for this bin, assign the bin an empty list.
table[bin_index] = []
# Fetch the list of document ids associated with the bin and add the document id to the end.
table[bin_index].append(data_index)
model = {'data': data,
'bin_index_bits': bin_index_bits,
'bin_indices': bin_indices,
'table': table,
'random_vectors': random_vectors,
'num_vector': num_vector}
return model
model = train_lsh(corpus, num_vector=16, seed=143)
table = model['table']
if 0 in table and table[0] == [39583] and \
143 in table and table[143] == [19693, 28277, 29776, 30399]:
print 'Passed!'
else:
print 'Check your code.'
二、学了一个好玩的
doc_obama = corpus[35817, :] # first document
index_bits_obama = (doc_obama.dot(random_vectors) >= 0)
powers_of_two = (1 << np.arange(15, -1, -1))
print index_bits_obama
print powers_of_two
print index_bits_obama.dot(powers_of_two)
三、自己写lsh最近邻代码:
def search_nearby_bins(query_bin_bits, table, search_radius=2, initial_candidates=set()):
"""
For a given query vector and trained LSH model, return all candidate neighbors for
the query among all bins within the given search radius.
Example usage
-------------
>>> model = train_lsh(corpus, num_vector=16, seed=143)
>>> q = model['bin_index_bits'][0] # vector for the first document
>>> candidates = search_nearby_bins(q, model['table'])
"""
num_vector = len(query_bin_bits)
powers_of_two = 1 << np.arange(num_vector-1, -1, -1)
# Allow the user to provide an initial set of candidates.
candidate_set = copy(initial_candidates)
for different_bits in combinations(range(num_vector), search_radius):
# Flip the bits (n_1,n_2,...,n_r) of the query bin to produce a new bit vector.
## Hint: you can iterate over a tuple like a list
alternate_bits = copy(query_bin_bits)
for i in different_bits:
alternate_bits[i] = not query_bin_bits[i]
# Convert the new bit vector to an integer index
nearby_bin = alternate_bits.dot(powers_of_two)
# Fetch the list of documents belonging to the bin indexed by the new bit vector.
# Then add those documents to candidate_set
# Make sure that the bin exists in the table!
# Hint: update() method for sets lets you add an entire list to the set
if nearby_bin in table:
nearby_docs = model['table'][nearby_bin]
candidate_set.update(nearby_docs)
return candidate_set
obama_bin_index = model['bin_index_bits'][35817] # bin index of Barack Obama
candidate_set = search_nearby_bins(obama_bin_index, model['table'], search_radius=0)
if candidate_set == set([35817, 21426, 53937, 39426, 50261]):
print 'Passed test'
else:
print 'Check your code'
print 'List of documents in the same bin as Obama: 35817, 21426, 53937, 39426, 50261'