TF/05_Nearest_Neighbor_Methods/TF/05_Nearest_Neighbor_Methods

TF/05_Nearest_Neighbor_Methods

# 05_address_matching.py
# Address Matching with k-Nearest Neighbors
#----------------------------------
#
# This function illustrates a way to perform
# address matching between two data sets.
#
# For each test address, we will return the
# closest reference address to it.
#
# We will consider two distance functions:
# 1) Edit distance for street number/name and
# 2) Euclidian distance (L2) for the zip codes

import random
import string
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
ops.reset_default_graph()

# First we generate the data sets we will need
# n = Size of created data sets
n = 10
street_names = ['abbey', 'baker', 'canal', 'donner', 'elm']
street_types = ['rd', 'st', 'ln', 'pass', 'ave']

random.seed(31)  #make results reproducible
rand_zips = [random.randint(65000,65999) for i in range(5)]

# Function to randomly create one typo in a string w/ a probability
def create_typo(s, prob=0.75):
    if random.uniform(0,1) < prob:
        rand_ind = random.choice(range(len(s)))
        s_list = list(s)
        s_list[rand_ind]=random.choice(string.ascii_lowercase)
        s = ''.join(s_list)
    return(s)

# Generate the reference dataset
numbers = [random.randint(1, 9999) for i in range(n)]
streets = [random.choice(street_names) for i in range(n)]
street_suffs = [random.choice(street_types) for i in range(n)]
zips = [random.choice(rand_zips) for i in range(n)]
full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, streets, street_suffs)]
reference_data = [list(x) for x in zip(full_streets,zips)]

# Generate test dataset with some typos
typo_streets = [create_typo(x) for x in streets]
typo_full_streets = [str(x) + ' ' + y + ' ' + z for x,y,z in zip(numbers, typo_streets, street_suffs)]
test_data = [list(x) for x in zip(typo_full_streets,zips)]

# Now we can perform address matching
# Create graph
sess = tf.Session()

# Placeholders
test_address = tf.sparse_placeholder( dtype=tf.string)
test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)

# Declare Zip code distance for a test zip and reference set
zip_dist = tf.square(tf.subtract(ref_zip, test_zip))

# Declare Edit distance for address
address_dist = tf.edit_distance(test_address, ref_address, normalize=True)

# Create similarity scores
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))
zip_sim = tf.div(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))
address_sim = tf.subtract(1., address_dist)

# Combine distance functions
address_weight = 0.5
zip_weight = 1. - address_weight
weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)), tf.multiply(zip_weight, zip_sim))

# Predict: Get max similarity entry
top_match_index = tf.argmax(weighted_sim, 1)


# Function to Create a character-sparse tensor from strings
def sparse_from_word_vec(word_vec):
    num_words = len(word_vec)
    indices = [[xi, 0, yi] for xi,x in enumerate(word_vec) for yi,y in enumerate(x)]
    chars = list(''.join(word_vec))
    return(tf.SparseTensorValue(indices, chars, [num_words,1,1]))

# Loop through test indices
reference_addresses = [x[0] for x in reference_data]
reference_zips = np.array([[x[1] for x in reference_data]])

# Create sparse address reference set
sparse_ref_set = sparse_from_word_vec(reference_addresses)

for i in range(n):
    test_address_entry = test_data[i][0]
    test_zip_entry = [[test_data[i][1]]]

    # Create sparse address vectors
    test_address_repeated = [test_address_entry] * n
    sparse_test_set = sparse_from_word_vec(test_address_repeated)

    feeddict={test_address: sparse_test_set,
               test_zip: test_zip_entry,
               ref_address: sparse_ref_set,
               ref_zip: reference_zips}
    best_match = sess.run(top_match_index, feed_dict=feeddict)
    best_street = reference_addresses[best_match[0]]
    [best_zip] = reference_zips[0][best_match]
    [[test_zip_]] = test_zip_entry
    print('Address: ' + str(test_address_entry) + ', ' + str(test_zip_))
    print('Match  : ' + str(best_street) + ', ' + str(best_zip))
Address: 2308 bakar rd, 65480
Match  : 2308 baker rd, 65480
Address: 709 bakeo pass, 65480
Match  : 709 baker pass, 65480
Address: 2273 glm ln, 65782
Match  : 2273 elm ln, 65782
Address: 1843 donner st, 65402
Match  : 1843 donner st, 65402
Address: 8769 klm st, 65402
Match  : 8769 elm st, 65402
Address: 3798 dpnner ln, 65012
Match  : 3798 donner ln, 65012
Address: 2288 bajer pass, 65012
Match  : 2288 baker pass, 65012
Address: 2416 epm ln, 65480
Match  : 2416 elm ln, 65480
Address: 543 abgey ave, 65115
Match  : 543 abbey ave, 65115
Address: 994 abbey st, 65480
Match  : 994 abbey st, 65480
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值