python字符串模糊匹配_基于字符串的模糊匹配

最新推荐文章于 2024-08-18 03:31:57 发布

weixin_39719018

最新推荐文章于 2024-08-18 03:31:57 发布

阅读量3.2k

点赞数

文章标签： python字符串模糊匹配

近期由于数据库中保存的一些类似小区名称，街道名称存在简写，错别字等不规范的现象，需要将不规范的书写进行纠错改正。在进行纠错的过程中用到了【编辑距离】的计算方式来与对照表进行精确匹配。

编辑距离

1.Levenshtein距离是一种计算两个字符串间的差异程度的字符串度量(string metric)。我们可以认为Levenshtein距离就是从一个字符串修改到另一个字符串时，其中编辑单个字符(比如修改、插入、删除)所需要的最少次数。

2.jaro距离

3.jaro-winkler距离

注：其中的相似度 = 1 - 距离

由于 jaro 距离的计算中存在局部匹配窗口（search range）的概念：即使两个字符串中出现了相同的子串，只要二者的位置偏移超过了窗口长度，就不会被计入匹配。而业务数据大多带有一些比较长的前缀，这会影响最终匹配的准确度。因此将匹配窗口的长度放大到两个比较字符串中较长串的长度，并对包中的部分源码做了相应修改，python代码如下:

def count_matches(s1, s2, len1, len2, search_range=None):
    """Greedily pair equal characters of ``s1`` with characters of ``s2``.

    For each character of ``s1`` (in order) the first not-yet-used position
    of ``s2`` inside the search window that holds the same character is
    claimed as its match.

    Args:
        s1: The shorter (or equal-length) string.
        s2: The longer (or equal-length) string.
        len1: Length of ``s1``; must be non-zero and ``<= len2``.
        len2: Length of ``s2``.
        search_range: Half-width of the window around index ``i`` of ``s1``
            that is scanned in ``s2``.  ``None`` (the default) uses ``len2``,
            i.e. the whole of ``s2`` is scanned, so that strings differing
            by a long prefix still match.  Pass ``max(len2 // 2 - 1, 0)``
            to get the narrower window this code originally used.

    Returns:
        A tuple ``(num_matches, flags1, flags2)`` where ``flags1`` and
        ``flags2`` are lists of 0/1 marking the matched positions of ``s1``
        and ``s2`` respectively.
    """
    assert len1 and len1 <= len2
    if search_range is None:
        search_range = len2

    num_matches = 0
    flags1 = [0] * len1
    flags2 = [0] * len2
    for i, char in enumerate(s1):
        lolim = max(i - search_range, 0)
        hilim = min(i + search_range, len2 - 1)
        for j in range(lolim, hilim + 1):
            if not flags2[j] and char == s2[j]:
                flags1[i] = flags2[j] = 1
                num_matches += 1
                # Each character of s1 may claim only one position of s2.
                break
    return num_matches, flags1, flags2

def count_half_transpositions(s1, s2, flags1, flags2):
    """Count matched characters that appear in a different order.

    Walks the flagged (truthy) positions of ``s1`` and ``s2`` in parallel:
    the n-th flagged character of ``s1`` is compared with the n-th flagged
    character of ``s2``, and every pair that differs adds one to the count.

    Args:
        s1: First string.
        s2: Second string.
        flags1: Per-position match flags for ``s1``.
        flags2: Per-position match flags for ``s2``; it must contain at
            least as many truthy entries as ``flags1``, otherwise the scan
            runs off the end of the list.

    Returns:
        The number of differing pairs (the "half transpositions").
    """
    half_transposes = 0
    k = 0
    for i, flag in enumerate(flags1):
        if not flag:
            continue
        # Advance to the next matched position of s2.
        while not flags2[k]:
            k += 1
        if s1[i] != s2[k]:
            half_transposes += 1
        k += 1
    return half_transposes

def count_typos(s1, s2, flags1, flags2, typo_table):
    """Score similar-but-not-equal characters among the unmatched positions.

    For every unflagged character of ``s1`` that has a row in
    ``typo_table``, the first unflagged character of ``s2`` found in that
    row is consumed: its score is added to the total and its flag is set
    to 2 so it cannot be reused.

    Args:
        s1: First string.
        s2: Second string.
        flags1: Per-position match flags for ``s1``; must contain at least
            one 0 (i.e. at least one unmatched character).
        flags2: Per-position match flags for ``s2``.  Mutated in place.
        typo_table: Mapping ``char -> {similar_char: score}``.

    Returns:
        A tuple ``(typo_score, flags2)``; ``flags2`` is the same list object
        that was passed in.
    """
    assert 0 in flags1

    typo_score = 0
    for i, flag1 in enumerate(flags1):
        if flag1:
            continue  # Only unmatched characters of s1 are considered.
        row = s1[i]
        if row not in typo_table:
            # No similarity mapping for this character.
            continue
        typo_row = typo_table[row]
        for j, flag2 in enumerate(flags2):
            if flag2:
                continue
            col = s2[j]
            if col not in typo_row:
                continue
            typo_score += typo_row[col]
            flags2[j] = 2  # Mark as consumed by a typo pairing.
            break
    return typo_score, flags2

def fn_jaro(len1, len2, num_matches, half_transposes, typo_score, typo_scale):
    """Combine match statistics into a Jaro-style similarity weight.

    Args:
        len1: Length of the first string.
        len2: Length of the second string.
        num_matches: Number of matched characters.
        half_transposes: Number of matched characters that are out of order;
            it is halved with floor division before use.
        typo_score: Accumulated score for similar (non-equal) characters.
        typo_scale: Divisor applied to ``typo_score``.

    Returns:
        ``1.0`` if both lengths are zero, ``0.0`` if only ``len1`` is zero
        or there are no matches; otherwise the mean of
        ``similar / len1``, ``similar / len2`` and
        ``(num_matches - half_transposes // 2) / num_matches``, where
        ``similar = typo_score / typo_scale + num_matches``.
    """
    if not len1:
        if not len2:
            return 1.0
        return 0.0
    if not num_matches:
        return 0.0

    similar = (typo_score / typo_scale) + num_matches
    weight = (similar / len1
              + similar / len2
              + (num_matches - half_transposes // 2) / num_matches)
    return weight / 3

def string_metrics(s1, s2, typo_table=None, typo_scale=1, boost_threshold=None,
                   pre_len=0, pre_scale=0, longer_prob=False):
    """Collect the raw statistics needed to score two strings.

    The strings are swapped if necessary so that the first one is the
    shorter; the returned lengths reflect that ordering.

    Args:
        s1: First string.
        s2: Second string.
        typo_table: Optional mapping ``char -> {similar_char: score}``
            passed to ``count_typos`` when unmatched characters remain.
        typo_scale: Divisor for the typo score, forwarded to ``fn_jaro``.
        boost_threshold: If falsy, the prefix / long-string adjustments are
            skipped entirely.  Otherwise they are evaluated only when the
            weight from ``fn_jaro`` exceeds this value.
        pre_len: Maximum number of leading characters examined for a common
            alphabetic prefix.
        pre_scale: Not used by this function.
        longer_prob: Whether to evaluate the long-string adjustment flag.

    Returns:
        A 7-tuple ``(len1, len2, num_matches, half_transposes, typo_score,
        pre_matches, adjust_long)``.
    """
    len1 = len(s1)
    len2 = len(s2)
    if len2 < len1:
        s1, s2 = s2, s1
        len1, len2 = len2, len1
    assert len1 <= len2

    if not (len1 and len2):
        return len1, len2, 0, 0, 0, 0, False

    num_matches, flags1, flags2 = count_matches(s1, s2, len1, len2)

    # If no characters in common - return
    if not num_matches:
        return len1, len2, 0, 0, 0, 0, False

    half_transposes = count_half_transpositions(s1, s2, flags1, flags2)

    # Adjust for similarities in non-matched characters
    typo_score = 0
    if typo_table and len1 > num_matches:
        typo_score, flags2 = count_typos(s1, s2, flags1, flags2, typo_table)

    if not boost_threshold:
        # Last field is False (== 0) to agree with the other return paths.
        return len1, len2, num_matches, half_transposes, typo_score, 0, False

    pre_matches = 0
    adjust_long = False
    weight_typo = fn_jaro(len1, len2, num_matches, half_transposes,
                          typo_score, typo_scale)

    # Continue to boost the weight if the strings are similar
    if weight_typo > boost_threshold:
        # Adjust for having up to first 'pre_len' chars (not digits) in common
        limit = min(len1, pre_len)
        while pre_matches < limit:
            char1 = s1[pre_matches]
            if not (char1.isalpha() and char1 == s2[pre_matches]):
                break
            pre_matches += 1

        # NOTE(review): the original indentation was lost; this branch is
        # assumed to belong inside the boost-threshold check - confirm.
        if longer_prob:
            cond = len1 > pre_len
            cond = cond and num_matches > pre_matches + 1
            cond = cond and 2 * num_matches >= len1 + pre_matches
            cond = cond and s1[0].isalpha()
            if cond:
                adjust_long = True

    return (len1, len2, num_matches, half_transposes,
            typo_score, pre_matches, adjust_long)

def metric_jaro(string1, string2):
    """Return the basic Jaro similarity of two strings.

    No typo table, prefix boost or long-string adjustment is applied.
    Note that ``count_matches`` in this file scans the whole longer string
    for each character, so the result can differ from the textbook Jaro
    metric, which limits matches to a narrower window.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        The similarity weight computed by ``fn_jaro``.
    """
    ans = string_metrics(string1, string2)
    len1, len2, num_matches, half_transposes = ans[:4]
    # With default options the three adjustment fields must all be "off".
    assert ans[4:] == (0, 0, False)
    return fn_jaro(len1, len2, num_matches, half_transposes, 0, 1)

def metric_jaro_score(s1, s2):
    """Return the similarity of ``s1`` and ``s2`` as given by ``metric_jaro``."""
    return metric_jaro(s1, s2)

# Demo: the second string is the first one without its 3-character prefix.
# All 7 characters of the shorter string match in order, so this should
# print approximately 0.9, i.e. (7/7 + 7/10 + 7/7) / 3.
print (metric_jaro_score("赛鼎线世纪明珠45号","世纪明珠45号"))

weixin_39719018

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫