这里写自定义目录标题
欢迎使用Markdown编辑器
你好!
新的改变
如何插入一段漂亮的代码片
姓名相似度的批量查找与替换(代码片):
import pandas as pd
from fuzzywuzzy import fuzz
def match_similar_names(names, threshold=80, scorer=None):
    """Map each name to the earliest-seen name it closely resembles.

    Names are processed in order. A name whose best similarity score against
    the canonical names collected so far is strictly greater than
    ``threshold`` is mapped to that canonical name; otherwise the name itself
    becomes a new canonical name.

    Args:
        names: Iterable of name strings.
        threshold: A match requires ``score > threshold``.
        scorer: Callable ``scorer(candidate, name)`` returning a comparable
            similarity score. Defaults to ``fuzz.ratio``.

    Returns:
        A list of ``(original_name, canonical_name)`` tuples in input order.
    """
    if scorer is None:
        # Looked up lazily so callers that pass their own scorer never touch
        # the fuzzy-matching package.
        scorer = fuzz.ratio

    # A list (not a set) keeps first-seen order, so ties between equally
    # similar candidates are always resolved the same way on every run.
    canonical_names = []
    pairs = []
    for name in names:
        best_name, best_score = None, None
        for candidate in canonical_names:
            score = scorer(candidate, name)
            # Strict ">" keeps the earliest candidate when scores tie.
            if best_score is None or score > best_score:
                best_name, best_score = candidate, score

        if best_name is not None and best_score > threshold:
            pairs.append((name, best_name))
        else:
            canonical_names.append(name)
            pairs.append((name, name))
    return pairs


if __name__ == "__main__":
    # Sample data: names that only match approximately.
    customer_names = ["John Doe", "Jon D.", "Jane Smith", "Smith Jane", "j. smith", "J. Doe", "Alice Johnson"]
    data = match_similar_names(customer_names)
    # Collect the original and canonical names side by side in one DataFrame.
    df = pd.DataFrame(data, columns=["原始姓名", "规范后的唯一姓名"])
    print(df)
如果还存在姓和名位置颠倒的情况,我们就使用下面的代码:
import pandas as pd
from fuzzywuzzy import fuzz
def _swapped_two_part_name(name):
    """Return ``"last first"`` for a two-part name, or None for any other shape."""
    parts = name.split()
    if len(parts) != 2:
        return None
    first_name, last_name = parts
    return f"{last_name} {first_name}"


def match_similar_names(names, threshold=70, scorer=None):
    """Map each name to the earliest-seen name it resembles, in either word order.

    Every name is compared with the canonical names collected so far, both as
    written and (for two-part names only) with its two parts swapped. The
    higher of the two scores counts. A best score strictly greater than
    ``threshold`` maps the name to that canonical name; otherwise the name
    itself becomes a new canonical name.

    Args:
        names: Iterable of name strings.
        threshold: A match requires ``score > threshold``.
        scorer: Callable ``scorer(candidate, name)`` returning a comparable
            similarity score. Defaults to ``fuzz.ratio``.

    Returns:
        A list of ``(original_name, canonical_name)`` tuples in input order.
    """
    if scorer is None:
        # Looked up lazily so callers that pass their own scorer never touch
        # the fuzzy-matching package.
        scorer = fuzz.ratio

    # A list keeps first-seen order, which makes tie-breaking reproducible.
    canonical_names = []
    pairs = []
    for name in names:
        # Recomputed for every name: names that do not have exactly two parts
        # get no swapped variant instead of inheriting one from an earlier
        # iteration (or failing when no earlier iteration defined it).
        variants = [name]
        swapped = _swapped_two_part_name(name)
        if swapped is not None:
            variants.append(swapped)

        best_name, best_score = None, None
        for candidate in canonical_names:
            # Score both word orders against every candidate, so a swapped
            # match is found even when another candidate is the closest
            # match for the name as written.
            score = max(scorer(candidate, variant) for variant in variants)
            if best_score is None or score > best_score:
                best_name, best_score = candidate, score

        if best_name is not None and best_score > threshold:
            pairs.append((name, best_name))
        else:
            canonical_names.append(name)
            pairs.append((name, name))
    return pairs


if __name__ == "__main__":
    # Sample data: names that only match approximately.
    customer_names = ["John Doe", "Jon D.", "Jane Smith", "Smith Jane", "j. smith", "J. Doe", "Alice Johnson"]
    data = match_similar_names(customer_names)
    # Collect the original and canonical names side by side in one DataFrame.
    df = pd.DataFrame(data, columns=["原始姓名", "规范后的唯一姓名"])
    print(df)
但是对于由三个部分组成的姓名(例如带中间名或缩写的 "Alice J Jane"),上面的代码只处理恰好两个部分的情况,效果并不佳:
import pandas as pd
from fuzzywuzzy import fuzz
def _name_variants(name):
    """Return the name as written, plus its swapped form when it has two parts."""
    parts = name.split()
    if len(parts) == 2:
        first_name, last_name = parts
        return [name, f"{last_name} {first_name}"]
    # Any other number of parts has no swapped form in this approach.
    return [name]


def match_similar_names(names, threshold=85, scorer=None):
    """Map each name to the earliest-seen name it resembles, in either word order.

    Each name is scored against the canonical names collected so far. Two-part
    names are scored both as written and with the parts swapped, and the
    higher score counts; names with any other number of parts are scored as
    written only. A best score strictly greater than ``threshold`` maps the
    name to that canonical name; otherwise the name becomes canonical itself.

    Args:
        names: Iterable of name strings.
        threshold: A match requires ``score > threshold``.
        scorer: Callable ``scorer(candidate, name)`` returning a comparable
            similarity score. Defaults to ``fuzz.ratio``.

    Returns:
        A list of ``(original_name, canonical_name)`` tuples in input order.
    """
    if scorer is None:
        # Looked up lazily so callers that pass their own scorer never touch
        # the fuzzy-matching package.
        scorer = fuzz.ratio

    # A list keeps first-seen order, which makes tie-breaking reproducible.
    canonical_names = []
    pairs = []
    for name in names:
        # Variants are derived from the current name only, so a three-part
        # name is never compared using the swapped form of a previous name.
        variants = _name_variants(name)

        best_name, best_score = None, None
        for candidate in canonical_names:
            # Take the better of the direct and swapped scores per candidate.
            score = max(scorer(candidate, variant) for variant in variants)
            if best_score is None or score > best_score:
                best_name, best_score = candidate, score

        if best_name is not None and best_score > threshold:
            pairs.append((name, best_name))
        else:
            canonical_names.append(name)
            pairs.append((name, name))
    return pairs


if __name__ == "__main__":
    # Sample data: names that only match approximately, including three-part names.
    customer_names = ["John Doe", "Jon D.", "Jane Smith", "Smith Jane", 'Alice J Jane', 'Jane Alice J', 'Jane Alice John', 'Jane Alice', "j. smith", "J. Doe", "Alice Johnson", 'Vladek Kasperchik', 'Kasperchik Vladek', 'Kasperchik Vladek P']
    data = match_similar_names(customer_names)
    # Collect the original and canonical names side by side in one DataFrame.
    df = pd.DataFrame(data, columns=["原始姓名", "规范后的唯一姓名"])
    print(df)
或者换成下面这种写法:先根据各部分的长度调整姓名中各部分的顺序,再做相似度匹配:
import pandas as pd
from fuzzywuzzy import fuzz
def normalize_name_order(name):
    """Reorder the whitespace-separated parts of a name by a length heuristic.

    When the first part is longer than the last part, the last part is moved
    to the front and the remaining parts keep their order (for example
    ``"Kasperchik Vladek"`` becomes ``"Vladek Kasperchik"``). Otherwise the
    parts are kept in order and joined with single spaces. Names with fewer
    than two parts are returned unchanged.

    Args:
        name: The name string to normalize.

    Returns:
        The normalized name string.
    """
    parts = name.split()
    if len(parts) < 2:
        # Nothing to reorder; this also covers empty or blank input.
        return name
    if len(parts[0]) > len(parts[-1]):
        # One rotation rule covers two, three, or more parts alike.
        parts = [parts[-1], *parts[:-1]]
    return " ".join(parts)


def match_similar_names(names, threshold=80, scorer=None):
    """Map each name to the earliest-seen normalized name it closely resembles.

    Every name is first passed through ``normalize_name_order``. If the best
    similarity score between the normalized name and the canonical names
    collected so far is strictly greater than ``threshold``, the original name
    is mapped to that canonical name; otherwise the normalized name becomes a
    new canonical name.

    Args:
        names: Iterable of name strings.
        threshold: A match requires ``score > threshold``.
        scorer: Callable ``scorer(candidate, normalized_name)`` returning a
            comparable similarity score. Defaults to ``fuzz.ratio``.

    Returns:
        A list of ``(original_name, canonical_name)`` tuples in input order.
    """
    if scorer is None:
        # Looked up lazily so callers that pass their own scorer never touch
        # the fuzzy-matching package.
        scorer = fuzz.ratio

    # A list keeps first-seen order, which makes tie-breaking reproducible.
    canonical_names = []
    pairs = []
    for name in names:
        normalized_name = normalize_name_order(name)

        best_name, best_score = None, None
        for candidate in canonical_names:
            score = scorer(candidate, normalized_name)
            # Strict ">" keeps the earliest candidate when scores tie.
            if best_score is None or score > best_score:
                best_name, best_score = candidate, score

        if best_name is not None and best_score > threshold:
            pairs.append((name, best_name))
        else:
            canonical_names.append(normalized_name)
            pairs.append((name, normalized_name))
    return pairs


if __name__ == "__main__":
    # Sample data: names that only match approximately, including three-part names.
    customer_names = ["John Doe", "Jon D.", "Jane Smith", "Smith Jane", 'Alice J Jane', 'Alice Jane', 'Jane Alice J', 'Jane Alice John', 'Jane Alice', "j. smith", "J. Doe", "Alice Johnson", 'Vladek Kasperchik', 'Kasperchik Vladek', 'Kasperchik Vladek P']
    data = match_similar_names(customer_names)
    # Collect the original and canonical names side by side in one DataFrame.
    df = pd.DataFrame(data, columns=["原始姓名", "规范后的唯一姓名"])
    print(df)
如何创建一个注脚
一个具有注脚的文本。1
注脚的解释 ↩︎