# 一、计算两列字段的相似度
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from collections import Counter
import math
import nltk
# One-time fetch of the NLTK data used below: 'punkt' powers word_tokenize,
# 'stopwords' provides the English stop-word corpus (both cached locally).
for resource in ('punkt', 'stopwords'):
    download(resource)
# English stop words filtered out of both texts before similarity scoring.
stop_words = set(stopwords.words('english'))
# Load the sample workbook and normalise the two name columns so the
# similarity comparison is case- and punctuation-insensitive.
data = pd.read_excel('/Users/penny/Desktop/penny_样本.xlsx')
for col in ('full_name', 'payee_name'):
    # BUG FIX: the original used Series.replace({',': ' '}), which only
    # replaces cells whose ENTIRE value equals ',' — it never touched commas
    # or periods inside a name.  str.replace(..., regex=False) performs the
    # intended substring substitution.
    data[col] = (
        data[col]
        .str.upper()
        .str.replace(',', ' ', regex=False)
        .str.replace('.', ' ', regex=False)
    )
def cosine_similarity(text1, text2):
    """Cosine similarity of two texts over stop-word-filtered token counts.

    Both texts are tokenized with NLTK's word_tokenize, lower-cased, and
    English stop words are removed before building term-frequency vectors.

    Returns a float in [0, 1].  Returns 0.0 when either text yields no
    usable tokens (e.g. empty string, or all stop words) — the original
    code raised ZeroDivisionError in that case.
    """
    tokens1 = [word.lower() for word in word_tokenize(text1) if word.lower() not in stop_words]
    tokens2 = [word.lower() for word in word_tokenize(text2) if word.lower() not in stop_words]
    # Term-frequency vectors.
    count1 = Counter(tokens1)
    count2 = Counter(tokens2)
    # Dot product over the shared vocabulary only.
    dot_product = sum(count1[word] * count2[word] for word in count1 if word in count2)
    magnitude1 = math.sqrt(sum(c * c for c in count1.values()))
    magnitude2 = math.sqrt(sum(c * c for c in count2.values()))
    # Guard against empty vectors to avoid division by zero.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)
# # 测试代码
# text1 = "I like apples"
# text2 = "I love apples"
# similarity = cosine_similarity(text1, text2)
# print(similarity)
# 例如:左边有 3 个词、右边有 2 个词,且只有一个词相同时,返回约 0.41 (= 1/√6)
# 左边有 3 个词、右边有 3 个词,且只有一个词相同时,返回约 0.33 (= 1/3)
# Score each row: cosine similarity between the two cleaned name columns.
data['similarity'] = [
    cosine_similarity(full, payee)
    for full, payee in zip(data['full_name'], data['payee_name'])
]
#二、计算列表中,字符串出现的高频词
import re
from collections import Counter
# Sample strings used to demonstrate get_most_common_words below.
row_list=['Hello, world!', 'This is a test.', 'Hello, hello, hello.']
def get_most_common_words(string_list, num_common=10):
    """Return the num_common most frequent words across a list of strings.

    All strings are joined, lower-cased, and split into words with a
    \\b\\w+\\b regex; the result is a list of (word, count) pairs in
    descending frequency order, as produced by Counter.most_common.
    """
    # Fold the whole list into one lower-case text and extract word tokens.
    combined = ' '.join(string_list).lower()
    tokens = re.findall(r'\b\w+\b', combined)
    # Counter handles both the tallying and the top-k selection.
    return Counter(tokens).most_common(num_common)
# Show the most frequent words in the sample list (up to 30 of them).
top_words = get_most_common_words(row_list, num_common=30)
print(top_words)
# 计算两列字段的相似性 (compute similarity between two text columns)
# 于 2023-07-25 17:47:45 首次发布 (first published 2023-07-25 17:47:45)