# 一、计算两列字段的相似度
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from collections import Counter
import math
import nltk
# One-time fetch of the NLTK data used below: 'punkt' powers word_tokenize,
# 'stopwords' provides the English stop-word corpus (both cached locally).
for resource in ('punkt', 'stopwords'):
    download(resource)
# English stop words filtered out of both texts before similarity scoring.
stop_words = set(stopwords.words('english'))
# Load the sample workbook and normalise the two name columns so the
# similarity comparison is case- and punctuation-insensitive.
data = pd.read_excel('/Users/penny/Desktop/penny_样本.xlsx')
for col in ('full_name', 'payee_name'):
    # BUG FIX: the original used Series.replace({',': ' '}), which only
    # replaces cells whose ENTIRE value equals ',' — it never touched commas
    # or periods inside a name.  str.replace(..., regex=False) performs the
    # intended substring substitution.
    data[col] = (
        data[col]
        .str.upper()
        .str.replace(',', ' ', regex=False)
        .str.replace('.', ' ', regex=False)
    )
def cosine_similarity(text1, text2):
    """Cosine similarity of two texts over stop-word-filtered token counts.

    Both texts are tokenized with NLTK's word_tokenize, lower-cased, and
    English stop words are removed before building term-frequency vectors.

    Returns a float in [0, 1].  Returns 0.0 when either text yields no
    usable tokens (e.g. empty string, or all stop words) — the original
    code raised ZeroDivisionError in that case.
    """
    tokens1 = [word.lower() for word in word_tokenize(text1) if word.lower() not in stop_words]
    tokens2 = [word.lower() for word in word_tokenize(text2) if word.lower() not in stop_words]
    # Term-frequency vectors.
    count1 = Counter(tokens1)
    count2 = Counter(tokens2)
    # Dot product over the shared vocabulary only.
    dot_product = sum(count1[word] * count2[word] for word in count1 if word in count2)
    magnitude1 = math.sqrt(sum(c * c for c in count1.values()))
    magnitude2 = math.sqrt(sum(c * c for c in count2.values()))
    # Guard against empty vectors to avoid division by zero.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)
# # 测试代码
# text1 = "I like apples"
# text2 = "I love apples"
# similarity = cosine_similarity(text1, text2)
# print(similarity)
# 例如:左边有 3 个词、右边有 2 个词,且只有一个词相同时,返回约 0.41 (= 1/√6)
# 左边有 3 个词、右边有 3 个词,且只有一个词相同时,返回约 0.33 (= 1/3)
# Score each row: cosine similarity between the two cleaned name columns.
data['similarity'] = [
    cosine_similarity(full, payee)
    for full, payee in zip(data['full_name'], data['payee_name'])
]
#二、计算列表中,字符串出现的高频词
import re
from collections import Counter
# Sample strings used to demonstrate get_most_common_words below.
row_list=['Hello, world!', 'This is a test.', 'Hello, hello, hello.']
def get_most_common_words(string_list, num_common=10):
    """Return the num_common most frequent words across a list of strings.

    All strings are joined, lower-cased, and split into words with a
    \\b\\w+\\b regex; the result is a list of (word, count) pairs in
    descending frequency order, as produced by Counter.most_common.
    """
    # Fold the whole list into one lower-case text and extract word tokens.
    combined = ' '.join(string_list).lower()
    tokens = re.findall(r'\b\w+\b', combined)
    # Counter handles both the tallying and the top-k selection.
    return Counter(tokens).most_common(num_common)
# Show the most frequent words in the sample list (up to 30 of them).
top_words = get_most_common_words(row_list, num_common=30)
print(top_words)
# 计算两列字段的相似性 (compute similarity between two text columns)
# 于 2023-07-25 17:47:45 首次发布 (first published 2023-07-25 17:47:45)