Similarity-based recommendation + TF-IDF + sklearn: recommending poems by their translations

Data

Download link: https://download.csdn.net/download/u010379996/15743786

100 poems, with columns: Index(['id', 'title', 'era', 'author', 'content', 'translations'], dtype='object')

Code

import pandas as pd
import numpy as np
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

def get_recommendations(data, indices, search, cosine_sim, size=5):
    """Return the `size` poems most similar to the poem titled `search`."""
    idx = indices[search]

    # Pair each row index with its similarity to the query poem,
    # then sort by similarity in descending order.
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Skip position 0, which is the query poem itself (similarity 1.0).
    sim_scores = sim_scores[1:size + 1]
    ids = [i[0] for i in sim_scores]
    sim = [i[1] for i in sim_scores]

    # Copy to avoid pandas' SettingWithCopyWarning when adding the column.
    d = data.iloc[ids].copy()
    d['sim'] = sim
    return d

def cut_word(text):
    # Keep only Chinese characters, then segment with jieba.
    cleaned_data = ''.join(re.findall(r'[\u4e00-\u9fa5]', text))
    wordlist = jieba.lcut(cleaned_data)
    return wordlist

def cut_word_str(text):
    # Space-separated tokens, the format TfidfVectorizer expects.
    return " ".join(cut_word(text))

def cut_data(data, name, cut_name=None):
    # Tokenize column `name`; write the result back to `name`,
    # or to `cut_name` if a separate output column is given.
    words = [cut_word_str(d) for d in data[name]]
    if cut_name is None:
        data[name] = words
    else:
        data[cut_name] = words
    return data

## Load data
datapath = ""
data = pd.read_csv(datapath + "content.csv")
print(data.columns)
d = data[['id', 'title', 'translations']].copy()  # copy so cut_data can write back safely
## Tokenize the translations
d = cut_data(d, 'translations')

## Compute pairwise similarity
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(d['translations'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Map each title to its row index so poems can be looked up by title.
indices = pd.Series(d.index, index=d['title']).drop_duplicates()

## Print the poems most similar to "行宫"
print(get_recommendations(d, indices, '行宫', cosine_sim)[['id', 'title', 'sim']])
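
As a side note, linear_kernel works as cosine similarity here because TfidfVectorizer L2-normalizes each row by default (norm='l2'), so the plain dot product of two rows is already their cosine. A minimal sanity check, assuming the variables above are in scope:

from sklearn.metrics.pairwise import cosine_similarity

# With the default L2 normalization, the dot product of TF-IDF rows
# equals their cosine similarity.
assert np.allclose(cosine_sim, cosine_similarity(tfidf_matrix, tfidf_matrix))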

Results

    id    title       sim
89  90  贼平后送人北归  0.091439
35  36   宫词/宫中词  0.082179
72  73      乌衣巷  0.081442
85  86     归嵩山作  0.050939
6    7      竹里馆  0.048772
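
The lookup above only works for poems already in the corpus. To recommend by an arbitrary piece of translation-style text, the query can be segmented the same way and projected with the fitted vectorizer. A minimal sketch under that assumption; the helper name recommend_by_text and the example query string are hypothetical:

def recommend_by_text(data, tfidf, tfidf_matrix, text, size=5):
    # Segment the query like the corpus, project it into the fitted
    # TF-IDF space, and rank all poems by cosine similarity to it.
    query_vec = tfidf.transform([cut_word_str(text)])
    scores = linear_kernel(query_vec, tfidf_matrix).flatten()
    ids = scores.argsort()[::-1][:size]
    result = data.iloc[ids].copy()
    result['sim'] = scores[ids]
    return result

# Hypothetical free-text query describing a lonely palace scene.
print(recommend_by_text(d, tfidf, tfidf_matrix, '寂寞的宫殿里宫花盛开')[['id', 'title', 'sim']])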
