数据
下载链接:https://download.csdn.net/download/u010379996/15743786
诗词100首,数据列:Index(['id', 'title', 'era', 'author', 'content', 'translations'], dtype='object')
代码
import re

import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Matches one character in the \u4e00-\u9fa5 range; everything else
# (punctuation, digits, Latin letters, whitespace) is dropped before
# tokenizing.
_CJK_CHAR_RE = re.compile(r'[\u4e00-\u9fa5]')


def get_recommendations(data, indices, search, cosine_sim, size=5):
    """Return the rows of ``data`` most similar to the row named ``search``.

    Args:
        data: DataFrame whose row positions line up with the rows and
            columns of ``cosine_sim``.
        indices: Mapping (e.g. a Series) from a search key to a single
            integer row position. Keys must be unique; a duplicated key
            would yield several positions instead of one.
        search: Key to look up in ``indices``.
        cosine_sim: Square, indexable similarity matrix
            (``cosine_sim[i][j]`` is the similarity of rows i and j).
        size: Maximum number of rows to return.

    Returns:
        A new DataFrame with up to ``size`` rows of ``data``, ordered by
        descending similarity, plus a ``sim`` column holding the score.
        The query row itself is never included.

    Raises:
        KeyError: If ``search`` is not present in ``indices``.
    """
    idx = indices[search]
    scores = cosine_sim[idx]

    # Exclude the query row by position instead of assuming it sorts first:
    # an identical document (or an all-zero vector) can tie with it, and
    # then "skip the first entry" would drop a real neighbour and keep the
    # query in the output.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)[:size]

    # .copy() so that adding the column does not write into a slice of
    # ``data`` (avoids pandas' chained-assignment warning).
    result = data.iloc[ranked].copy()
    result['sim'] = [scores[pos] for pos in ranked]
    return result


def cut_word(text):
    """Tokenize ``text`` with jieba after keeping only CJK characters.

    Non-string input (for example the NaN that ``read_csv`` produces for an
    empty cell) yields an empty list instead of raising.
    """
    if not isinstance(text, str):
        return []
    cleaned = ''.join(_CJK_CHAR_RE.findall(text))
    return jieba.lcut(cleaned)


def cut_word_str(text):
    """Return the tokens of ``text`` joined by single spaces."""
    return " ".join(cut_word(text))


def cut_data(data, name, cut_name=None):
    """Tokenize column ``name`` of ``data`` in place and return ``data``.

    Args:
        data: DataFrame to modify.
        name: Column holding the raw text.
        cut_name: Column that receives the space-joined tokens. When it is
            ``None`` the tokens overwrite column ``name``.

    Returns:
        The same ``data`` object, for call chaining.
    """
    # Previously the tokens were thrown away whenever ``cut_name`` was
    # given; now they are stored in that column.
    target = name if cut_name is None else cut_name
    data[target] = [cut_word_str(text) for text in data[name]]
    return data


if __name__ == "__main__":
    # Load the data
    datapath = ""
    data = pd.read_csv(datapath + "content.csv")
    print(data.columns)
    # Independent copy: cut_data() assigns a column, which must not target
    # a slice of ``data``.
    d = data[['id', 'title', 'translations']].copy()

    # Tokenize
    d = cut_data(d, 'translations')

    # Compute pairwise similarities. TfidfVectorizer L2-normalizes each row
    # by default, so the plain dot product from linear_kernel is the cosine
    # similarity.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(d['translations'])
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    # title -> row label. The labels are used as row positions, which holds
    # for the default RangeIndex that read_csv creates here.
    indices = pd.Series(d.index, index=d['title'])
    # Keep the first row for a repeated title so a lookup returns one value.
    # (Series.drop_duplicates() would compare the values, not the titles.)
    indices = indices[~indices.index.duplicated(keep='first')]

    # Show the most similar entries
    print(get_recommendations(d, indices, '行宫', cosine_sim)[['id', 'title', 'sim']])
结果
id title sim
89 90 贼平后送人北归 0.091439
35 36 宫词/宫中词 0.082179
72 73 乌衣巷 0.081442
85 86 归嵩山作 0.050939
6 7 竹里馆 0.048772