本文为大学生创新创业项目研究内容,仅供交流学习之用,请勿转载,请勿商用。
疫情后旅游业发展迅速。随着互联网在旅游领域的应用日益广泛,研究发现,游客在网络评论中表达的对景点的真实体验与感受,是挖掘其偏好的重要途径。结合游客评论信息挖掘游客偏好,以响应用户实时、连续、个性化的服务需求,已成为当下研究的热点。潜在狄利克雷分配(LDA)主题模型可用于识别大规模文档集中潜藏的主题信息,在主题建模领域应用广泛。本研究通过搭建LDA游客偏好模型形成推荐算法,通过研究游客的偏好,为游客提供个性化的信息服务。研究主要工作分为以下三个方面:一、对南京知名景点进行分类,通过调整主题数量找到景点的最佳分类个数,使不同类别的景点之间差异较大。二、搭建LDA模型,编程实现两种不同的算法,对游客做出有针对性的景点推荐。三、实现交互页面和软件,通过评分算法测试不同算法的性能,确定最终的推荐算法。本研究结果为旅游服务网站提供了优化方向,并能满足游客个性化推荐的需求,具有实践意义和经济价值。
代码如下:
import os
import numpy as np
import pandas as pd
import re
import jieba
import jieba.posseg as psg
# region File paths
# NOTE: statement order matters here. The first workbook is read relative to
# `file_path`, after which the working directory is switched to `output_path`.
# Directory that becomes the working directory once the inputs are loaded.
output_path = r'D:\PycharmProjects\pythonProject\homework\srtp\LDA\result'
# Input directory, resolved relative to the directory the script starts in.
file_path = 'data'
os.chdir(file_path)
# Survey workbook; in this file it is only referenced by the commented-out
# scoring section.
df = pd.read_excel('南京旅游景点偏好调查.xlsx')
# Review workbook; the recommender reads its 'attraction' and 'content' columns.
data=pd.read_excel(r"C:\Users\Lenovo\Desktop\携程网评论1.xlsx", engine='openpyxl')#content type
# From here on relative paths resolve inside output_path, so the recommender's
# relative "分类结果.xlsx" export is presumably written there (assuming nothing
# else changes the working directory -- TODO confirm).
os.chdir(output_path)
# User dictionary handed to jieba.load_userdict.
dic_file = r"D:\PycharmProjects\pythonProject\homework\srtp\LDA\保留词.txt"
# Stop-word list, opened as UTF-8 text with one entry per line.
stop_file = r"D:\PycharmProjects\pythonProject\homework\srtp\LDA\停用词.txt"
# endregion
# Matches every character outside the CJK range kept by the tokenizer.
_NON_CHINESE = re.compile('[^\u4e00-\u9fa5]')


def _load_stop_words(path):
    """Return the stop words stored one per line in *path* as a set.

    An unreadable file is reported on stdout and treated as an empty list,
    which keeps the original best-effort behaviour.
    """
    try:
        with open(path, encoding='utf-8') as handle:
            return {line.rstrip('\r\n') for line in handle}
    except OSError:
        print("error in stop_file")
        return set()


def _cut_chinese_words(text, stop_words, keep_flags=('n', 'nz', 'vn')):
    """Segment *text* with jieba and return the kept words joined by spaces.

    A word is kept when, after removing non-Chinese characters, it has at
    least two characters, is not a stop word, and its part-of-speech flag is
    one of *keep_flags*.
    """
    kept = []
    for token in psg.cut(str(text)):
        word = _NON_CHINESE.sub('', token.word)
        # The length filter applies even when the stop-word list is empty.
        if len(word) < 2 or word in stop_words:
            continue
        if token.flag in keep_flags:
            kept.append(word)
    return " ".join(kept)


def print_top_words(model, feature_names, n_top_words):
    """Return, for each topic of *model*, its top words joined by spaces.

    Inspection helper for the fitted topics; the recommendation itself does
    not depend on it.
    """
    return [
        " ".join(feature_names[i] for i in weights.argsort()[:-n_top_words - 1:-1])
        for weights in model.components_
    ]


def accomodation(visited_spots, n_topics=7, n_features=1000):
    """Recommend attractions that share the visitor's dominant LDA topic.

    The review texts of the visited attractions are concatenated into one
    pseudo-document labelled 'jindian'. That document is appended to a copy
    of the module-level ``data`` frame, the whole corpus is segmented and
    vectorised, and an LDA model is fitted on it. Attractions whose most
    probable topic equals the pseudo-document's topic are returned.

    Args:
        visited_spots: iterable of attraction names the visitor has been to.
            Names missing from ``data['attraction']`` are ignored.
        n_topics: number of LDA topics.
        n_features: maximum vocabulary size for the count vectoriser.

    Returns:
        A list of recommended attraction names, excluding the visited ones.
        The list is empty when none of the given names is known.

    Side effects:
        Writes the per-attraction topic assignment to "分类结果.xlsx" in the
        current working directory. The module-level ``data`` frame is left
        unmodified.
    """
    visited = [s.strip() if isinstance(s, str) else s for s in visited_spots]

    # The first review text per attraction is used, as a first-match lookup.
    first_content = {}
    for name, content in zip(data['attraction'], data['content']):
        first_content.setdefault(name, content)

    known = [s for s in visited if s in first_content]
    if not known:
        # Without any known attraction there is no preference signal.
        return []
    pinluns = ''.join(str(first_content[s]) for s in known)

    # Append the pseudo-document as a real row of a fresh frame, so repeated
    # calls never accumulate rows in the shared ``data``.
    user_row = pd.DataFrame([{'attraction': 'jindian', 'content': pinluns}])
    corpus = pd.concat([data, user_row], ignore_index=True)

    # region Text preprocessing
    # The dictionary and the stop words are loaded once per call instead of
    # once per document.
    jieba.load_userdict(dic_file)
    jieba.initialize()
    stop_words = _load_stop_words(stop_file)
    corpus["content_cutted"] = corpus['content'].apply(
        lambda text: _cut_chinese_words(text, stop_words)
    )
    # endregion

    # region Term-frequency vectors
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    stop_words='english',
                                    max_df=0.5,
                                    min_df=2)
    tf = tf_vectorizer.fit_transform(corpus['content_cutted'])
    # endregion

    # region Topic model
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,
                                    random_state=0)
    lda.fit(tf)
    topics = lda.transform(tf)
    # endregion

    # region Recommendation
    corpus['概率最大的主题序号'] = [f"Topic #{k}" for k in topics.argmax(axis=1)]
    corpus['每个主题对应概率'] = list(topics)

    # The last row is the visitor's pseudo-document; all others are attractions.
    user_topic = corpus['概率最大的主题序号'].iloc[-1]
    attractions = corpus.iloc[:-1]
    attractions.to_excel("分类结果.xlsx", index=False)

    same_topic = attractions[attractions['概率最大的主题序号'] == user_topic]
    already_seen = set(known)
    tuijian = [
        name for name in same_topic['attraction']
        if name not in already_seen and name != 'jindian'
    ]
    return tuijian
    # endregion
# region 打分
# tot_score = 0;
# scores = {}
# for index, row in df.iterrows():
# # 获取每个人前 n-1 个景点的访问历史
# new_list = row.tolist()
# while np.nan in new_list:
# new_list.remove(np.nan)
# visited_spots = new_list[:-1]
#
# print(visited_spots)
# # 获取accomodation函数的推荐景点
# recommended_spots = accomodation(visited_spots)
#
# # 获取每个人实际访问的第 n 个景点
# actual_spot = new_list[-1]
#
# # 计算评分
# score = 1 if actual_spot in recommended_spots else 0
# # 存储分数
# scores[index] = score
# # 输出每个人的分数
# for index, score in scores.items():
# tot_score += score
# print(tot_score)
# print(f'Person {index + 1}: Score - {score}')
# accuracy = ((tot_score / 125) * 100)
# print(accuracy)
#endregion
# region Interactive UI
import PySimpleGUI as sg

layout = [
    [sg.Text("欢迎来到南京市景点推荐系统,请输入您已经去过的景点名及偏好")],
    [sg.Text("景点名"), sg.InputText("例如:牛首山文化旅游区,夫子庙,红山森林动物园")],
    [sg.Button('确定'), sg.Button('取消')]
]
window = sg.Window('南京市景点推荐系统', layout)
try:
    while True:
        event, values = window.read()
        if event in (sg.WINDOW_CLOSED, 'Exit', '取消'):
            break
        if event != "确定":
            continue

        raw_text = values[0] or ''
        # Drop the pre-filled "例如:" hint so it is not parsed as part of a name.
        raw_text = re.sub('^\\s*例如[:\uff1a]', '', raw_text)
        # Accept ASCII commas as well as full-width and enumeration commas,
        # and ignore surrounding whitespace and empty entries.
        jingdian1 = [
            part.strip()
            for part in re.split('[,\uff0c\u3001]', raw_text)
            if part.strip()
        ]
        print(jingdian1)
        if not jingdian1:
            sg.popup("请至少输入一个景点名", title="推荐结果")
            continue

        try:
            result_list = accomodation(jingdian1)
        except ValueError as err:
            # Keep the window alive when the recommender rejects the input,
            # e.g. because an attraction name is unknown.
            sg.popup(f"无法生成推荐:{err}", title="推荐结果")
            continue

        if not result_list:
            sg.popup("暂无可推荐的景点,请检查输入的景点名", title="推荐结果")
            continue
        result_str = ",".join(result_list)
        sg.popup(result_str, title="推荐结果")
finally:
    # Release the window even if an unexpected error escapes the loop.
    window.close()
# endregion