Undergraduate Innovation and Entrepreneurship Project: Tourist Topic Preferences and Personalized Recommendation Based on the LDA Model

This article looks at the post-pandemic tourism industry and shows how internet data and the LDA topic model can be used to analyze tourist reviews and mine preferences, proposing a personalized attraction recommendation algorithm. The study classifies attractions, builds an LDA model, and tests recommendation performance, offering optimization directions and personalized service for tourism platforms.

This article presents research from an undergraduate innovation and entrepreneurship project. It is shared for learning and exchange only; please do not repost it or use it commercially.

Tourism has rebounded quickly since the pandemic, and as internet applications spread through the industry, research has found that online reviews describing tourists' real experiences at attractions are an important signal from which their preferences can be mined. Mining those reviews to deliver real-time, continuous, personalized service to users has become a current research hotspot. Latent Dirichlet Allocation (LDA) is a topic model that identifies the latent topics hidden in large document collections and is widely used for topic modeling. This study builds an LDA-based model of tourist preferences and a recommendation algorithm on top of it, using the mined preferences to provide tourists with personalized information services. The work covers three parts. First, classify well-known Nanjing attractions, adjusting the number of topics to find the number of categories that best separates the different classes. Second, build the LDA model and implement two different algorithms that make targeted attraction recommendations to a tourist. Third, build an interactive page and application, score the algorithms' performance with an evaluation procedure, and select the final recommendation algorithm. The results suggest optimization directions for tourism service websites and meet tourists' demand for personalized recommendation, with practical and economic value.
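
Before the full program, here is a minimal, self-contained sketch of the core idea; the three pre-segmented toy reviews and the topic count are purely illustrative, not project data. LDA is fit on bag-of-words counts, and transform then yields each document's topic-probability distribution, which is what the recommender compares.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Toy, pre-segmented reviews (illustrative only)
docs = ["寺庙 古迹 城墙", "动物 熊猫 园区", "寺庙 佛像 香火"]
tf = CountVectorizer().fit_transform(docs)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(tf)
print(lda.transform(tf))  # one row of topic probabilities per document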

The full project code is as follows:

import os
import numpy as np
import pandas as pd
import re
import jieba
import jieba.posseg as psg

# region File paths
output_path = r'D:\PycharmProjects\pythonProject\homework\srtp\LDA\result'
file_path = 'data'
os.chdir(file_path)
df = pd.read_excel('南京旅游景点偏好调查.xlsx')  # survey of visit histories, one respondent per row
data = pd.read_excel(r"C:\Users\Lenovo\Desktop\携程网评论1.xlsx", engine='openpyxl')  # Ctrip reviews: attraction, content columns
os.chdir(output_path)
dic_file = r"D:\PycharmProjects\pythonProject\homework\srtp\LDA\保留词.txt"  # user dictionary of words to keep
stop_file = r"D:\PycharmProjects\pythonProject\homework\srtp\LDA\停用词.txt"  # stopword list
# endregion

def accomodation(visited_spots):
    # Concatenate the reviews of every attraction the user has visited into
    # one pseudo-document, append it to a copy of the review table, and let
    # LDA assign it a topic alongside the real attractions.
    jingdian = visited_spots
    corpus = data.copy()  # work on a copy so repeated calls do not pollute the global frame
    attraction = list(corpus['attraction'])
    pinluns = ''
    for jing in jingdian:
        pinlun = str(corpus.loc[attraction.index(jing), 'content'])
        pinluns = pinluns + pinlun
    new_row = pd.DataFrame([{'attraction': 'jindian', 'content': pinluns}])
    corpus = pd.concat([corpus, new_row], ignore_index=True)

    # region Text preprocessing
    jieba.load_userdict(dic_file)  # load the keep-word user dictionary once
    jieba.initialize()
    try:
        with open(stop_file, encoding='utf-8') as f:
            stop_list = [line.strip() for line in f]
    except OSError:
        stop_list = []
        print("error in stop_file")

    def chinese_word_cut(mytext):
        # Segment with jieba POS tagging; keep nouns (n), other proper
        # nouns (nz) and verbal nouns (vn); drop stopwords, non-Chinese
        # characters and single-character tokens.
        flag_list = ['n', 'nz', 'vn']
        word_list = []
        for seg_word in psg.cut(mytext):
            word = re.sub(u'[^\u4e00-\u9fa5]', '', seg_word.word)
            if len(word) < 2 or word in stop_list:
                continue
            if seg_word.flag in flag_list:
                word_list.append(word)
        return " ".join(word_list)

    corpus["content_cutted"] = corpus.content.apply(chinese_word_cut)

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    def print_top_words(model, feature_names, n_top_words):
        # Collect the n_top_words highest-weighted words of each topic.
        tword = []
        for topic_idx, topic in enumerate(model.components_):
            topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
            tword.append(topic_w)
        return tword
    # endregion

    # region Term-frequency vectors
    n_features = 1000  # keep the 1000 most frequent candidate feature words
    # Stopwords were already removed during preprocessing, so no extra
    # stop_words argument is needed here.
    tf_vectorizer = CountVectorizer(max_features=n_features,
                                    max_df=0.5,  # drop words appearing in over half the documents
                                    min_df=2)    # drop words appearing in fewer than two documents
    tf = tf_vectorizer.fit_transform(corpus.content_cutted)

    # endregion

    # region Topic inference
    n_topics = 7  # number of attraction categories, chosen by tuning (see the sketch below)
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                    learning_method='batch',
                                    random_state=0)
    lda.fit(tf)

    n_top_words = 25
    tf_feature_names = tf_vectorizer.get_feature_names_out()
    topic_word = print_top_words(lda, tf_feature_names, n_top_words)
    # endregion
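
    # The abstract's first step tunes the topic count so that categories
    # separate well; that tuning is not shown in the original listing.
    # A hypothetical sketch: sweep candidate counts and compare model
    # perplexity on tf (lower is better), then fix n_topics accordingly.
    # for k in range(3, 12):
    #     cand = LatentDirichletAllocation(n_components=k, max_iter=50,
    #                                      learning_method='batch',
    #                                      random_state=0)
    #     cand.fit(tf)
    #     print(k, cand.perplexity(tf))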

    # region Recommend attractions
    topics = lda.transform(tf)  # document-topic distribution, one row per attraction
    topic = []
    for t in topics:
        topic.append("Topic #" + str(int(np.argmax(t))))
    corpus['概率最大的主题序号'] = topic       # index of the most probable topic
    corpus['每个主题对应概率'] = list(topics)  # full topic distribution
    # The last row is the user's pseudo-document; recommend every attraction
    # that shares its dominant topic and has not been visited yet.
    w = corpus[corpus['概率最大的主题序号'] == topic[-1]]
    tuijian = []
    for jing in list(w['attraction']):
        if jing in jingdian or jing == 'jindian':
            continue
        tuijian.append(jing)
    corpus.to_excel("分类结果.xlsx", index=False)

    return tuijian
    # endregion
# region Scoring (offline evaluation, kept commented out)
# tot_score = 0
# scores = {}
# for index, row in df.iterrows():
#     # Visit history: all but the last attraction the respondent visited
#     new_list = row.dropna().tolist()
#     visited_spots = new_list[:-1]
#     print(visited_spots)
#     # Recommendations based on that history
#     recommended_spots = accomodation(visited_spots)
#     # The held-out attraction the respondent actually visited next
#     actual_spot = new_list[-1]
#     # Score 1 if the held-out attraction was recommended, else 0
#     scores[index] = 1 if actual_spot in recommended_spots else 0
# # Report per-person scores and the overall hit rate
# for index, score in scores.items():
#     tot_score += score
#     print(f'Person {index + 1}: Score - {score}')
# accuracy = tot_score / len(scores) * 100
# print(accuracy)
# endregion

# region GUI
import PySimpleGUI as sg

layout = [
    [sg.Text("欢迎来到南京市景点推荐系统,请输入您已经去过的景点名及偏好")],
    [sg.Text("景点名"), sg.InputText("例如:牛首山文化旅游区,夫子庙,红山森林动物园")],
    [sg.Button('确定'), sg.Button('取消')]
]
window = sg.Window('南京市景点推荐系统', layout)
while True:
    event, values = window.read()
    if event in (sg.WINDOW_CLOSED, 'Exit', '取消'):
        break
    if event == "确定":
        # Accept both the Chinese and the ASCII comma as separators
        jingdian1 = [s.strip() for s in re.split('[,,]', values[0]) if s.strip()]
        print(jingdian1)
        result_list = accomodation(jingdian1)
        sg.popup(",".join(result_list), title="推荐结果")
window.close()
# endregion
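
The abstract describes implementing and comparing two different recommendation algorithms, but the listing above contains only the hard topic-match version, which recommends every attraction sharing the user's most probable topic. Below is a minimal sketch of what the second, similarity-based variant could look like; it is an assumption rather than the project's actual second algorithm, and the name recommend_by_similarity and the top_n parameter are hypothetical. It expects the document-topic matrix produced by lda.transform(tf) inside accomodation, with the user's pseudo-document as the last row.

import numpy as np

def recommend_by_similarity(topics, attractions, visited, top_n=5):
    # Hypothetical sketch (not from the original project). Rank attractions
    # by cosine similarity between each attraction's topic distribution and
    # the user's, instead of requiring an exact dominant-topic match.
    user_vec = topics[-1]
    ranked = []
    for name, vec in zip(attractions, topics[:-1]):
        if name in visited or name == 'jindian':
            continue
        cos = float(np.dot(user_vec, vec) /
                    (np.linalg.norm(user_vec) * np.linalg.norm(vec)))
        ranked.append((cos, name))
    ranked.sort(reverse=True)
    return [name for _, name in ranked[:top_n]]

Unlike the dominant-topic match, this variant always returns a fixed-size ranked list, which makes the hit-rate scoring above easier to compare across the two algorithms.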
