基于内容的电影推荐：为用户产生TOP-N推荐结果

最新推荐文章于 2024-01-09 14:39:32 发布

IT瘾君

最新推荐文章于 2024-01-09 14:39:32 发布

阅读量1k

点赞数

分类专栏：推荐系统-大数据文章标签： python 机器学习数据挖掘

本文链接：https://blog.csdn.net/u012441595/article/details/122322717

版权

推荐系统-大数据专栏收录该内容

23 篇文章 0 订阅

订阅专栏

python编程快速上手（持续更新中…）

1.加载数据

def get_movie_dataset3():
    # 加载基于所有电影的标签
    # all-tags.csv来自ml-latest数据集中
    # 由于ml-latest-small中标签数据太多，因此借助其来扩充
    _tags = pd.read_csv("../data/ml-latest-small/all-tags.csv", usecols=range(1, 3)).dropna()
    tags = _tags.groupby("movieId").agg(list)

    # 加载电影列表数据集
    movies = pd.read_csv("../data/ml-latest-small/movies.csv", index_col="movieId")
    # 将类别词分开
    movies["genres"] = movies["genres"].apply(lambda x: x.split("|"))
    # 为每部电影匹配对应的标签数据，如果没有将会是NAN
    movies_index = set(movies.index) & set(tags.index)
    new_tags = tags.loc[list(movies_index)]
    ret = movies.join(new_tags)

    # 构建电影数据集，包含电影Id、电影名称、类别、标签四个字段
    # 如果电影没有标签数据，那么就替换为空列表
    # map(fun,可迭代对象)
    movie_dataset = pd.DataFrame(
        map(
            lambda x: (x[0], x[1], x[2], x[2]+x[3]) if x[3] is not np.nan else (x[0], x[1], x[2], []), ret.itertuples())
        , columns=["movieId", "title", "genres","tags"]
    )

    movie_dataset.set_index("movieId", inplace=True)
    return movie_dataset

2.提取用户观看列表

def create_user_profile3():
    watch_record = pd.read_csv("../data/ml-latest-small/ratings.csv", usecols=range(2), dtype={"userId":np.int32, "movieId": np.int32})

    watch_record = watch_record.groupby("userId").agg(list)
    # print(watch_record)

    movie_dataset = get_movie_dataset3()
    movie_profile = create_movie_profile(movie_dataset)

    user_profile = {}
    for uid, mids in watch_record.itertuples():
        record_movie_prifole = movie_profile.loc[list(mids)]
        counter = collections.Counter(reduce(lambda x, y: list(x)+list(y), record_movie_prifole["profile"].values))
        # 取出出现次数最多的前50个词
        interest_words = counter.most_common(50)
        # 取出出现次数最多的词 出现的次数
        maxcount = interest_words[0][1]
        # 利用次数计算权重 出现次数最多的词权重为1
        interest_words = [(w,round(c/maxcount, 4)) for w,c in interest_words]
        user_profile[uid] = interest_words

    return user_profile

3.为用户产生TOP-N推荐结果

user_profile3 = create_user_profile3()
user_profile3

for uid, interest_words in user_profile3.items():
    result_table = {} # 电影id:[0.2,0.5,0.7]
    for interest_word, interest_weight in interest_words:
        related_movies = inverted_table[interest_word]
        for mid, related_weight in related_movies:
            _ = result_table.get(mid, [])
            _.append(interest_weight)    # 只考虑用户的兴趣程度
            # _.append(related_weight)    # 只考虑兴趣词与电影的关联程度
            # _.append(interest_weight*related_weight)    # 二者都考虑
            result_table.setdefault(mid, _)

    rs_result = map(lambda x: (x[0], sum(x[1])), result_table.items())
    rs_result = sorted(rs_result, key=lambda x:x[1], reverse=True)[:100]
    print(uid)
    print(rs_result)
    break