1.加载数据
def get_movie_dataset3():
    """Load the movies and attach a combined genre + user-tag list to each.

    Reads ``../data/ml-latest-small/all-tags.csv`` (only the columns at
    positions 1 and 2, which must include ``movieId``) and
    ``../data/ml-latest-small/movies.csv`` (indexed by ``movieId``).

    Returns:
        pandas.DataFrame indexed by ``movieId`` with columns ``title``,
        ``genres`` (the ``|``-separated genre string split into a list) and
        ``tags`` (the genres followed by every user tag for that movie).
        A movie without any user tag still gets its genres as ``tags``.
    """
    # Rows with a missing value are dropped before the tags are grouped.
    raw_tags = pd.read_csv(
        "../data/ml-latest-small/all-tags.csv", usecols=range(1, 3)
    ).dropna()
    tags = raw_tags.groupby("movieId").agg(list)

    movies = pd.read_csv("../data/ml-latest-small/movies.csv", index_col="movieId")
    movies["genres"] = movies["genres"].apply(lambda genres: genres.split("|"))

    # DataFrame.join is a left join on the index, so tags for unknown movies
    # are ignored and movies without tags get a missing value.
    joined = movies.join(tags)

    rows = []
    for movie_id, title, genres, user_tags in joined.itertuples():
        if isinstance(user_tags, list):
            combined = genres + user_tags
        else:
            # No user tags (missing value after the join): keep the genres
            # instead of discarding them, so the movie still has keywords.
            combined = list(genres)
        rows.append((movie_id, title, genres, combined))

    movie_dataset = pd.DataFrame(
        rows, columns=["movieId", "title", "genres", "tags"]
    )
    return movie_dataset.set_index("movieId")
2.提取用户观看列表
def create_user_profile3():
    """Build an interest-word profile for every user from their rated movies.

    Reads the first two columns (``userId``, ``movieId``) of
    ``../data/ml-latest-small/ratings.csv``, groups the movie ids per user,
    and counts the profile words of every movie each user has a record for.

    Returns:
        dict mapping each user id to a list of ``(word, weight)`` pairs: the
        up-to-50 most frequent words, with ``weight`` being the word's count
        divided by the highest count and rounded to 4 decimals. A user for
        whom no word was collected maps to an empty list.
    """
    watch_record = pd.read_csv(
        "../data/ml-latest-small/ratings.csv",
        usecols=range(2),
        dtype={"userId": np.int32, "movieId": np.int32},
    )
    watch_record = watch_record.groupby("userId").agg(list)

    movie_dataset = get_movie_dataset3()
    # NOTE(review): create_movie_profile is defined elsewhere; it is used here
    # as a frame indexed by movie id with a "profile" column of word
    # iterables -- confirm against its definition.
    movie_profile = create_movie_profile(movie_dataset)

    user_profile = {}
    for uid, mids in watch_record.itertuples():
        record_movie_profile = movie_profile.loc[list(mids)]
        # Count words in one pass; repeatedly concatenating lists with
        # reduce() copies the growing list for every movie.
        counter = collections.Counter(
            word
            for profile in record_movie_profile["profile"].values
            for word in profile
        )
        interest_words = counter.most_common(50)
        if not interest_words:
            # Nothing to normalise against; avoid indexing an empty list.
            user_profile[uid] = []
            continue
        # most_common() sorts by count, so the first entry holds the maximum.
        maxcount = interest_words[0][1]
        user_profile[uid] = [
            (word, round(count / maxcount, 4)) for word, count in interest_words
        ]
    return user_profile
3.为用户产生TOP-N推荐结果
# Build the interest-word profile of every user.
user_profile3 = create_user_profile3()
# Bare expression: displays the profiles when this is the last statement of a
# notebook cell; it has no effect when run as a plain script.
user_profile3
for uid, interest_words in user_profile3.items():
    # movie id -> list of the user's interest weights for every interest word
    # that leads to that movie through the inverted table.
    result_table = {}
    for interest_word, interest_weight in interest_words:
        # inverted_table is defined elsewhere; each entry is unpacked as a
        # (movie id, weight) pair. Only the user-side weight is scored, the
        # movie-side weight is ignored.
        for mid, related_weight in inverted_table[interest_word]:
            result_table.setdefault(mid, []).append(interest_weight)
    # Score each movie by the sum of its collected weights; keep the top 100.
    rs_result = [(mid, sum(weights)) for mid, weights in result_table.items()]
    rs_result.sort(key=lambda scored: scored[1], reverse=True)
    rs_result = rs_result[:100]
    print(uid)
    print(rs_result)
    # Only the first user is processed.
    break