报错:AttributeError: ‘DataFrame’ object has no attribute ‘dtype’
错误原因:scikit-learn从0.20.1开始不再自动把DataFrame转换成numpy数组,所以需要显式传入numpy数组(例如df.values),养成更加规范的书写习惯
解决方法:将`pairwise_distances(df, metric="jaccard")`修改为`pairwise_distances(df.values, metric="jaccard")`
from sklearn.metrics import jaccard_similarity_score
修改为:from sklearn.metrics import jaccard_score
pandas:ix 、loc 、 iloc区别、.at、.iat和.get_value
.ix在新版本的pandas中已被移除,运行会报错:按标签取数据时需要修改为.loc,按位置取数据时需要修改为.iloc。上面的链接介绍了这几个函数的区别
案例1:基于用户的协同过滤
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.metrics.pairwise import pairwise_distances

users = ["User1", "User2", "User3", "User4", "User5"]
items = ["Item A", "Item B", "Item C", "Item D", "Item E"]

# User purchase records: one row per user, one column per item
# (1 = purchased, 0 = not purchased).
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]
df = pd.DataFrame(datasets, columns=items, index=users)
# Resulting purchase matrix:
#        Item A  Item B  Item C  Item D  Item E
# User1       1       0       1       1       0
# User2       1       0       0       1       1
# User3       1       0       1       0       0
# User4       0       1       0       1       1
# User5       1       1       1       0       1

# Pairwise Jaccard similarity between users (1 - Jaccard distance).
# The Jaccard metric is defined on boolean vectors; casting explicitly avoids
# the DataConversionWarning scikit-learn emits for an integer 0/1 matrix.
user_similar = 1 - pairwise_distances(df.values.astype(bool), metric="jaccard")
user_similar = pd.DataFrame(user_similar, columns=users, index=users)
print("用户之间的两两相似度:")
print(user_similar)

# For every user, pick the two most similar other users.
topN_users = {}
for i in user_similar.index:
    # Drop the user's own entry (self-similarity is always 1) before ranking.
    _df = user_similar.loc[i].drop([i])
    # A stable sort keeps tied users in their original order, so the top-2
    # choice is reproducible (User5 has a three-way tie for second place).
    _df_sorted = _df.sort_values(ascending=False, kind="mergesort")
    top2 = list(_df_sorted.index[:2])
    topN_users[i] = top2
print("Top2相似用户:")
pprint(topN_users)

# Build the recommendations: everything the similar users bought, minus what
# the target user already owns. A set removes duplicates automatically.
rs_results = {}
for user, sim_users in topN_users.items():
    rs_result = set()
    for sim_user in sim_users:
        sim_row = df.loc[sim_user]
        rs_result |= set(sim_row[sim_row != 0].index)
    # Filter out the items the user has already purchased.
    own_row = df.loc[user]
    rs_result -= set(own_row[own_row != 0].index)
    rs_results[user] = rs_result
print("最终推荐结果:")
pprint(rs_results)
案例2:基于协同过滤的电影推荐
import os

import numpy as np
import pandas as pd
def load_data(data_path, cache_path=None):
    """Load the user-movie rating matrix, reusing a pickle cache when present.

    Args:
        data_path: Path to the ratings CSV. Only its first three columns are
            read; they must be named ``userId``, ``movieId`` and ``rating``.
        cache_path: Pickle file used to cache the pivoted matrix. When omitted
            it defaults to ``ratings_matrix.cache`` inside the module-level
            ``CACHE_DIR``.
            NOTE(review): the previous body read ``cache_path`` without ever
            defining it; the default file name was chosen to match the other
            ``*.cache`` files in this module -- confirm.

    Returns:
        A DataFrame indexed by ``userId`` with one column per ``movieId``.
        Each cell holds the rating (the mean if a pair occurs more than once)
        and is NaN where the user has not rated the movie.
    """
    if cache_path is None:
        cache_path = os.path.join(CACHE_DIR, "ratings_matrix.cache")

    print("开始加载数据集...")
    if os.path.exists(cache_path):
        # Only load cache files produced by this function: unpickling data
        # from an untrusted source can execute arbitrary code.
        print("加载缓存中...")
        ratings_matrix = pd.read_pickle(cache_path)
        print("从缓存加载数据集完毕")
    else:
        print("加载新数据中...")
        # Column types for the fields that are loaded.
        dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
        # Only the first three columns are needed: user id, movie id, rating.
        ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        # Pivot into a user x movie rating matrix (movie ids become columns).
        ratings_matrix = ratings.pivot_table(
            index=["userId"], columns=["movieId"], values="rating"
        )
        # Create the cache directory first so that to_pickle cannot fail on a
        # fresh checkout; a bare file name has no directory part to create.
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        ratings_matrix.to_pickle(cache_path)
        print("数据集加载完毕")
    return ratings_matrix
def compute_pearson_similarity(ratings_matrix, based="user", cache_dir=None):
    """Compute (or load from cache) a Pearson similarity matrix.

    Args:
        ratings_matrix: DataFrame with users as rows and items as columns.
        based: ``"user"`` for user-user similarity, ``"item"`` for item-item
            similarity.
        cache_dir: Directory holding the ``user_similarity.cache`` /
            ``item_similarity.cache`` pickle files. Defaults to the
            module-level ``CACHE_DIR``.

    Returns:
        A square DataFrame of pairwise Pearson correlation coefficients.

    Raises:
        ValueError: If ``based`` is neither ``"user"`` nor ``"item"``.
    """
    # Validate first so a bad argument fails before any path or I/O work.
    if based == "user":
        cache_name = "user_similarity.cache"
        load_msg = "正从缓存加载用户相似度矩阵"
        compute_msg = "开始计算用户相似度矩阵"
    elif based == "item":
        cache_name = "item_similarity.cache"
        load_msg = "正从缓存加载物品相似度矩阵"
        compute_msg = "开始计算物品相似度矩阵"
    else:
        # ValueError is still caught by callers that handle ``Exception``.
        raise ValueError("Unhandled 'based' Value: %s" % (based,))

    if cache_dir is None:
        cache_dir = CACHE_DIR
    cache_path = os.path.join(cache_dir, cache_name)

    if os.path.exists(cache_path):
        # Only load cache files produced by this function: unpickling data
        # from an untrusted source can execute arbitrary code.
        print(load_msg)
        similarity = pd.read_pickle(cache_path)
    else:
        print(compute_msg)
        # DataFrame.corr() correlates columns pairwise (Pearson by default),
        # so the matrix is transposed to compare users instead of items.
        source = ratings_matrix.T if based == "user" else ratings_matrix
        similarity = source.corr()
        # Make sure the cache directory exists before writing into it.
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        similarity.to_pickle(cache_path)

    print("相似度矩阵计算/加载完毕")
    return similarity
if __name__ == "__main__":
    # Build the user-movie rating matrix (or load it from the cache).
    ratings_matrix = load_data(DATA_PATH)

    # User-user similarity.
    user_similar = compute_pearson_similarity(ratings_matrix, based="user")
    print(user_similar)

    # Item-item similarity.
    item_similar = compute_pearson_similarity(ratings_matrix, based="item")
    print(item_similar)