参考文章:
用户推荐协同过滤算法(UserCF)的R实现
上述文章有部分错误,在此处已经修正。
- 数据源如下,命名为 testCF.csv,放在我的桌面,路径:C:/Users/Administrator/Desktop/。
A11111 101 5
A11111 102 3
A11111 103 2.5
A22222 101 2
A22222 102 2.5
A22222 103 5
A22222 104 2
A33333 101 2.5
A33333 104 4
A33333 105 4.5
A33333 107 5
A44444 101 5
A44444 103 3
A44444 104 4.5
A44444 106 4
A55555 101 4
A55555 102 3
A55555 103 2
A55555 104 4
A55555 105 3.5
A55555 106 4
- 代码如下:
运行完毕后直接执行 UserBasedRecommender("A11111"),参数为用户id,即输入用户id即可得到若干推荐。UserBasedRecommender 共有4个参数:UserBasedRecommender(uid, data=.., n=2, N=3)。
其中,uid为用户id,data为数据集,n代表最终推荐的商品数,N代表与目标用户距离最近的N个用户。
# Load the ratings file. read.csv() expects comma-separated fields; the file
# has no header row, so header = FALSE keeps the first record as data.
org_data <- read.csv(
  'C:\\Users\\Administrator\\Desktop\\testCF.csv',
  header = FALSE
)
# 1) Convert the raw ratings into a user x item rating matrix.
#
# org_data: data frame whose first three columns are, in order, the user id,
#   the item id and the numeric rating.
#
# Returns a numeric matrix with one row per distinct user and one column per
# distinct item. Rows and columns are in sorted id order and are labelled
# with the ids (as character). Cells for (user, item) pairs that do not occur
# in org_data are NA. If a pair occurs more than once, the last rating wins.
#
# Implemented with base R only, so it does not depend on reshape2 having been
# attached by the caller. Row labels are derived from the same sorted keys
# that place the values, so they stay correct for unsorted input.
FileDataModel <- function(org_data) {
  stopifnot(is.data.frame(org_data), ncol(org_data) >= 3L)

  uid <- org_data[[1L]]
  iid <- org_data[[2L]]
  pref <- org_data[[3L]]
  stopifnot(is.numeric(pref))

  users <- sort(unique(uid))
  items <- sort(unique(iid))

  M <- matrix(
    NA_real_,
    nrow = length(users),
    ncol = length(items),
    dimnames = list(as.character(users), as.character(items))
  )
  # Two-column matrix indexing writes each rating into its (user, item) cell.
  M[cbind(match(uid, users), match(iid, items))] <- pref
  M
}
#M = FileDataModel(org_data)
# 101 102 103 104 105 106 107
# 1 5.0 3.0 2.5 NA NA NA NA
# 2 2.0 2.5 5.0 2.0 NA NA NA
# 3 2.5 NA NA 4.0 4.5 NA 5
# 4 5.0 NA 3.0 4.5 NA 4 NA
# 5 4.0 3.0 2.0 4.0 3.5 4 NA
#原文是把NA替换为0,但是考虑到后面计算相似性的方便,这里把它保留了。
# 2) Euclidean-distance similarity between users.
#
# M: numeric user x item rating matrix; NA marks an item the user has not
#   rated.
#
# Returns an nrow(M) x nrow(M) symmetric matrix S without dimnames, where for
# x != y
#   S[x, y] = min(1, overlap / (1 + distance))
# overlap  = number of items for which both users have a non-missing,
#            non-zero rating;
# distance = Euclidean distance over the items both users have a non-missing
#            rating for.
# The diagonal is always 0, so a user never counts as similar to themselves.
# NOTE(review): a rating of exactly 0 is excluded from `overlap` but still
# contributes to `distance`; this looks like a leftover from a zero-filled
# matrix representation and is kept as-is - confirm whether 0 is a valid rating.
EuclideanDistanceSimilarity <- function(M) {
  n_users <- nrow(M)
  S <- matrix(0, nrow = n_users, ncol = n_users)
  if (n_users < 2L) {
    return(S)
  }

  # usable[u, i] is TRUE when user u has a non-missing, non-zero rating for i.
  usable <- !is.na(M) & M != 0

  # The measure is symmetric: compute the upper triangle once and mirror it.
  for (x in seq_len(n_users - 1L)) {
    for (y in (x + 1L):n_users) {
      overlap <- sum(usable[x, ] & usable[y, ])
      distance <- sqrt(sum((M[x, ] - M[y, ])^2, na.rm = TRUE))
      similarity <- min(overlap / (1 + distance), 1)
      S[x, y] <- similarity
      S[y, x] <- similarity
    }
  }
  S
}
# S
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0.0000000 0.6076560 0.2857143 1.0000000 1.0000000
# [2,] 0.6076560 0.0000000 0.6532633 0.5568464 0.7761999
# [3,] 0.2857143 0.6532633 0.0000000 0.5634581 1.0000000
# [4,] 1.0000000 0.5568464 0.5634581 0.0000000 1.0000000
# [5,] 1.0000000 0.7761999 1.0000000 1.0000000 0.0000000
# 3) Nearest-neighbour selection.
#
# S: square user x user similarity matrix (larger value = more similar).
# n: number of neighbours wanted per user (single non-negative number).
#
# Returns an integer matrix with one row per user; row u holds the row indices
# of the users most similar to u, best first. Ties are broken by the lower
# index. A user is never listed as their own neighbour and no neighbour is
# repeated, so at most nrow(S) - 1 columns can be returned; a larger n is
# reduced with a warning. The result is always a matrix, also for n = 1.
NearestNUserNeighborhood <- function(S, n) {
  stopifnot(is.numeric(n), length(n) == 1L, !is.na(n), n >= 0)
  n_users <- nrow(S)
  available <- max(n_users - 1L, 0L)
  k <- as.integer(n)
  if (k > available) {
    warning(
      "Only ", available, " neighbour(s) available per user; n reduced.",
      call. = FALSE
    )
    k <- available
  }

  neighbours <- matrix(NA_integer_, nrow = n_users, ncol = k)
  for (u in seq_len(n_users)) {
    scores <- S[, u]
    scores[u] <- -Inf # push the user themselves to the very end of the ranking
    ranking <- order(-scores, seq_len(n_users))
    neighbours[u, ] <- ranking[seq_len(k)]
  }
  neighbours
}
# #取距离最近的两位用户
# NEIGHBORHOOD_NUM = 2
# N = NearestNUserNeighborhood(S,NEIGHBORHOOD_NUM)
# # N
# # [,1] [,2]
# # [1,] 4 5
# # [2,] 5 3
# # [3,] 5 2
# # [4,] 1 5
# # [5,] 1 3
# # 4). 推荐算法
# #RECOMMENDER_NUM是推荐产品的个数
# RECOMMENDER_NUM = 3
# UserBasedRecommender = function(uid,n,M,S,N){
# part = colnames(M)[is.na(M[uid,])] #用户A没有的商品
# m = S[uid, N[uid,]]
# md = M[N[uid,],part]
# if(length(dim(md)) >= 1) {
# a = colnames(md)[colSums(md,na.rm=TRUE)!=0]
# score = colSums(md*m,na.rm = T)/apply(!is.na(md),2,function(x) sum(m[x]))
# res = score[order(score,decreasing=T)][1:n]
# res = res[intersect(names(res),a)]
# return(res)
# } else { res = NA}
# }
# 4) User-based recommender.
#
# uid:  id of the user to recommend for; must match a row name of the rating
#       matrix built from `data`.
# data: ratings data set (user id, item id, rating); defaults to org_data.
# n:    maximum number of items to recommend.
# N:    number of nearest users to base the recommendation on.
#
# Returns a named numeric vector of predicted ratings (names are item ids),
# best first, with at most n entries. Only items the user has not rated and
# that at least one selected neighbour has rated are considered; the vector
# is empty when there is no such item. Each score is the similarity-weighted
# mean of the neighbours' ratings for that item.
UserBasedRecommender <- function(uid, data = org_data, n = 2, N = 3) {
  library(reshape2) # kept so dcast is available to FileDataModel if it needs it
  stopifnot(length(uid) == 1L)

  # Build everything from the `data` argument, not from the global org_data.
  M <- FileDataModel(data)
  S <- EuclideanDistanceSimilarity(M)
  neighbours <- NearestNUserNeighborhood(S, N)
  if (!is.matrix(neighbours)) {
    # Guard against a plain vector coming back when a single neighbour is asked.
    neighbours <- matrix(neighbours, nrow = nrow(M))
  }

  user_row <- match(as.character(uid), rownames(M))
  if (is.na(user_row)) {
    stop("Unknown user id: ", uid, call. = FALSE)
  }

  unrated <- colnames(M)[is.na(M[user_row, ])] # items the user has not rated
  peers <- neighbours[user_row, ]
  weights <- S[user_row, peers]

  # drop = FALSE keeps a matrix even with one neighbour or one unrated item.
  peer_ratings <- M[peers, unrated, drop = FALSE]
  has_rating <- !is.na(peer_ratings)

  # `weights` has one entry per row, so it recycles down each column.
  weighted_sum <- colSums(peer_ratings * weights, na.rm = TRUE)
  weight_total <- colSums(has_rating * weights)
  score <- weighted_sum / weight_total

  # Keep items that some neighbour gave a non-zero rating and whose score is
  # defined (zero total weight would give NaN).
  candidate <- colSums(peer_ratings, na.rm = TRUE) != 0 & !is.na(score)
  score <- score[candidate]

  ranked <- score[order(score, decreasing = TRUE)]
  head(ranked, n)
}
上述代码用Rmarkdown实现,实现代码如下:
想进一步了解R与数据挖掘可以关注我的知乎QQ ZHOU.