朴素贝叶斯分类算法是一种常用的分类方法,应用非常广泛,例如垃圾邮件识别、电子商务反作弊(如识别作弊卖家)等。
# Data set taken from Tom Mitchell's book "Machine Learning".
#
# The training table is built with
#   matrix(vector, nrow = r, ncol = c, byrow = logical_value,
#          dimnames = list(char_vector_rownames, char_vector_colnames))
# where
#   nrow     is the number of rows,
#   ncol     is the number of columns,
#   byrow    tells whether the vector fills the matrix row by row (TRUE)
#            or column by column (FALSE),
#   dimnames supplies the row labels and the column labels.
data <- matrix(
  data = c(
    "sunny",    "hot",  "high",   "weak",   "no",
    "sunny",    "hot",  "high",   "strong", "no",
    "overcast", "hot",  "high",   "weak",   "yes",
    "rain",     "mild", "high",   "weak",   "yes",
    "rain",     "cool", "normal", "weak",   "yes",
    "rain",     "cool", "normal", "strong", "no",
    "overcast", "cool", "normal", "strong", "yes",
    "sunny",    "mild", "high",   "weak",   "no",
    "sunny",    "cool", "normal", "weak",   "yes",
    "rain",     "mild", "normal", "weak",   "yes",
    "sunny",    "mild", "normal", "strong", "yes",
    "overcast", "mild", "high",   "strong", "yes",
    "overcast", "hot",  "normal", "weak",   "yes",
    "rain",     "mild", "high",   "strong", "no"
  ),
  nrow = 14,
  ncol = 5,
  byrow = TRUE,
  # Rows (days) carry no labels; columns are named after the attributes.
  dimnames = list(
    day = NULL,
    condition = c("outlook", "temperature", "humidity", "wind", "playtennis")
  )
)

# Prior probability of each class: the share of training rows whose
# playtennis column is "yes" and "no" respectively.
prior.yes <- sum(data[, "playtennis"] == "yes") / nrow(data)
prior.no <- sum(data[, "playtennis"] == "no") / nrow(data)
# Classify one observation as playtennis = "yes" or "no" with a naive Bayes
# model estimated from a table of categorical training rows.
#
# Args:
#   condition.vec: Character vector of feature values, given in the same
#     order as the feature columns of `dataset` (outlook, temperature,
#     humidity, wind for the default data). Elements beyond the number of
#     feature columns are ignored.
#   dataset: Character matrix of training rows. The last column holds the
#     class label ("yes" or "no"); every other column is a feature.
#     Defaults to the global `data` matrix.
#   laplace: Non-negative additive (Laplace) smoothing constant applied to
#     the conditional probabilities. The default 0 uses raw relative
#     frequencies, so a feature value never observed together with a class
#     forces that class's posterior to exactly 0.
#
# Returns:
#   A list with `post.pr.yes` and `post.pr.no` (the unnormalized posterior
#   probabilities of each class) and `prediction`, which is "yes" when
#   post.pr.yes >= post.pr.no and "no" otherwise.
#
# Errors when `dataset` is not a usable matrix, when `laplace` is not a
# single non-negative number, or when `condition.vec` is too short or has
# missing values among the features that are used.
naive.bayes.prediction <- function(condition.vec, dataset = data,
                                   laplace = 0) {
  if (!is.matrix(dataset) || nrow(dataset) < 1 || ncol(dataset) < 2) {
    stop("`dataset` must be a matrix with at least one row, one feature ",
         "column and a trailing class column.", call. = FALSE)
  }
  if (!is.numeric(laplace) || length(laplace) != 1 || is.na(laplace) ||
      laplace < 0) {
    stop("`laplace` must be a single non-negative number.", call. = FALSE)
  }

  n.features <- ncol(dataset) - 1
  if (length(condition.vec) < n.features) {
    stop("`condition.vec` must supply a value for each of the ", n.features,
         " feature columns.", call. = FALSE)
  }
  if (anyNA(condition.vec[seq_len(n.features)])) {
    stop("`condition.vec` must not contain missing values.", call. = FALSE)
  }

  labels <- dataset[, ncol(dataset)]

  # Unnormalized posterior of one class:
  #   P(class) * prod_j P(feature_j = f_j | class)
  unnormalized.posterior <- function(class.label) {
    in.class <- labels == class.label
    n.class <- sum(in.class)
    if (n.class == 0) {
      # The class never occurs, so its prior (and hence posterior) is 0;
      # returning early also avoids a 0 / 0 division below.
      return(0)
    }
    likelihoods <- vapply(seq_len(n.features), function(j) {
      matches <- sum(dataset[in.class, j] == condition.vec[j])
      # Smoothing adds `laplace` pseudo-counts for each distinct value
      # observed in this feature's training column.
      n.levels <- length(unique(dataset[, j]))
      (matches + laplace) / (n.class + laplace * n.levels)
    }, numeric(1))
    prod(likelihoods) * (n.class / length(labels))
  }

  playtennis.yes <- unnormalized.posterior("yes")
  playtennis.no <- unnormalized.posterior("no")

  list(
    post.pr.yes = playtennis.yes,
    post.pr.no = playtennis.no,
    # Ties are resolved in favour of "yes".
    prediction = if (playtennis.yes >= playtennis.no) "yes" else "no"
  )
}
# Classify three example days. Each vector lists outlook, temperature,
# humidity and wind, in that order.
naive.bayes.prediction(condition.vec = c("rain", "hot", "high", "strong"))
naive.bayes.prediction(condition.vec = c("sunny", "mild", "normal", "weak"))
naive.bayes.prediction(condition.vec = c("overcast", "mild", "normal", "weak"))
执行后,输出结果:
> naive.bayes.prediction(c("rain","hot","high","strong"));
$post.pr.yes
[1] 0.005291005
$post.pr.no
[1] 0.02742857
$prediction
[1] "no"
> naive.bayes.prediction(c("sunny","mild","normal","weak"));
$post.pr.yes
[1] 0.02821869
$post.pr.no
[1] 0.006857143
$prediction
[1] "yes"
> naive.bayes.prediction(c("overcast","mild","normal","weak"));
$post.pr.yes
[1] 0.05643739
$post.pr.no
[1] 0
$prediction
[1] "yes"