# Naive Bayes classifier, built incrementally on an R6 class.
library(R6)

# Start from an empty generator; methods are attached below via NBayes$set().
NBayes <- R6Class("NBayes")
# Corpus of six tokenised documents used as training data.
# Returns a list of character vectors (one vector of words per document);
# first step of the text pipeline: raw features before vectorisation.
loadDataSet <- function(){
  postingList <- list(
    c('my', 'dog', 'has', 'flea', 'problems', 'help', 'please'),
    c('maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'),
    c('my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'),
    c('stop', 'posting', 'stupid', 'worthless', 'garbage'),
    c('mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'),
    c('quit', 'buying', 'worthless', 'dog', 'food', 'stupid')
  )
  postingList
}
# Register the corpus loader as a private method of the class.
NBayes$set("private", "loadDataSet", loadDataSet, overwrite = TRUE)
# Vocabulary: every distinct word across all documents, in first-seen order.
#
# `unique(unlist(dataset))` is equivalent to folding union() over the
# documents (union(x, y) is unique(c(x, y))) but avoids growing a vector
# inside a loop.
createVocabList <- function(dataset){
  unique(unlist(dataset))
}
# Register the vocabulary builder as a private method of the class.
NBayes$set("private", "createVocabList", createVocabList, overwrite = TRUE)
# Build the feature data frame from the vocabulary: one row per document,
# one 0/1 column per vocabulary word, plus a class-label column `cls`.
# (The tm package offers the same document-term-matrix workflow.)
createData <- function(){
  # 1 if the vocabulary word occurs in the document, else 0.
  setofWords2Vec <- function(vocabList, inputset){
    as.numeric(vocabList %in% inputset)
  }
  tmp <- private$loadDataSet()
  vocabList <- private$createVocabList(tmp)
  tmp1 <- lapply(tmp, setofWords2Vec, vocabList = vocabList)
  # Each list element is one document's indicator vector; transpose so
  # documents become rows and vocabulary words become columns.
  res <- as.data.frame(t(as.data.frame(tmp1)))
  row.names(res) <- 1:nrow(res)
  names(res) <- vocabList
  # NOTE(review): this line was garbled in the source; the alternating
  # labels are reconstructed from later subset(cls == 'abusive') usage
  # and the six-document corpus — confirm against the original.
  res$cls <- c("notabusive", "abusive", "notabusive",
               "abusive", "notabusive", "abusive")
  res
}
NBayes$set("public", "createData", createData, overwrite = TRUE)
# Smoke test: instantiate the class and build the training data.
nbayes <- NBayes$new()
nbayes$createData()
# Conditional-probability demo: estimate P(my == 0 | cls == 'abusive')
# by counting rows.
nbayes <- NBayes$new()
demoData <- nbayes$createData()  # 6 rows, one column per vocabulary word
# NOTE(review): comparison operators were lost in the garbled source
# ("my0", "cls'abusive'"); `==` reconstructed — confirm intended values.
rn1 <- nrow(subset(demoData, my == 0 & cls == 'abusive'))  # joint count
rn2 <- nrow(subset(demoData, cls == 'abusive'))            # class count
rn1 / rn2
# Per-feature conditional-probability tables.
#
# For every feature column (all but the last), cross-tabulate the class
# column (assumed to be the LAST column) against the feature and normalise
# each class row, yielding P(feature value | class).
#
# Returns a named list of tables, one per feature column.
featPropTbl <- function(dataset){
  featProb <- function(feat){
    tbl <- table(dataset[, ncol(dataset)], dataset[, feat])
    prop.table(tbl, margin = 1)  # each class row sums to 1
  }
  # seq_len() is safe even if there are zero feature columns.
  feats <- names(dataset)[seq_len(ncol(dataset) - 1)]
  res <- lapply(feats, featProb)
  names(res) <- feats
  res
}
# Register featPropTbl and inspect one conditional probability:
# P(flea = 1 | cls = 'abusive').
NBayes$set("public", "featPropTbl", featPropTbl, overwrite = TRUE)
nbayes <- NBayes$new()
nbayesProbTbl <- nbayes$featPropTbl(demoData)
tbl <- nbayesProbTbl[["flea"]]
# NOTE(review): `==` reconstructed from the garbled source line.
tbl[row.names(tbl) == "abusive", colnames(tbl) == "1"]
# Train the classifier: conditional-probability tables for every feature
# plus the class prior, stored as the "priorProb" attribute of the model.
#
# NOTE(review): body reconstructed from a garbled source line; visible
# fragments showed self$featPropTbl(trainingData), a class-count table
# over the last column ("counts of each class"), and the "priorProb"
# attribute that later callers read back — confirm against the original.
trainNB0 <- function(trainingData){
  model <- self$featPropTbl(trainingData)
  # Counts of each class (class label sits in the last column).
  tbl <- table(trainingData[, ncol(trainingData)])
  attr(model, "priorProb") <- prop.table(tbl)  # class priors P(cls)
  model
}
NBayes$set("public", "trainNB0", trainNB0, overwrite = TRUE)
# Fit a model on the word-vector data and inspect the class priors.
nbbayes <- NBayes$new()
model <- nbbayes$trainNB0(demoData)
attr(model, "priorProb")
# Classify observation `w` (named character/numeric vector: feature -> value)
# with a trained model: pick the class maximising
#   prior(cls) * prod_i P(w_i | cls).
# Multiplying raw probabilities can underflow with many features; the
# log-space variant defined later in this file addresses that.
classifyNB <- function(model, w){
  # Look up P(featVal | cls) in the model's conditional-probability table.
  lookupProbTbl <- function(cls, feat, featVal){
    tbl <- model[[feat]]
    tbl[row.names(tbl) == as.character(cls), colnames(tbl) == as.character(featVal)]
  }
  # Conditional probabilities of every observed feature value given `cls`.
  getProb4Cls <- function(w, cls){
    sapply(seq_along(w), function(i){ lookupProbTbl(cls, names(w)[i], w[i]) })
  }
  priorTbl <- attr(model, "priorProb")
  res <- c()
  for(cls in names(priorTbl)){
    res <- c(res, prod(getProb4Cls(w, cls)) * priorTbl[cls])
  }
  print(res)  # show the unnormalised posterior scores
  names(priorTbl)[which.max(res)[1]]
}
# Register the (product-form) classifier as a public method.
NBayes$set("public", "classifyNB", classifyNB, overwrite = TRUE)
# Toy e-mail data set: four yes/no features plus the class label `cls`.
via <- c("yes", "yes", "no", "yes", "no", "no", "no")
mon <- c("yes", "no", "yes", "no", "yes", "no", "no")
groc <- c("no", "no", "yes", "no", "yes", "yes", "no")
unsub <- c("yes", "no", "no", "no", "yes", "no", "yes")
cls <- c("spam", "spam", "ham", "ham", "ham", "ham", "spam")
mydf <- data.frame(via = via, mon = mon, groc = groc, unsub = unsub,
                   cls = cls, stringsAsFactors = FALSE)
mydf
# A new e-mail to classify, named after the feature columns.
newemail <- c('yes', 'no', 'no', 'yes')
names(newemail) <- names(mydf)[1:(ncol(mydf) - 1)]
# Train on the toy e-mail data and classify the new message.
nbayes <- NBayes$new()
model <- nbayes$trainNB0(mydf)
nbayes$classifyNB(model, newemail)
# Train on the word-vector data and classify its first document
# (dropping the label column).
nbayes <- NBayes$new()
model <- nbayes$trainNB0(demoData)
nbayes$classifyNB(model, unlist(demoData[1, -ncol(demoData)]))
# Log-space redefinition of classifyNB: pick the class maximising
#   log(prior(cls)) + sum_i log P(w_i | cls).
# Summing logs avoids the numeric underflow that multiplying many small
# probabilities (the earlier prod() version) can cause.
classifyNB <- function(model, w){
  # Look up P(featVal | cls) in the model's conditional-probability table.
  lookupProbTbl <- function(cls, feat, featVal){
    tbl <- model[[feat]]
    tbl[row.names(tbl) == as.character(cls), colnames(tbl) == as.character(featVal)]
  }
  # Conditional probabilities of every observed feature value given `cls`.
  getProb4Cls <- function(w, cls){
    sapply(seq_along(w), function(i){ lookupProbTbl(cls, names(w)[i], w[i]) })
  }
  priorTbl <- attr(model, "priorProb")
  res <- c()
  for(cls in names(priorTbl)){
    res <- c(res, sum(log(getProb4Cls(w, cls))) + log(priorTbl[cls]))
  }
  print(res)  # show the log-posterior scores
  names(priorTbl)[which.max(res)[1]]
}
# Replace the registered classifier with the log-space version.
NBayes$set("public", "classifyNB", classifyNB, overwrite = TRUE)
nbayes <- NBayes$new()
# Re-train and re-classify the first document with the log-space classifier.
model <- nbayes$trainNB0(demoData)
nbayes$classifyNB(model, unlist(demoData[1, -ncol(demoData)]))
# Classify the new e-mail again, now with the log-space classifier.
newemail <- c('yes', 'no', 'no', 'yes')
names(newemail) <- names(mydf)[1:(ncol(mydf) - 1)]
nbayes <- NBayes$new()
model <- nbayes$trainNB0(mydf)
nbayes$classifyNB(model, newemail)