完整工程代码下载地址如下:
https://download.csdn.net/download/luohualiushui1/10949880
首先大家了解一下贝努利(Bernoulli)模型,如下:
n重贝努利试验:重复进行n次独立的贝努利试验,这里“重复”的意思是指各次试验的条件是相同的,它意味着各次试验中事件发生的概率保持不变。“独立”是指各次试验的结果是相互独立的。基于n重贝努利试验建立的模型,即为贝努利模型。
然后大家去了解一下朴素贝叶斯算法,如下:
贝叶斯分类是一系列分类算法的总称,这类算法均以贝叶斯定理为基础,故统称为贝叶斯分类。朴素贝叶斯算法(Naive Bayesian) 是其中应用最为广泛的分类算法之一。
朴素贝叶斯分类器基于一个简单的假定:给定目标值时属性之间相互条件独立。
通过以上定理和“朴素”的假定,我们知道:
P( Category | Document) = P ( Document | Category ) * P( Category) / P(Document)
接下来是我从资料中查阅到的基于贝努利模型的朴素贝叶斯算法的python代码,如下:
# Pre-process the data: build the vocabulary
def createVocabList(dataSet):
    """Return a list of the distinct words appearing in any document of dataSet."""
    vocab = set()
    for document in dataSet:
        # fold each document's words into the running vocabulary
        vocab.update(document)
    return list(vocab)
def setOfWords2Vec(vocabList, inputSet):
    """Convert a document (word list) into a 0/1 presence vector over vocabList.

    Words not found in the vocabulary are reported and skipped.
    """
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        # EAFP: look the word up directly, handle the miss
        try:
            returnVec[vocabList.index(word)] = 1
        except ValueError:
            print("the word:%s is not in my Vocabulary!" % word)
    return returnVec
# Train the (two-class) Bernoulli naive-Bayes model
def trainNBO(trainMatrix, trainCategory):
    """Train a Bernoulli naive Bayes model on binary word vectors.

    Args:
        trainMatrix: sequence of 0/1 word-presence vectors, one per document.
        trainCategory: parallel sequence of 0/1 class labels.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log-probabilities for class 0
        and class 1, and the prior probability of class 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: word counts start at 1, denominators at 2,
    # so unseen words never produce log(0).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    # NOTE(review): the original named these pODenom (capital letter O) /
    # p1Denom — renamed for consistency with p0Num/p1Num.
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
# Classifier
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a 0/1 word vector by comparing the two class posteriors.

    Args:
        vec2Classify: 0/1 word-presence vector for the document.
        p0Vec, p1Vec: per-word log-probabilities for class 0 / class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if class 1 has the higher log-posterior, else 0.
    """
    # zip-based dot product: works with plain lists as well as numpy arrays
    # (the original `vec2Classify*p1Vec` required numpy; it also used the
    # confusable name `pl` — lowercase L — for p1).
    p1 = sum(v * w for v, w in zip(vec2Classify, p1Vec)) + log(pClass1)
    p0 = sum(v * w for v, w in zip(vec2Classify, p0Vec)) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
大家仔细看该python代码可以发现它实际上在代码里面写死了事件的类型只有2种,并不通用,而我用java实现的时候是支持多种事件类型的情况的。
现在开始,首先是数据预处理
/**
 * Build the vocabulary: the distinct words over all documents, in
 * first-seen order.
 *
 * @param datas the corpus; each element is one document as a word array
 * @return array of unique words
 */
public static String [] createVocabList(List<String[]> datas) {
    // LinkedHashSet keeps first-insertion order (matching the original
    // List-based behavior) while replacing the O(n^2) List.contains scan
    // with O(1) hash lookups. Fully qualified to avoid new imports.
    java.util.Set<String> vocab = new java.util.LinkedHashSet<String>();
    for (int i = 0; i < datas.size(); i++) {
        for (int j = 0; j < datas.get(i).length; j++) {
            vocab.add(datas.get(i)[j]);
        }
    }
    return vocab.toArray(new String[vocab.size()]);
}
/**
 * Convert a document into a 0/1 presence vector over the vocabulary.
 *
 * @param vocabList the vocabulary (one vector slot per word)
 * @param inputSet  the document's words
 * @return int array where rs[i] == 1 iff vocabList[i] occurs in inputSet
 */
public static int[] setOfWords2Vec(String [] vocabList,String [] inputSet) {
    int[] rs = new int[vocabList.length];
    // HashSet gives O(1) membership tests; the original used
    // List.contains, which is O(n) per vocabulary word.
    java.util.Set<String> words = new java.util.HashSet<String>(Arrays.asList(inputSet));
    for (int i = 0; i < vocabList.length; i++) {
        rs[i] = words.contains(vocabList[i]) ? 1 : 0;
    }
    return rs;
}
然后是训练模型
/**
 * Train a multi-class Bernoulli naive Bayes model.
 *
 * @param matdatas one 0/1 word-presence vector per row
 * @param labels   class label of each row (any int labels, not just 0/1)
 * @return map from class label to per-word log-probabilities
 */
public static Map<Integer,double[]> trainNBO(DenseMatrix64F matdatas,int[] labels){
    Map<Integer,double[]> rs = new HashMap<Integer,double[]>();
    Map<Integer,double[]> pNum = new HashMap<Integer,double[]>();
    Map<Integer,Double> pDenom = new HashMap<Integer,Double>();
    for (int i = 0; i < matdatas.numRows; i++) {
        int label = labels[i];
        double[] counts = pNum.get(label);
        if (counts == null) {
            // First document of this class: Laplace smoothing —
            // word counts start at 1, denominator at 2.
            counts = new double[matdatas.numCols];
            Arrays.fill(counts, 1.0);
            pNum.put(label, counts);
            pDenom.put(label, 2.0);
        }
        // Map.put overwrites in place, so the original's remove()
        // before each put() was redundant; the counts array can also
        // be mutated directly.
        pDenom.put(label, pDenom.get(label) + sumMatrixRow(matdatas, i));
        for (int j = 0; j < matdatas.numCols; j++) {
            counts[j] += matdatas.get(i, j);
        }
    }
    // Convert smoothed counts to log-probabilities per class.
    for (Map.Entry<Integer,double[]> entry : pNum.entrySet()) {
        double denom = pDenom.get(entry.getKey());
        double[] logProbs = entry.getValue();
        for (int j = 0; j < logProbs.length; j++) {
            logProbs[j] = Math.log(logProbs[j] / denom);
        }
        rs.put(entry.getKey(), logProbs);
    }
    return rs;
}
接下来便是分类器
/**
 * Classify a 0/1 word vector: return the label whose posterior
 * log-probability (likelihood dot-product plus log prior) is highest.
 *
 * @param vec2Classify      the document's 0/1 word vector
 * @param trans             per-class word log-probabilities from training
 * @param labelsPossibility per-class prior probabilities
 * @return the arg-max class label, or -1 if trans is empty
 */
public static int classifyNB(int[] vec2Classify,Map<Integer,double[]> trans,Map<Integer,Double> labelsPossibility) {
    int bestLabel = -1;
    double bestScore = -1 * Double.MAX_VALUE;
    for (Map.Entry<Integer,double[]> entry : trans.entrySet()) {
        double[] logProbs = entry.getValue();
        // Dot product of the document vector with this class's
        // log-probabilities (summed first to keep the original
        // floating-point addition order), then add the log prior.
        double score = 0;
        for (int i = 0; i < logProbs.length; i++) {
            score += (double) vec2Classify[i] * logProbs[i];
        }
        score += Math.log(labelsPossibility.get(entry.getKey()));
        if (score > bestScore) {
            bestScore = score;
            bestLabel = entry.getKey();
        }
    }
    return bestLabel;
}
到这里代码基本写得差不多了,开始测试。
测试的业务环境是对词组进行分类,测试代码如下:
// Build the toy corpus: six short documents with class labels
// (1 = abusive, 0 = normal).
List<String[]> datas = new ArrayList<String[]>();
datas.add(new String[]{"my","dog","has","flea","problems","help","please"});
datas.add(new String[]{"maybe","not","take","him","to","dog","park","stupid"});
datas.add(new String[]{"my","dalmation","is","so","cute","I","love","him"});
datas.add(new String[]{"stop","posting","stupid","worthless","garbage"});
datas.add(new String[]{"mr","licks","ate","my","steak","how","to","stop","him"});
datas.add(new String[]{"quit","buying","worthless","dog","food","stupid"});
int[] labels =new int[] {0,1,0,1,0,1};
// Vocabulary over the whole corpus, then one 0/1 presence row per document.
String [] createVocabList = createVocabList(datas);
DenseMatrix64F matdatas = new DenseMatrix64F(datas.size(),createVocabList.length);
for(int i=0;i<datas.size();i++) {
int [] tmp = setOfWords2Vec(createVocabList,datas.get(i));
for(int j=0;j<tmp.length;j++) {
matdatas.set(i, j, tmp[j]);
}
}
// Class priors — presumably the per-label frequency in `labels`;
// getLabelsPossibility is defined elsewhere (not shown here).
Map<Integer,Double> labelsPossibility = getLabelsPossibility(labels);
Map<Integer,double[]> trainDatas = trainNBO(matdatas,labels);
// Classify two unseen documents: expected classes 0 and 1 respectively.
String [] testEntry = new String[]{"love","my","dalmation"};
int [] thisDoc= setOfWords2Vec(createVocabList,testEntry);
System.out.println("classifiedas:"+classifyNB(thisDoc,trainDatas,labelsPossibility));
testEntry=new String[] {"stupid","garbage"};
thisDoc=setOfWords2Vec(createVocabList,testEntry);
System.out.println("classifiedas:"+classifyNB(thisDoc,trainDatas,labelsPossibility));
测试效果如下:
测试成功