邮件透明过滤-贝叶斯垃圾邮件过滤

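贝叶斯过滤器的理论介绍

对邮件中的单个词 $w$,在假设垃圾邮件与正常邮件先验概率相等的前提下,该词出现时邮件为垃圾邮件的概率为:

$$P(S\mid w)=\frac{P(w\mid S)}{P(w\mid S)+P(w\mid H)}$$

其中 $P(w\mid S)$、$P(w\mid H)$ 分别是该词在垃圾邮件库和正常邮件库中出现的频率。

取邮件中概率最高的 $n$ 个词(下面的代码中 $n=15$),其联合概率为:

$$P=\frac{p_1 p_2\cdots p_n}{p_1 p_2\cdots p_n+(1-p_1)(1-p_2)\cdots(1-p_n)}$$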









代码示例:

#include "ProbC.h"
#include "common.h"

/* Divisor that turns a word's spam-library count into a frequency in
 * SinWordProb. Presumably the number of mails in the spam training
 * library -- TODO confirm against the code that builds the library. */
#define SPAMSIZE 50        
/* Divisor that turns a word's normal-mail ("pam") library count into a
 * frequency in SinWordProb. Presumably the number of mails in the
 * normal-mail training library -- TODO confirm. */
#define PAMSIZE 50           
/* Number of top-scoring words WordsMaxProb keeps for the joint
 * probability (size of its result buffer). */
#define MAXWORDS 15          

/**
 * Looks up how many times `word` is recorded in the given mail library.
 *
 * Thin wrapper around FindWordNumsLib (declared elsewhere in the project);
 * the returned value is whatever that helper reports. SinWordProb treats a
 * result of 0 as "word not present in this library".
 *
 * @param word     NUL-terminated word to look up.
 * @param maillib  Word -> count table of one training library.
 * @return The count reported by FindWordNumsLib.
 */
int FindWordNum(char *word,map<string,int> maillib)
{
	// The original definition had no return type; implicit int is not
	// valid C++, so it is spelled out here.
	return FindWordNumsLib(word,maillib);
}


/**
 * Probability that a mail is spam given that it contains `word`:
 *
 *     p(s|w) = p(w|s) / (p(w|s) + p(w|h))
 *
 * p(w|s) and p(w|h) are the word's counts in the spam and normal-mail
 * libraries divided by SPAMSIZE and PAMSIZE respectively. A library in
 * which the word does not occur contributes a fixed 0.1 instead of 0, and
 * a word unknown to both libraries scores a fixed 0.4.
 *
 * @param word      NUL-terminated word to score.
 * @param spam_map  Word -> count table of the spam library.
 * @param pam_map   Word -> count table of the normal-mail library.
 * @return The spam probability of the word.
 */
double SinWordProb(char *word,map<string,int>spam_map,map<string,int>pam_map)
{
	const int spamHits = FindWordNum(word,spam_map);
	const int hamHits = FindWordNum(word,pam_map);
	printf("wordspamnum = %d,wordpamnum = %d\n",spamHits,hamHits);

	// Word never seen in either library: fixed prior.
	if(spamHits == 0 && hamHits == 0)
	{
		return 0.4;
	}

	// At least one library knows the word; the other one (if any) falls
	// back to 0.1 so a single missing count cannot force the result to 0 or 1.
	const double spamFreq = (spamHits != 0) ? double(spamHits)/SPAMSIZE : 0.1;
	const double hamFreq = (hamHits != 0) ? double(hamHits)/PAMSIZE : 0.1;

	return spamFreq/(spamFreq + hamFreq);
}

/**
 * Scores every word of a mail with SinWordProb and returns the highest
 * scores.
 *
 * @param mailmap   Word -> count table of the mail being classified.
 * @param spam_map  Word -> count table of the spam library.
 * @param pam_map   Word -> count table of the normal-mail library.
 * @return A buffer allocated with calloc that the caller must release with
 *         free():
 *         - if the mail has at least MAXWORDS words: exactly MAXWORDS
 *           probabilities, highest first;
 *         - otherwise: one probability per word (the count reported by
 *           MapWordSizeStr), in ascending order;
 *         - NULL if an allocation fails (and possibly for an empty mail,
 *           since calloc(0, ...) may return NULL).
 */
double *WordsMaxProb(map<string,int> mailmap,map<string,int> spam_map,map<string,int> pam_map)
{
	int mailwordsize = MapWordSizeStr(mailmap);
	printf("mailwordsize = %d\n",mailwordsize);
	if(mailwordsize < 0)
	{
		// Never size an allocation from a negative count.
		mailwordsize = 0;
	}

	double *wordsprob = (double*)calloc(mailwordsize,sizeof(double));
	double *result = (double*)calloc(MAXWORDS,sizeof(double));
	if((wordsprob == NULL && mailwordsize > 0) || result == NULL)
	{
		free(wordsprob);
		free(result);
		return NULL;
	}

	int index = 0;
	// The index bound keeps the writes inside wordsprob even if
	// MapWordSizeStr were to report fewer entries than the map holds.
	for(map<string,int>::iterator iter = mailmap.begin();
	    iter != mailmap.end() && index < mailwordsize; ++iter)
	{
		// SinWordProb takes a mutable char*, so hand it a private copy of the
		// key. A std::string copy has no length limit, unlike the former
		// 30-byte stack buffer that strcpy could overflow.
		string word = iter->first;
		wordsprob[index++] = SinWordProb(&word[0],spam_map,pam_map);
	}

	sort(wordsprob,wordsprob + mailwordsize);
	for(index = mailwordsize-1;index >=0;index--)
	{
		printf("wordsprob[%d]:%f\n",index,wordsprob[index]);
	}

	if(MAXWORDS <= mailwordsize)
	{
		// Copy the MAXWORDS largest values, starting at the LAST valid element
		// (mailwordsize - 1); the original started one past the end.
		for(index = 0;index < MAXWORDS;index++)
		{
			result[index] = wordsprob[mailwordsize - 1 - index];
		}
		free(wordsprob);   // scratch buffer is no longer needed
		return result;
	}

	// Fewer than MAXWORDS words: hand back all of them and release the
	// unused fixed-size buffer instead of leaking it.
	free(result);
	return wordsprob;
}


/**
 * Combines per-word spam probabilities into one probability for the mail:
 *
 *     (p1 * p2 * ... * pn) / (p1 * p2 * ... * pn + (1-p1) * (1-p2) * ... * (1-pn))
 *
 * @param wordprob  Array of at least `wordnums` per-word probabilities.
 * @param wordnums  Number of entries of `wordprob` to combine.
 * @return The combined probability. Returns the neutral value 0.5 when there
 *         is nothing to combine (null array or wordnums <= 0) and when the
 *         denominator is exactly zero (e.g. the input contains both 0.0 and
 *         1.0), where the formula would otherwise yield NaN.
 */
double WordsJointProb(double wordprob[],int wordnums)
{
	// No evidence at all: do not touch wordprob[0], report "undecided".
	if(!wordprob || wordnums <= 0)
	{
		return 0.5;
	}

	double spamProduct = 1.0;
	double hamProduct = 1.0;
	for(int index = 0;index < wordnums;index++)
	{
		spamProduct *= wordprob[index];
		hamProduct *= (1-wordprob[index]);
	}

	const double denominator = spamProduct + hamProduct;
	// Exact comparison is intended: only a denominator of exactly zero makes
	// the division below produce 0/0.
	if(denominator == 0.0)
	{
		return 0.5;
	}
	return spamProduct / denominator;
}

参考链接: http://www.ruanyifeng.com/blog/2011/08/bayesian_inference_part_two.html


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值