基于朴素贝叶斯的垃圾邮件检测

package cn.zhf.test;

import java.io.*;
import java.util.*;

/**
 * Naive-Bayes spam detector for Chinese e-mail.
 *
 * <p>Two training corpora (spam and legitimate mail) are segmented into words with a
 * dictionary-based forward maximum-match segmenter. For every word seen in the spam corpus
 * the detector derives P(spam | word), and a mail is scored by combining the probabilities
 * of all known words it contains.
 *
 * <p>All files are read with the {@code gbk} encoding. I/O failures are reported on
 * {@code System.err} and treated as empty input (best effort), they never abort the run.
 */
public class SpamMailDetection {
    /** Directory holding the corpora, the dictionary and the mail to classify. */
    public static final String BASE_PATH = "C:\\Users\\zhf\\Desktop\\mail";
    /** Spam training corpus. */
    public static final String SPAM_PATH = BASE_PATH + "\\train_illegal.txt";
    /** Legitimate-mail training corpus. */
    public static final String OK_PATH = BASE_PATH + "\\train_legal.txt";
    /** The mail to classify. */
    public static final String EMAIL_PATH = BASE_PATH + "\\to_judge.txt";
    /** Dictionary used for word segmentation (one entry per line, word before the first tab). */
    public static final String DICT_PATH = BASE_PATH + "\\dict.txt";

    /** Encoding of every file this class reads. */
    private static final String FILE_ENCODING = "gbk";
    /** Longest dictionary word, in characters, the segmenter tries to match. */
    private static final int MAX_WORD_LENGTH = 6;
    /** A mail whose spam probability exceeds this value is reported as spam. */
    private static final double SPAM_THRESHOLD = 0.5;

    /**
     * Trains on the two corpora, scores the mail at {@link #EMAIL_PATH} and prints the verdict.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        SpamMailDetection smc = new SpamMailDetection();
        // word -> relative frequency in the legitimate corpus
        Map<String, Double> okmap = smc.createMailMap(OK_PATH);
        // word -> relative frequency in the spam corpus
        Map<String, Double> spammap = smc.createMailMap(SPAM_PATH);
        Map<String, Double> ratemap = smc.createSpamProbabilityMap(spammap, okmap);
        double probability = smc.judgeMail(EMAIL_PATH, ratemap);
        // judgeMail returns P(spam), so a HIGH value means spam.
        if (probability > SPAM_THRESHOLD) {
            System.out.println("It's a spam mail.");
        } else {
            System.out.println("It's an ok mail.");
        }
    }

    /**
     * Segments the mail stored at {@code emailPath} and returns the probability that it is spam.
     *
     * <pre>
     * P(Spam | t1..tn) = (P1*P2*...*Pn) / (P1*P2*...*Pn + (1-P1)*(1-P2)*...*(1-Pn))
     * </pre>
     *
     * Words missing from {@code ratemap} are ignored. If no word is known the result is 0.5.
     *
     * @param emailPath path of the mail to classify
     * @param ratemap   word -> P(spam | word), as built by {@link #createSpamProbabilityMap}
     * @return the spam probability in [0, 1]
     */
    public double judgeMail(String emailPath, Map<String, Double> ratemap) {
        return combineProbabilities(segment(readFile(emailPath)), ratemap);
    }

    /**
     * Combines the per-word spam probabilities of {@code words} into one mail-level probability.
     *
     * <p>The two products of the formula are accumulated as sums of logarithms: multiplying
     * hundreds of values below 1 directly underflows to 0 and would yield 0/0 = NaN.
     *
     * @param words   segmented words of the mail (repetitions count once per occurrence)
     * @param ratemap word -> P(spam | word); unknown words are skipped
     * @return the combined spam probability, 0.5 when there is no or contradictory evidence
     */
    static double combineProbabilities(List<String> words, Map<String, Double> ratemap) {
        double logSpam = 0.0; // log(P1 * P2 * ... * Pn)
        double logHam = 0.0;  // log((1-P1) * (1-P2) * ... * (1-Pn))
        for (String word : words) {
            Double p = ratemap.get(word);
            if (p == null) {
                continue;
            }
            logSpam += Math.log(p);
            logHam += Math.log(1 - p);
        }
        if (Double.isInfinite(logSpam) && Double.isInfinite(logHam)) {
            // Some word says "certainly spam" and another "certainly not": undecidable.
            return 0.5;
        }
        // spam / (spam + ham) == 1 / (1 + ham / spam)
        return 1.0 / (1.0 + Math.exp(logHam - logSpam));
    }

    /**
     * Reads a corpus file, segments it and returns the relative frequency of every word found.
     *
     * @param filePath path of the corpus file
     * @return word -> (occurrences of the word / total number of segmented words);
     *         empty when the file is unreadable or contains no dictionary word
     */
    public Map<String, Double> createMailMap(String filePath) {
        return buildFrequencyMap(segment(readFile(filePath)));
    }

    /**
     * Computes the relative frequency of each distinct word in {@code words}.
     *
     * @param words segmented words, repetitions included
     * @return word -> occurrences / {@code words.size()}
     */
    static Map<String, Double> buildFrequencyMap(List<String> words) {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String word : words) {
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        Map<String, Double> frequencies = new HashMap<String, Double>();
        // Kept as double so the division below is not truncated to an integer.
        double total = words.size();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            frequencies.put(entry.getKey(), entry.getValue() / total);
        }
        return frequencies;
    }

    /**
     * Builds, for every word of the spam corpus, the probability that a mail containing it
     * is spam.
     *
     * <pre>
     * P(Spam | ti) = P2(ti) / (P1(ti) + P2(ti))
     * </pre>
     *
     * where P1 is the frequency in the legitimate corpus and P2 the frequency in the spam
     * corpus. Words that occur only in the legitimate corpus are not part of the result.
     * Words whose combined frequency is not positive carry no evidence and are skipped
     * instead of being stored as NaN.
     *
     * @param spammap word -> frequency in the spam corpus
     * @param okmap   word -> frequency in the legitimate corpus
     * @return word -> P(spam | word)
     */
    public Map<String, Double> createSpamProbabilityMap(Map<String, Double> spammap,
            Map<String, Double> okmap) {
        Map<String, Double> retmap = new HashMap<String, Double>();
        for (Map.Entry<String, Double> entry : spammap.entrySet()) {
            double spamRate = entry.getValue();
            Double okRate = okmap.get(entry.getKey());
            double allRate = okRate == null ? spamRate : spamRate + okRate;
            if (!(allRate > 0.0)) {
                continue;
            }
            retmap.put(entry.getKey(), spamRate / allRate);
        }
        return retmap;
    }

    /**
     * Segments Chinese text using the dictionary at {@link #DICT_PATH}.
     *
     * <p>The dictionary is loaded from disk on every call; use
     * {@link #segment(String, Map)} with a map from {@link #loadDict()} to load it once.
     *
     * @param str the text to segment
     * @return the dictionary words found, in order of appearance
     */
    public List<String> segment(String str) {
        return segment(str, loadDict());
    }

    /**
     * Segments text by forward maximum matching against {@code dict}.
     *
     * <p>At each position the longest candidate (up to {@value #MAX_WORD_LENGTH} characters)
     * is tried first and shortened until a dictionary word is found. If no candidate matches,
     * the character at that position is skipped.
     *
     * @param str  the text to segment; {@code null} is treated as empty
     * @param dict dictionary whose keys are the known words (values are ignored)
     * @return the dictionary words found, in order of appearance
     */
    public List<String> segment(String str, Map<String, Integer> dict) {
        List<String> list = new ArrayList<String>();
        if (str == null) {
            return list;
        }
        int len = str.length();
        int i = 0;
        while (i < len) {
            // Exclusive end of the longest candidate starting at i.
            int end = Math.min(i + MAX_WORD_LENGTH, len);
            boolean found = false;
            for (int j = end; j > i; j--) {
                String term = str.substring(i, j);
                if (dict.containsKey(term)) {
                    list.add(term);
                    i = j;
                    found = true;
                    break;
                }
            }
            if (!found) {
                i++;
            }
        }
        return list;
    }

    /**
     * Loads the segmentation dictionary from {@link #DICT_PATH}.
     *
     * <p>Each line contributes the text before its first tab as a word; lines without such
     * text are ignored. On an I/O error the stack trace is printed and the entries read so
     * far are returned.
     *
     * @return word -> 0 (only the keys are meaningful)
     */
    public Map<String, Integer> loadDict() {
        Map<String, Integer> map = new HashMap<String, Integer>();
        BufferedReader br = null;
        try {
            br = openReader(DICT_PATH);
            String line;
            while ((line = br.readLine()) != null) {
                // split() returns an empty array for a line made only of tabs.
                String[] fields = line.split("\t");
                if (fields.length > 0 && fields[0].length() > 0) {
                    map.put(fields[0], 0);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
        return map;
    }

    /**
     * Reads a whole text file into one string.
     *
     * <p>Lines are concatenated without any separator. On an I/O error the stack trace is
     * printed and the text read so far (possibly empty) is returned.
     *
     * @param filePath path of the file to read
     * @return the file content with line terminators removed
     */
    public String readFile(String filePath) {
        StringBuilder content = new StringBuilder();
        BufferedReader br = null;
        try {
            br = openReader(filePath);
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
        return content.toString();
    }

    /** Opens {@code path} as a buffered reader using {@link #FILE_ENCODING}. */
    private static BufferedReader openReader(String path) throws IOException {
        FileInputStream in = new FileInputStream(new File(path));
        try {
            return new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
        } catch (UnsupportedEncodingException e) {
            // The stream is already open at this point; do not leak it.
            closeQuietly(in);
            throw e;
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure of the close itself. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when closing a read-only stream fails.
        }
    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值