3、机器学习算法Bayes -- Java代码

标签: 机器学习
1413人阅读 评论(0) 收藏 举报
分类:


朴素贝叶斯属于监督学习的分类算法。


package algorithm.machine;
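/**
 * Open question: when estimating the class-conditional word probabilities P(word | class),
 * a word that never occurred in a class gets probability 0 -- how should that be handled?
 * (The standard remedy is additive / Laplace smoothing.)
 */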
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * 3. Bayes: a multinomial naive Bayes spam filter (supervised classification).
 *
 * <p>Training reads up to {@code TRAINING_FILES_PER_CLASS} mails from a "ham" directory and
 * the same number from a "spam" directory, builds a shared vocabulary, and counts how often
 * each vocabulary word occurs in each class (bag-of-words model). Classification compares the
 * smoothed log-likelihood of a mail under both classes.
 *
 * <p>The demo in {@link #main(String[])} expects the data under
 * {@code E:\machinedata\bayes\email} with mails named {@code 1.txt} .. {@code 25.txt}.
 *
 * @author baolibin
 */
public class _03_Bayes {
	/** Label returned by {@link #classify(String)} for a legitimate mail. */
	private static final String LABEL_HAM = "正常邮件";
	/** Label returned by {@link #classify(String)} for a spam mail. */
	private static final String LABEL_SPAM = "垃圾邮件";
	/** Maximum number of mails per class that are used for training. */
	private static final int TRAINING_FILES_PER_CLASS = 20;
	/** Matches a single ASCII letter anywhere in the input. */
	private static final Pattern LETTER = Pattern.compile("[a-zA-Z]");
	/** Matches every character that is not an ASCII letter or digit. */
	private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");

	/** Vocabulary: every distinct normalized token seen in the training mails. */
	List<String> wordli = new ArrayList<String>();

	/** Per-vocabulary-word occurrence counts over the ham training mails. */
	int[] wordVector1;
	/** Per-vocabulary-word occurrence counts over the spam training mails. */
	int[] wordVector2;

	/**
	 * Trains the model on the hard-coded data directories and prints the hit rate for the
	 * ham and the spam mails.
	 *
	 * <p>NOTE: the evaluation runs over mails 1..25 of each class, which includes mails that
	 * were also used for training, so the printed rates are optimistic.
	 *
	 * @param args unused
	 * @throws IOException if a mail cannot be read
	 */
	public static void main(String[] args) throws IOException {
		String pathHam = "E:\\machinedata\\bayes\\email\\ham"; // root of the legitimate mails
		String pathSpam = "E:\\machinedata\\bayes\\email\\spam"; // root of the spam mails
		_03_Bayes classifier = new _03_Bayes();
		classifier.wordList(pathHam, pathSpam); // build the vocabulary
		classifier.wordVector(pathHam, pathSpam); // build the word-count vectors

		System.out.println("\n\n正常邮件判断正确率为:" + classifier.hitRate(pathHam, LABEL_HAM, 25));
		System.out.println("垃圾邮件判断正确率为:" + classifier.hitRate(pathSpam, LABEL_SPAM, 25));
	}

	/**
	 * Classifies {@code 1.txt} .. {@code fileCount.txt} inside {@code dir} and returns the
	 * fraction that received {@code expectedLabel}, rounded to two decimals.
	 */
	private double hitRate(String dir, String expectedLabel, int fileCount) throws IOException {
		int correct = 0;
		for (int i = 1; i <= fileCount; i++) {
			String mailPath = new File(dir, i + ".txt").getPath();
			if (expectedLabel.equals(classify(mailPath))) {
				correct++;
			}
		}
		// Plain arithmetic rounding: formatting and re-parsing the number would break in
		// locales that use a decimal comma.
		return Math.round(100.0 * correct / fileCount) / 100.0;
	}

	/**
	 * Tells whether a string contains at least one ASCII letter.
	 *
	 * @param cardNum the text to inspect; must not be {@code null}
	 * @return {@code true} if any character is in {@code a-z} or {@code A-Z}
	 */
	public boolean judgeContainsStr(String cardNum) {
		// find() instead of matches(".*[a-zA-Z]+.*"): '.' does not match line terminators,
		// so the old pattern wrongly rejected text containing a line break.
		return LETTER.matcher(cardNum).find();
	}

	/**
	 * Step 1: builds the vocabulary from the training mails.
	 *
	 * <p>Mails are visited alternately (ham, spam, ham, ...) and every normalized token that
	 * is not yet known is appended to {@link #wordli}. The vocabulary is then printed, nine
	 * words per line.
	 *
	 * @param pathHam  root directory of the legitimate mails
	 * @param pathSpam root directory of the spam mails
	 * @throws IOException if a directory cannot be listed or a mail cannot be read
	 */
	public void wordList(String pathHam, String pathSpam) throws IOException {
		File[] hamMails = trainingFiles(pathHam);
		File[] spamMails = trainingFiles(pathSpam);
		int rounds = Math.max(hamMails.length, spamMails.length);
		for (int i = 0; i < rounds; i++) {
			if (i < hamMails.length) {
				addToVocabulary(readTokens(hamMails[i]));
			}
			if (i < spamMails.length) {
				addToVocabulary(readTokens(spamMails[i]));
			}
		}

		System.out.println("词汇表长度为:" + wordli.size());
		System.out.println("词汇表内容为:");
		int column = 1;
		for (String word : wordli) {
			if (column == 10) { // start a new output line after nine words
				System.out.println();
				column = 1;
			}
			System.out.print(word + " ");
			column++;
		}
	}

	/**
	 * Step 2: builds the per-class word-count vectors (bag-of-words model: every occurrence
	 * counts, not just presence).
	 *
	 * <p>Must be called after {@link #wordList(String, String)}; the vectors are indexed like
	 * {@link #wordli}. Tokens that are not in the vocabulary are ignored. Both vectors are
	 * printed afterwards.
	 *
	 * @param pathHam  root directory of the legitimate mails
	 * @param pathSpam root directory of the spam mails
	 * @throws IOException if a directory cannot be listed or a mail cannot be read
	 */
	public void wordVector(String pathHam, String pathSpam) throws IOException {
		wordVector1 = new int[wordli.size()];
		wordVector2 = new int[wordli.size()];

		for (File mail : trainingFiles(pathHam)) {
			countTokens(readTokens(mail), wordVector1);
		}
		for (File mail : trainingFiles(pathSpam)) {
			countTokens(readTokens(mail), wordVector2);
		}

		System.out.println("\n正常邮件词向量为:");
		for (int count : wordVector1) {
			System.out.print(count + " ");
		}
		System.out.println("\n垃圾邮件词向量为:");
		for (int count : wordVector2) {
			System.out.print(count + " ");
		}
	}

	/**
	 * Classifies one mail as legitimate or spam.
	 *
	 * <p>Bayes' rule gives P(class | mail) = P(mail | class) * P(class) / P(mail). P(mail) is
	 * the same for both classes and the class priors are taken as equal (the demo trains on
	 * the same number of mails per class), so only P(mail | class) is compared. Under the
	 * naive independence assumption this is the product of the per-word probabilities; the
	 * product is evaluated as a sum of logarithms to avoid floating-point underflow. Each
	 * word probability uses Laplace smoothing,
	 * {@code (count + 1) / (wordsInClass + vocabularySize)}, so a word that never occurred in
	 * a class does not zero out the whole product. Words outside the vocabulary are skipped.
	 *
	 * @param filePath path of the mail to classify
	 * @return {@code "正常邮件"} if the ham likelihood is strictly larger, otherwise
	 *         {@code "垃圾邮件"} (ties therefore count as spam)
	 * @throws IOException           if the mail cannot be read
	 * @throws IllegalStateException if the word vectors are missing or do not match the
	 *                               vocabulary, i.e. the model has not been trained
	 */
	public String classify(String filePath) throws IOException {
		int vocabularySize = wordli.size();
		if (wordVector1 == null || wordVector2 == null
				|| wordVector1.length != vocabularySize || wordVector2.length != vocabularySize) {
			throw new IllegalStateException(
					"Model is not trained: call wordList(..) and wordVector(..) first");
		}

		double hamDenominator = (double) sum(wordVector1) + vocabularySize;
		double spamDenominator = (double) sum(wordVector2) + vocabularySize;

		double hamLogLikelihood = 0.0;
		double spamLogLikelihood = 0.0;
		for (String token : readTokens(new File(filePath))) {
			int index = wordli.indexOf(token);
			if (index < 0) {
				continue; // unknown word: carries no information about either class
			}
			hamLogLikelihood += Math.log((wordVector1[index] + 1) / hamDenominator);
			spamLogLikelihood += Math.log((wordVector2[index] + 1) / spamDenominator);
		}

		return hamLogLikelihood > spamLogLikelihood ? LABEL_HAM : LABEL_SPAM;
	}

	/**
	 * Returns the first {@code TRAINING_FILES_PER_CLASS} entries of a directory (fewer if the
	 * directory is smaller), in the order reported by {@link File#list()}.
	 */
	private static File[] trainingFiles(String dir) throws IOException {
		File root = new File(dir);
		String[] names = root.list();
		if (names == null) { // not a directory, or an I/O error while listing
			throw new IOException("Cannot list mail directory: " + dir);
		}
		File[] mails = new File[Math.min(TRAINING_FILES_PER_CLASS, names.length)];
		for (int i = 0; i < mails.length; i++) {
			mails[i] = new File(root, names[i]); // portable: no hard-coded separator
		}
		return mails;
	}

	/**
	 * Reads a mail and returns its normalized tokens in reading order: split on whitespace,
	 * upper-cased, kept only if they contain a letter, stripped of non-alphanumerics.
	 */
	private List<String> readTokens(File mail) throws IOException {
		List<String> tokens = new ArrayList<String>();
		BufferedReader reader = new BufferedReader(new FileReader(mail));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				for (String raw : line.split("\\s+")) {
					String token = raw.trim().toUpperCase(Locale.ROOT);
					if (judgeContainsStr(token)) {
						tokens.add(NON_ALPHANUMERIC.matcher(token).replaceAll(""));
					}
				}
			}
		} finally {
			reader.close(); // every mail gets closed, even when reading fails
		}
		return tokens;
	}

	/** Appends every token that is not yet in the vocabulary. */
	private void addToVocabulary(List<String> tokens) {
		for (String token : tokens) {
			if (!wordli.contains(token)) {
				wordli.add(token);
			}
		}
	}

	/** Increments the counter of every token that is in the vocabulary. */
	private void countTokens(List<String> tokens, int[] counts) {
		for (String token : tokens) {
			int index = wordli.indexOf(token);
			if (index >= 0) {
				counts[index]++;
			}
		}
	}

	/** Adds up all entries of a count vector. */
	private static long sum(int[] counts) {
		long total = 0;
		for (int count : counts) {
			total += count;
		}
		return total;
	}
}


输入的正常邮件:



输入的垃圾邮件:


输出结果:

词汇表长度为:556
词汇表内容为:
HI PETER WITH JOSE OUT OF TOWN DO YOU 
WANT TO MEET ONCE IN A WHILE KEEP THINGS 
GOING AND SOME INTERESTING STUFF LET ME KNOW EUGENE 
CODEINE 15MG FOR VISA ONLY METHYLMORPHINE IS NARCOTIC OPIOID 
PAIN RELIEVER WE HAVE 30MG PILLS 3015MG 6015MG 9015MG 
RYAN WHYBREW COMMENTED ON YOUR STATUS WROTE TURD FERGUSON 
OR BUTT HORN ORDERCIALIZVIAGRA ONLINE SAVE 0NLINE PHARMACY NOPRESCRIPTION 
REQUIRED BUY CANADIAN DRUGS AT WHOLESALE PRICES FDAAPPROVED SUPERB 
QUALITY ACCEPT ALL MAJOR CREDIT CARDS ARVIND THIRUMALAI REPLY 
THIS EMAIL COMMENT EVERYTHING GAIN INCREDIB1E GAINS LENGTH INCHES 
YOURPENIS PERMANANTLY AMAZING INCREASE THICKNESS UP BETTEREJACU1ATION CONTROL EXPERIENCE 
ROCKHARDERECETIONS EXPLOSIVE INTENSEORGASNS VOLUME OFEJACU1ATE DOCTOR DESIGNED ENDORSED HERBAL 
NATURAL SAFE THE PROVEN NATURALPENISENHANCEMENT THAT WORKS MONEYBACK GUARANTEEED 
THANKS ILL DEFINITELY CHECK HOW BOOK I HEARD CHAPTER 
CAME IT WAS GOOD SHAPE HOPE ARE DOING WELL 
CHEERS TROY AMBIEM ZOLPIDEM 5MG10MG PILL X MG JAY 
STEPP SEE THREAD FOLLOW LINK BELOW ORDER TODAY FROM 
LINKEDIN KERRY HALONEY REQUESTED ADD AS CONNECTION ID LIKE 
MY PROFESSIONAL NETWORK BUYVIAGRA 25MG 50MG 100MG BRANDVIAGRA FEMALEVIAGRA 
PER VIAGRANOPRESCRIPTION NEEDED CERTIFIED HERE AMEX ECHECK WORLDWIDE DELIVERY 
HOTELS ONES RENT TENT THEY LINED HOTEL GROUNDS SO 
MUCH BEING ONE NATURE MORE COUPLE DOZEN TOUR GROUPS 
ABOUT 100M PICTURES TRIP CAN GO THROUGH THEM GET 
JPGS FAVORITE SCENIC WHERE JOCELYN NOW NEW YORK WILL 
COME TOKYO CHINESE YEAR PERHAPS TWO THEN THAILAND WINTER 
HOLIDAY MOM TAKE CARE D YEAH AM READY MAY 
NOT BE BECAUSE JAR HAS PLANE TICKETS GERMANY BENOIT 
MANDELBROT WILMOTT TEAM MATHEMATICIAN FATHER FRACTAL MATHEMATICS ADVOCATE SOPHISTICATED 
MODELLING QUANTITATIVE FINANCE DIED 14TH OCTOBER AGED MAGAZINE OFTEN 
FEATURED HIS IDEAS WORK OTHERS INSPIRED BY FUNDAMENTAL INSIGHTS 
MUST LOGGED VIEW THESE ARTICLES PAST ISSUES HOME BASED 
BUSINESS OPPORTUNITY KNOCKING DOOR DONT RUDE CHANCE EARN GREAT 
INCOME FIND FINANCIAL LIFE TRANSFORMED LEARN SUCCESS FINDER EXPERTS 
SURE THING SOUNDS WHAT TIME WOULD PREPARED THERE REGARDS 
VIVEK MOST COMPETITIVE PRICE NET WILSON FREEVIAGRA JULIUS O 
LOOKING FORWARD INVITATION OFF WATCHESSTORE DISCOUNT WATCHES FAMOUS BRANDS 
AROLEXBVLGARI DIOR HERMES ORIS CARTIER AP LOUIS VUITTON BAGS 
WALLETS GUCCI TIFFANY CO JEWERLY ENJOY FULL WARRANTY SHIPMENT 
VIA REPUTABLE COURIER FEDEX UPS DHL EMS SPEEDPOST RECIEVE 
YAY BOTH FINE IM WORKING AN MBA DESIGN STRATEGY 
CCA TOP ART SCHOOL ITS PROGRAM FOCUSING RIGHTBRAINED CREATIVE 
STRATEGIC APPROACH MANAGEMENT WAY DONE HYDROCODONEVICODIN ESBRAND WATSON VICODIN 
ES BRAND FREE EXPRESS DAYS OVER IVE THOUGHT THINK 
POSSIBLE SHOULD ANOTHER LUNCH CAR COULD PICK DOES WEDNESDAY 
SIGNED COPY SAW COASTTHOUGHT U MIGHT HANGZHOU HUGE DAY 
WASNT ENOUGH BUT GOT GLIMPSE WENT INSIDE CHINA PAVILION 
EXPO PRETTY EACH PROVINCE EXHIBIT PERCOCET WITHOUTPRESCRIPTION TABS ANALGESIC 
USED TREAT MODERATE MODERATELY SEVEREPAIN SHIPPING DISCREET PRIVATE CHEAP 
HOMMIES JUST PHONE CALL ROOFER SPAYING FOAMING DUSTY PLS 
CLOSE DOORS WINDOWS HELP BATHROOM WINDOW CAT SLIDING BEHIND 
TV THOSE CATS SURVIVE SORRY ANY INCONVENIENCE SCIFINANCE AUTOMATICALLY 
GENERATES GPUENABLED PRICING RISK MODEL SOURCE CODE RUNS 50300X 
FASTER THAN SERIAL USING NVIDIA FERMICLASS TESLA 20SERIES GPU 
DERIVATIVES DEVELOPMENT TOOL CC CONCISE HIGHLEVEL SPECIFICATIONS NO PARALLEL 
COMPUTING CUDA PROGRAMMING EXPERTISE SCIFINANCES AUTOMATIC MONTE CARLO GENERATION 
CAPABILITIES BEEN SIGNIFICANTLY EXTENDED LATEST RELEASE INCLUDES OK COLD 
RETIREMENT PARTY LEAVES CHANGING COLOR BIGGERPENIS GROW 3INCHES SAFEST 
EFFECTIVE METHODS OFPENISEN1ARGEMENT MONEY BETTERERECTIONS MA1EENHANCEMENT PRODUCTS SUPPLEMENT TRUSTED 
MILLIONS TALKED JOHN COMPUTER THATS BIKE RIDING RAIN MUSEUM 
SF YESTERDAY HAD FOOD SAME GIANTS GAME WHEN TRAIN 
FANS DRUNK YO RUNNING WEBSITE JQUERY JQPLOT PLUGIN TOO 
FAR AWAY HAVING PROTOTYPE LAUNCH RIGHT IF 
正常邮件词向量为:
5 6 5 1 2 11 1 2 18 1 30 1 1 10 15 1 1 1 4 18 3 2 2 2 4 4 1 0 0 6 0 0 0 8 0 0 0 0 9 4 0 0 0 0 0 2 1 3 18 4 5 3 1 1 2 1 1 0 0 0 0 0 0 1 0 0 0 4 0 0 0 0 0 1 3 0 0 0 2 1 2 10 3 3 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 0 0 6 0 0 0 1 1 1 1 2 3 15 1 1 1 8 4 3 1 1 6 2 1 1 1 0 0 0 0 0 0 2 1 3 1 1 1 1 0 2 7 5 2 2 2 3 2 2 1 4 5 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 4 1 1 1 1 1 2 3 2 3 1 1 1 1 3 1 2 1 4 3 1 1 3 1 1 1 1 1 3 4 1 6 4 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 4 6 1 2 3 1 1 1 3 4 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 2 1 1 1 1 2 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 3 2 1 4 1 1 0 0 0 0 0 0 2 2 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 4 2 3 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 0 0 0 0 0 0 1 0 0 0 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 2 3 3 2 4 3 4 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 2 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 2 1 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 
垃圾邮件词向量为:
0 0 2 0 0 12 0 0 10 0 27 0 0 12 8 0 0 0 0 17 0 0 0 1 0 0 0 7 2 8 3 4 1 4 3 1 1 1 2 7 5 9 1 1 1 0 0 0 1 7 0 0 0 0 0 0 0 2 7 6 2 3 3 3 7 3 6 6 2 2 2 2 5 3 5 3 3 3 0 0 0 1 0 0 6 6 6 6 6 6 12 6 6 12 6 10 6 6 7 6 6 6 6 6 6 6 6 6 6 8 4 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 2 9 10 0 0 0 0 0 0 0 5 3 4 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 1 1 0 0 3 0 0 0 0 0 0 0 0 3 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 2 1 1 1 4 3 0 0 0 0 0 4 3 3 7 3 6 3 3 3 3 3 3 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 1 1 3 1 1 2 1 3 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 2 1 1 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

正常邮件判断正确率为:0.56
垃圾邮件判断正确率为:0.76


查看评论

25个JAVA 机器学习工具包

本列表总结了25个Java机器学习工具&库: Weka集成了数据挖掘工作的机器学习算法。这些算法可以直接应用于一个数据集上或者你可以自己编写代码来调用。Weka包括一系列的工具,如数据预处理、分类、回...
  • zhangjianjaEE
  • zhangjianjaEE
  • 2017-11-20 17:54:57
  • 202

机器学习入门算法及其java实现-KNN算法

1、算法基本原理: 对于一个新点X0(x0,y0)X_{0}(x_{0},y_{0}),它的分类y0y_{0}由离它最近的k个点的类别决定; 其中训练集为T{(x1,y1),(x2,y2),...,(...
  • fre0sty
  • fre0sty
  • 2017-10-17 21:23:48
  • 467

java实现一个简单的机器学习和数据挖掘的demo

因为最近忙着一个比赛,想用机器学习的方法来实现,因为我们用的是java,所以就用到了weka,weka的jar包可以去官网下载. 1,第一步先准备数据在项目里创建一个txt文件,然后把下面的...
  • u013078669
  • u013078669
  • 2016-08-06 18:04:29
  • 6536

经典的机器学习方面源代码库

编程语言:搞实验个人认为当然matlab最灵活了(但是正版很贵),但是更为前途的是python(numpy+scipy+matplotlib)和C/C++,这样组合既可搞研究,也可搞商业开发,易用性不...
  • sun2728
  • sun2728
  • 2016-08-11 13:47:58
  • 2739

超全!基于Java的机器学习项目、环境、库...

https://yq.aliyun.com/articles/278837?utm_source=tuicool&utm_medium=referral 摘要: 你是一名希望开始或者正在学习...
  • u011001084
  • u011001084
  • 2017-12-07 16:36:29
  • 390

[机器学习]用Java实现梯度下降

这是在coursea的解释 多元的梯度下降 运算过程实际也就是求偏导数本测试用例为2元但适用于多元的数据数据如下X1,2,3Y1,2,3代码如下package hello; import java....
  • jidong2622
  • jidong2622
  • 2018-02-21 13:27:26
  • 93

PLA算法Java实现——机器学习基石

package Machine_learning.PLA;/** * Created by unclewang on 2017/3/22. */ public class Weight { ...
  • u014277388
  • u014277388
  • 2017-03-22 23:12:19
  • 469

机器学习知识点(七)决策树学习算法Java实现

为理解机器学习第四章节决策树学习算法,通过网上找到的一份现成代码,主要实现了最优划分属性选择和决策树构造,其中最优划分属性选择采用信息增益准则,代码如下: package sk.ml; import...
  • fjssharpsword
  • fjssharpsword
  • 2017-02-04 11:21:31
  • 2686

java实现:机器人视觉与环境感知基于八叉树算法(待开源)

java实现的机器人视觉与环境感知,基于八叉树算法。 随后我会整理,增加注释,在GitHub开源https://github.com/hjwang1/robot...
  • hjwang1
  • hjwang1
  • 2016-09-14 18:35:15
  • 961

机器学习算法-java版

  • 2008年09月19日 00:47
  • 318KB
  • 下载
    个人资料
    专栏达人 持之以恒
    等级:
    访问量: 55万+
    积分: 9215
    排名: 2535
    博客专栏