目录
第 51 天: kNN 分类器
今天将代码抄写并运行
下载安装了weka.jar包,加载arff文件
代码:
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.*;
public class KnnClassification {
/**
* Manhattan distance.
*/
public static final int MANHATTAN = 0;
/**
* Euclidean distance.
*/
public static final int EUCLIDEAN = 1;
/**
* The distance measure.
*/
public int distanceMeasure = EUCLIDEAN;
/**
* A random instance;
*/
public static final Random random = new Random();
/**
* The number of neighbors.
*/
int numNeighbors = 7;
/**
* The whole dataset.
*/
Instances dataset;
/**
* The training set. Represented by the indices of the data.
*/
int[] trainingSet;
/**
* The testing set. Represented by the indices of the data.
*/
int[] testingSet;
/**
* The predictions.
*/
int[] predictions;
/**
*********************
* The first constructor.
*
* @param paraFilename
* The arff filename.
*********************
*/
public KnnClassification(String paraFilename) {
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
// The last attribute is the decision class.
dataset.setClassIndex(dataset.numAttributes() - 1);
fileReader.close();
} catch (Exception ee) {
System.out.println("Error occurred while trying to read \'" + paraFilename
+ "\' in KnnClassification constructor.\r\n" + ee);
System.exit(0);
}
}
/**
*********************
* Get a random indices for data randomization.
*
* @param paraLength
* The length of the sequence.
* @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
*********************
*/
public static int[] getRandomIndices(int paraLength) {
int[] resultIndices = new int[paraLength];
// Step 1. Initialize.
for (int i = 0; i < paraLength; i++) {
resultIndices[i] = i;
}
// Step 2. Randomly swap.
int tempFirst, tempSecond, tempValue;
for (int i = 0; i < paraLength; i++) {
// Generate two random indices.
tempFirst = random.nextInt(paraLength);
tempSecond = random.nextInt(paraLength);
// Swap.
tempValue = resultIndices[tempFirst];
resultIndices[tempFirst] = resultIndices[tempSecond];
resultIndices[tempSecond] = tempValue;
}
return resultIndices;
}
/**
*********************
* Split the data into training and testing parts.
*
* @param paraTrainingFraction
* The fraction of the training set.
*********************
*/
public void splitTrainingTesting(double paraTrainingFraction) {
int tempSize = dataset.numInstances();
int[] tempIndices = getRandomIndices(tempSize);
int tempTrainingSize = (int) (tempSize * paraTrainingFraction);
trainingSet = new int[tempTrainingSize];
testingSet = new int[tempSize - tempTrainingSize];
for (int i = 0; i < tempTrainingSize; i++) {
trainingSet[i] = tempIndices[i];
}
for (int i = 0; i < tempSize - tempTrainingSize; i++) {
testingSet[i] = tempIndices[tempTrainingSize + i];
}
}/
/**
*********************
* Predict for the whole testing set. The results are stored in predictions.
* #see predictions.
*********************
*/
public void predict() {
predictions = new int[testingSet.length];
for (int i = 0; i < predictions.length; i++) {
predictions[i] = predict(testingSet[i]);
}
}
/**
*********************
* Predict for given instance.
*
* @return The prediction.
*********************
*/
public int predict(int paraIndex) {
int[] tempNeighbors = computeNearests(paraIndex);
int resultPrediction = simpleVoting(tempNeighbors);
return resultPrediction;
}
/**
*********************
* The distance between two instances.
*
* @param paraI
* The index of the first instance.
* @param paraJ
* The index of the second instance.
* @return The distance.
*********************
*/
public double distance(int paraI, int paraJ) {
int resultDistance = 0;
double tempDifference;
switch (distanceMeasure) {
case MANHATTAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
if (tempDifference < 0) {
resultDistance -= tempDifference;
} else {
resultDistance += tempDifference;
}
}
break;
case EUCLIDEAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
resultDistance += tempDifference * tempDifference;
}
break;
default:
System.out.println("Unsupported distance measure: " + distanceMeasure);
}
return resultDistance;
}
/**
*********************
* Get the accuracy of the classifier.
*
* @return The accuracy.
*********************
*/
public double getAccuracy() {
// A double divides an int gets another double.
double tempCorrect = 0;
for (int i = 0; i < predictions.length; i++) {
if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
tempCorrect++;
}
}
return tempCorrect / testingSet.length;
}
/**
************************************
* Compute the nearest k neighbors. Select one neighbor in each scan. In
* fact we can scan only once. You may implement it by yourself.
*
* @param paraK
* the k value for kNN.
* @param paraCurrent
* current instance. We are comparing it with all others.
* @return the indices of the nearest instances.
************************************
*/
public int[] computeNearests(int paraCurrent) {
int[] resultNearests = new int[numNeighbors];
boolean[] tempSelected = new boolean[trainingSet.length];
double tempDistance;
double tempMinimalDistance;
int tempMinimalIndex = 0;
// Select the nearest paraK indices.
for (int i = 0; i < numNeighbors; i++) {
tempMinimalDistance = Double.MAX_VALUE;
for (int j = 0; j < trainingSet.length; j++) {
if (tempSelected[j]) {
continue;
}
tempDistance = distance(paraCurrent, trainingSet[j]);
if (tempDistance < tempMinimalDistance) {
tempMinimalDistance = tempDistance;
tempMinimalIndex = j;
}
}
resultNearests[i] = trainingSet[tempMinimalIndex];
tempSelected[tempMinimalIndex] = true;
}
System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
return resultNearests;
}
/**
************************************
* Voting using the instances.
*
* @param paraNeighbors
* The indices of the neighbors.
* @return The predicted label.
************************************
*/
public int simpleVoting(int[] paraNeighbors) {
int[] tempVotes = new int[dataset.numClasses()];
for (int i = 0; i < paraNeighbors.length; i++) {
tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
}
int tempMaximalVotingIndex = 0;
int tempMaximalVoting = 0;
for (int i = 0; i < dataset.numClasses(); i++) {
if (tempVotes[i] > tempMaximalVoting) {
tempMaximalVoting = tempVotes[i];
tempMaximalVotingIndex = i;
}
}
return tempMaximalVotingIndex;
}
/**
*********************
* The entrance of the program.
*
* @param args
* Not used now.
*********************
*/
public static void main(String args[]) {
KnnClassification tempClassifier = new KnnClassification("D:/data/iris.arff");
tempClassifier.splitTrainingTesting(0.8);
tempClassifier.predict();
System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
}
}
求邻居和精确度结果:
第 52 天: kNN 分类器 (续)
重新实现 computeNearests, 仅需要扫描一遍训练集, 即可获得 k 个邻居. 提示: 将原代码与插入排序思想相结合.
增加 setDistanceMeasure() 方法,选择距离计算的方法.
增加 setNumNeighbors() 方法,设置邻居的数量.
代码:
/**
 ************************************
 * Compute the nearest k neighbors with a single scan of the training set,
 * maintaining an (index, distance) table sorted ascending by distance via
 * straight insertion sort.
 *
 * @param paraCurrent
 *            The current instance, compared with all training instances.
 * @return The indices of the numNeighbors nearest training instances.
 ************************************
 */
public int[] computeNearests(int paraCurrent) {
    int[] resultNearests = new int[numNeighbors];
    double tempDistance;

    // Row layout: [0] = position within trainingSet, [1] = distance.
    double[][] tempDistanceArray = new double[trainingSet.length][2];
    tempDistanceArray[0][0] = 0;
    tempDistanceArray[0][1] = distance(paraCurrent, trainingSet[0]);
    int j;
    for (int i = 1; i < trainingSet.length; i++) {
        tempDistance = distance(paraCurrent, trainingSet[i]);
        // Shift larger entries one slot to the right.
        for (j = i - 1; j >= 0; j--) {
            if (tempDistance < tempDistanceArray[j][1]) {
                // BUGFIX: copy the element VALUES. The original assigned
                // tempDistanceArray[j + 1] = tempDistanceArray[j], which
                // copies the row reference; the later write into row j + 1
                // then overwrote the shifted entry as well.
                tempDistanceArray[j + 1][0] = tempDistanceArray[j][0];
                tempDistanceArray[j + 1][1] = tempDistanceArray[j][1];
            } else {
                break;
            }
        }
        // Insert the new element at the hole left by the shifting.
        tempDistanceArray[j + 1][0] = i;
        tempDistanceArray[j + 1][1] = tempDistance;
    }

    // The first numNeighbors rows now hold the nearest training positions.
    for (int i = 0; i < numNeighbors; i++) {
        resultNearests[i] = trainingSet[(int) tempDistanceArray[i][0]];
    }

    System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
    return resultNearests;
}
/**
 *********************
 * Set the distance measure.
 *
 * @param paraType
 *            0 for Manhattan, 1 for Euclidean; other values are rejected
 *            with a message and leave the current measure unchanged.
 *********************
 */
public void setDistanceMeasure(int paraType) {
    switch (paraType) {
    case 0:
        distanceMeasure = MANHATTAN;
        break;
    case 1:
        distanceMeasure = EUCLIDEAN;
        break;
    default:
        System.out.println("Wrong Distance Measure.");
    }
}
/**
 *********************
 * Set the number of neighbors (k).
 *
 * @param paraNumNeighbors
 *            The new k. Rejected with a message (leaving k unchanged) when
 *            non-positive or larger than the dataset size.
 *********************
 */
public void setNumNeighbors(int paraNumNeighbors) {
    // Guard added: a non-positive k is meaningless and would make
    // computeNearests return an empty neighbor set.
    if (paraNumNeighbors <= 0) {
        System.out.println("The number of neighbors must be positive.");
        return;
    }
    // Necessary (not sufficient) bound: the training set is even smaller
    // than the whole dataset.
    if (paraNumNeighbors > dataset.numInstances()) {
        System.out.println("The number of neighbors is too big.");
        return;
    }

    numNeighbors = paraNumNeighbors;
}
/**
 *********************
 * The entrance of the program. Builds the classifier, configures the
 * distance measure and k, then reports the testing accuracy.
 *
 * @param args
 *            Not used now.
 *********************
 */
public static void main(String args[]) {
    KnnClassification tempClassifier = new KnnClassification("D:/data/iris.arff");
    // 0 selects the Manhattan distance.
    tempClassifier.setDistanceMeasure(0);
    tempClassifier.setNumNeighbors(5);
    // 80% training, 20% testing.
    tempClassifier.splitTrainingTesting(0.8);
    tempClassifier.predict();
    System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
}
结果:
第 53 天: kNN 分类器 (续)
增加 weightedVoting() 方法, 距离越短话语权越大. 支持两种以上的加权方式.
实现 leave-one-out 测试.
留一法交叉验证是 k 折交叉验证在 k 等于样本总数 n 时的特例:每次只取一个样本作为测试集,其余 n-1 个样本作为训练集;轮流让每个样本各当一次测试样本,共进行 n 次,最后对 n 次结果取平均。
训练集的精确度:
测试集的精确度:
代码:
/**
 ************************************
 * Distance-weighted voting: each neighbor votes for its own class, and a
 * closer neighbor carries a larger weight (see getWeightedNum).
 *
 * @param paraCurrent
 *            The instance being classified.
 * @param paraNeighbors
 *            The indices of the neighbors.
 * @return The predicted label (class with the maximal accumulated weight).
 ************************************
 */
public int weightedVoting(int paraCurrent, int[] paraNeighbors) {
    double[] tempVotes = new double[dataset.numClasses()];
    // Weight-curve parameters; a larger a and smaller b flatten the curve.
    int a = 2, b = 1;

    // Accumulate one weighted vote per neighbor.
    for (int i = 0; i < paraNeighbors.length; i++) {
        double tempDistance = distance(paraCurrent, paraNeighbors[i]);
        int tempLabel = (int) dataset.instance(paraNeighbors[i]).classValue();
        tempVotes[tempLabel] += getWeightedNum(a, b, tempDistance);
    }

    // Pick the class with the heaviest total vote.
    int tempMaximalVotingIndex = 0;
    double tempMaximalVoting = 0;
    for (int i = 0; i < tempVotes.length; i++) {
        if (tempVotes[i] > tempMaximalVoting) {
            tempMaximalVoting = tempVotes[i];
            tempMaximalVotingIndex = i;
        }
    }

    return tempMaximalVotingIndex;
}
/**
 ************************************
 * The voting weight for one neighbor: b / (distance + a), so the weight
 * decays as the distance grows and is bounded by b / a at distance zero.
 *
 * @param a
 *            Additive smoothing constant in the denominator.
 * @param b
 *            Numerator scale.
 * @param paraDistance
 *            The distance between the neighbor and the query instance.
 * @return The voting weight.
 ************************************
 */
public double getWeightedNum(int a, int b, double paraDistance) {
    return b / (a + paraDistance);
}
/**
 ************************************
 * Leave-one-out cross validation: each instance serves as the one-element
 * testing set exactly once, with all other instances as the training set.
 * The overall accuracy is printed. Note: trainingSet/testingSet fields are
 * overwritten by this method.
 ************************************
 */
public void leave_one_out() {
    int tempSize = dataset.numInstances();
    int[] tempIndices = getRandomIndices(tempSize);
    double tempCorrect = 0;

    for (int i = 0; i < tempSize; i++) {
        // Instance i is the single test case; everything else trains.
        testingSet = new int[] { tempIndices[i] };
        trainingSet = new int[tempSize - 1];
        int tempIndex = 0;
        for (int j = 0; j < tempSize; j++) {
            if (j != i) {
                trainingSet[tempIndex++] = tempIndices[j];
            }
        }

        this.predict();
        if (predictions[0] == dataset.instance(testingSet[0]).classValue()) {
            tempCorrect++;
        }
    }

    System.out.println("The accuracy is:" + tempCorrect / tempSize);
}
/**
 * The entrance of the program: split-based evaluation followed by
 * leave-one-out evaluation.
 *
 * @param args
 *            Not used now.
 */
public static void main(String[] args) {
    KnnClassification tempClassifier = new KnnClassification("D:\\data\\iris.arff");
    // 0 selects the Manhattan distance.
    tempClassifier.setDistanceMeasure(0);
    tempClassifier.setNumNeighbors(5);
    tempClassifier.splitTrainingTesting(0.8);
    tempClassifier.predict();
    System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
    // Leave-one-out evaluation over the whole dataset.
    System.out.println("\r\n-------leave_one_out-------");
    tempClassifier.leave_one_out();
}
第 54 天: 基于 M-distance 的推荐
所谓 M-distance, 就是根据平均分来计算两个用户 (或项目) 之间的距离.
采用 item-based recommendation, 则第 j 个项目关于第 i 个用户的邻居项目集合为
第 i 个用户对 j 个项目的评分预测为:
邻居不用 k 控制. 距离小于 radius 的都是邻居. 使用 M-distance 时, 这种方式效果更好.
使用 leave-one-out的测试方式, 很高效的算法才能使用这种方式.
结果:
代码:
package xjx;
import java.io.*;
public class MBR {

    /** Default rating for the 1-5 point scale, used when no neighbor exists. */
    public static final double DEFAULT_RATING = 3.0;

    /** The total number of users. */
    private int numUsers;

    /** The total number of items. */
    private int numItems;

    /** The total number of ratings (non-zero values). */
    private int numRatings;

    /** Leave-one-out predictions, aligned with compressedRatingMatrix. */
    private double[] predictions;

    /** Compressed rating matrix: user-item-rating triples, sorted by user. */
    private int[][] compressedRatingMatrix;

    /** The degree of each user (how many items the user has rated). */
    private int[] userDegrees;

    /** The average rating of each user. */
    private double[] userAverageRatings;

    /** The degree of each item (how many users have rated it). */
    private int[] itemDegrees;

    /** The average rating of each item. */
    private double[] itemAverageRatings;

    /**
     * Index of each user's first rating. The first user starts from 0; if the
     * first user has x ratings, the second user starts from x.
     */
    private int[] userStartingIndices;

    /** Number of ratings for which no neighbor was found. */
    private int numNonNeighbors;

    /** The radius (delta) for determining the neighborhood. */
    private double radius;

    /**
     *************************
     * Construct the rating matrix.
     *
     * @param paraFilename
     *            the rating filename (CSV lines: user,item,rating).
     * @param paraNumUsers
     *            number of users.
     * @param paraNumItems
     *            number of items.
     * @param paraNumRatings
     *            number of ratings.
     * @throws Exception
     *             if the file cannot be read or parsed.
     *************************
     */
    public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        // Step 1. Initialize these arrays.
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;
        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];
        predictions = new double[numRatings];

        System.out.println("Reading " + paraFilename);

        // Step 2. Read the data file.
        File tempFile = new File(paraFilename);
        if (!tempFile.exists()) {
            System.out.println("File " + paraFilename + " does not exists.");
            System.exit(0);
        } // Of if

        // BUGFIX: try-with-resources guarantees the reader is closed even if
        // parsing throws; the original leaked it on failure.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            String[] tempStrArray;
            int tempIndex = 0;
            userStartingIndices[0] = 0;
            userStartingIndices[numUsers] = numRatings;
            while ((tempString = tempBufReader.readLine()) != null) {
                // Each line has three values: user, item, rating.
                tempStrArray = tempString.split(",");
                compressedRatingMatrix[tempIndex][0] = Integer.parseInt(tempStrArray[0]);
                compressedRatingMatrix[tempIndex][1] = Integer.parseInt(tempStrArray[1]);
                compressedRatingMatrix[tempIndex][2] = Integer.parseInt(tempStrArray[2]);

                userDegrees[compressedRatingMatrix[tempIndex][0]]++;
                itemDegrees[compressedRatingMatrix[tempIndex][1]]++;

                if (tempIndex > 0) {
                    // Starting to read the data of a new user.
                    if (compressedRatingMatrix[tempIndex][0] != compressedRatingMatrix[tempIndex - 1][0]) {
                        userStartingIndices[compressedRatingMatrix[tempIndex][0]] = tempIndex;
                    } // Of if
                } // Of if
                tempIndex++;
            } // Of while
        } // Of try

        // Step 3. Compute the user and item averages.
        // NOTE(review): a user/item with degree 0 yields NaN here — assumes
        // every user and item in range appears in the file.
        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        for (int i = 0; i < numRatings; i++) {
            tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
            tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
        } // Of for i

        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        } // Of for i
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        } // Of for i
    }// Of the first constructor

    /**
     *************************
     * Set the radius (delta). Non-positive values fall back to 0.1.
     *
     * @param paraRadius
     *            The given radius.
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        } // Of if
    }// Of setRadius

    /**
     *************************
     * Leave-one-out prediction (item-based). For each rating, the left-out
     * item's neighbors are the OTHER items the same user has rated whose
     * average rating is within radius of the item's (recomputed) average.
     * The predicted values are stored in predictions.
     *
     * @see #predictions
     *************************
     */
    public void leaveOneOutPrediction() {
        double tempItemAverageRating;
        // Make each line of the code shorter.
        int tempUser, tempItem, tempRating;
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        numNonNeighbors = 0;
        for (int i = 0; i < numRatings; i++) {
            tempUser = compressedRatingMatrix[i][0];
            tempItem = compressedRatingMatrix[i][1];
            tempRating = compressedRatingMatrix[i][2];

            // Step 1. Recompute the average rating of the current item with
            // the left-out rating removed.
            // NOTE(review): an item with degree 1 divides by zero here
            // (yields Infinity) — assumes every item has >= 2 ratings.
            tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
                    / (itemDegrees[tempItem] - 1);

            // Step 2. Find the neighbors, at the same time accumulating
            // their ratings.
            int tempNeighbors = 0;
            double tempTotal = 0;
            int tempComparedItem;
            for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
                tempComparedItem = compressedRatingMatrix[j][1];
                if (tempItem == tempComparedItem) {
                    continue;// Ignore itself.
                } // Of if

                if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
                    tempTotal += compressedRatingMatrix[j][2];
                    tempNeighbors++;
                } // Of if
            } // Of for j

            // Step 3. Predict as the average value of neighbors.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            } // Of if
        } // Of for i
    }// Of leaveOneOutPrediction

    /**
     *************************
     * Compute the MAE based on the deviation of each leave-one-out
     * prediction.
     *
     * @return The mean absolute error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        return tempTotalError / predictions.length;
    }// Of computeMAE

    /**
     *************************
     * Compute the RMSE based on the deviation of each leave-one-out
     * prediction. (The method name keeps the original spelling RSME for
     * caller compatibility.)
     *
     * @return The root mean squared error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += (predictions[i] - compressedRatingMatrix[i][2])
                    * (predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }// Of computeRSME

    /**
     *************************
     * The entrance of the program.
     *
     * @param args
     *            Not used now.
     *************************
     */
    public static void main(String[] args) {
        try {
            MBR tempRecommender = new MBR("D:/data/movielens943u1682m.txt", 943, 1682, 100000);

            for (double tempRadius = 0.2; tempRadius < 0.6; tempRadius += 0.1) {
                tempRecommender.setRadius(tempRadius);
                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            } // Of for tempRadius
        } catch (Exception ee) {
            System.out.println(ee);
        } // Of try
    }// Of main
}// Of class MBR
第 55 天: 基于 M-distance 的推荐 (续)
昨天实现的是 item-based recommendation. 今天自己来实现一下 user-based recommendation. 只需要在原有基础上增加即可.
代码:
package xjx;
import java.io.*;
public class MBR {

    /** Default rating for the 1-5 point scale, used when no neighbor exists. */
    public static final double DEFAULT_RATING = 3.0;

    /** The total number of users. */
    private int numUsers;

    /** The total number of items. */
    private int numItems;

    /** The total number of ratings (non-zero values). */
    private int numRatings;

    /** Leave-one-out predictions, aligned with compressedRatingMatrix. */
    private double[] predictions;

    /** Compressed rating matrix: user-item-rating triples, sorted by user. */
    private int[][] compressedRatingMatrix;

    /** The degree of each user (how many items the user has rated). */
    private int[] userDegrees;

    /** The average rating of each user. */
    private double[] userAverageRatings;

    /** The degree of each item (how many users have rated it). */
    private int[] itemDegrees;

    /** The average rating of each item. */
    private double[] itemAverageRatings;

    /**
     * Index of each user's first rating. The first user starts from 0; if the
     * first user has x ratings, the second user starts from x.
     */
    private int[] userStartingIndices;

    /** Number of ratings for which no neighbor was found. */
    private int numNonNeighbors;

    /** The radius (delta) for determining the neighborhood. */
    private double radius;

    /**
     *************************
     * Construct the rating matrix.
     *
     * @param paraFilename
     *            the rating filename (CSV lines: user,item,rating).
     * @param paraNumUsers
     *            number of users.
     * @param paraNumItems
     *            number of items.
     * @param paraNumRatings
     *            number of ratings.
     * @throws Exception
     *             if the file cannot be read or parsed.
     *************************
     */
    public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        // Step 1. Initialize these arrays.
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;
        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];
        predictions = new double[numRatings];

        System.out.println("Reading " + paraFilename);

        // Step 2. Read the data file.
        File tempFile = new File(paraFilename);
        if (!tempFile.exists()) {
            System.out.println("File " + paraFilename + " does not exists.");
            System.exit(0);
        }

        // try-with-resources closes the reader even if parsing throws.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            String[] tempStrArray;
            int tempIndex = 0;
            userStartingIndices[0] = 0;
            userStartingIndices[numUsers] = numRatings;
            while ((tempString = tempBufReader.readLine()) != null) {
                // Each line has three values: user, item, rating.
                tempStrArray = tempString.split(",");
                compressedRatingMatrix[tempIndex][0] = Integer.parseInt(tempStrArray[0]);
                compressedRatingMatrix[tempIndex][1] = Integer.parseInt(tempStrArray[1]);
                compressedRatingMatrix[tempIndex][2] = Integer.parseInt(tempStrArray[2]);

                userDegrees[compressedRatingMatrix[tempIndex][0]]++;
                itemDegrees[compressedRatingMatrix[tempIndex][1]]++;

                if (tempIndex > 0) {
                    // Starting to read the data of a new user.
                    if (compressedRatingMatrix[tempIndex][0] != compressedRatingMatrix[tempIndex - 1][0]) {
                        userStartingIndices[compressedRatingMatrix[tempIndex][0]] = tempIndex;
                    }
                }
                tempIndex++;
            }
        }

        // Step 3. Compute the user and item averages.
        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        for (int i = 0; i < numRatings; i++) {
            tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
            tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
        }

        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        }
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        }
    }

    /**
     *************************
     * Set the radius (delta). Non-positive values fall back to 0.1.
     *
     * @param paraRadius
     *            The given radius.
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        }
    }

    /**
     *************************
     * Leave-one-out prediction (user-based). For each rating (u, t, r), the
     * neighbors are the OTHER users who rated item t and whose average rating
     * is within radius of u's average (recomputed without r); the prediction
     * is the mean of the neighbors' ratings on t.
     *
     * @see #predictions
     *************************
     */
    public void leaveOneOutPrediction() {
        double tempUserAverageRating;
        int tempUser, tempItem, tempRating;
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        // BUGFIX: the original scanned only the current USER's own ratings,
        // so every candidate neighbor was the user itself and every
        // prediction fell back to DEFAULT_RATING. For user-based
        // recommendation we need the other users who rated the same item,
        // so first group the rating indices by item (O(numRatings)).
        int[][] tempItemRatingIndices = new int[numItems][];
        int[] tempFillCounts = new int[numItems];
        for (int i = 0; i < numItems; i++) {
            tempItemRatingIndices[i] = new int[itemDegrees[i]];
        }
        for (int i = 0; i < numRatings; i++) {
            int tempWhichItem = compressedRatingMatrix[i][1];
            tempItemRatingIndices[tempWhichItem][tempFillCounts[tempWhichItem]++] = i;
        }

        numNonNeighbors = 0;
        for (int i = 0; i < numRatings; i++) {
            tempUser = compressedRatingMatrix[i][0];
            tempItem = compressedRatingMatrix[i][1];
            tempRating = compressedRatingMatrix[i][2];

            // Step 1. Recompute the current user's average rating with the
            // left-out rating removed.
            tempUserAverageRating = (userAverageRatings[tempUser] * userDegrees[tempUser] - tempRating)
                    / (userDegrees[tempUser] - 1);

            // Step 2. Scan the other users who rated this item; those whose
            // average is within radius are neighbors.
            int tempNeighbors = 0;
            double tempTotal = 0;
            for (int j : tempItemRatingIndices[tempItem]) {
                int tempComparedUser = compressedRatingMatrix[j][0];
                if (tempComparedUser == tempUser) {
                    continue; // Ignore the left-out rating itself.
                }

                if (Math.abs(tempUserAverageRating - userAverageRatings[tempComparedUser]) < radius) {
                    tempTotal += compressedRatingMatrix[j][2];
                    tempNeighbors++;
                }
            }

            // Step 3. Predict as the average rating of the neighbors.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            }
        }
    }

    /**
     *************************
     * Compute the MAE based on the deviation of each leave-one-out
     * prediction.
     *
     * @return The mean absolute error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        }
        return tempTotalError / predictions.length;
    }

    /**
     *************************
     * Compute the RMSE based on the deviation of each leave-one-out
     * prediction. (The method name keeps the original spelling RSME for
     * caller compatibility.)
     *
     * @return The root mean squared error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += (predictions[i] - compressedRatingMatrix[i][2])
                    * (predictions[i] - compressedRatingMatrix[i][2]);
        }
        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }

    /**
     *************************
     * The entrance of the program.
     *
     * @param args
     *            Not used now.
     *************************
     */
    public static void main(String[] args) {
        try {
            // BUGFIX: the original passed 10000 users and 1000000 ratings for
            // a file that holds 943 users, 1682 items and 100000 ratings,
            // leaving most of the rating matrix zero-filled and producing
            // meaningless error measures.
            MBR tempRecommender = new MBR("D:/data/movielens943u1682m.txt", 943, 1682, 100000);

            for (double tempRadius = 0.2; tempRadius < 0.6; tempRadius += 0.1) {
                tempRecommender.setRadius(tempRadius);
                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            }
        } catch (Exception ee) {
            System.out.println(ee);
        }
    }
}
第 56 天: kMeans 聚类
K-means 的算法步骤为:
1.选择初始化的 k 个样本作为初始聚类中心 a=a1,a2,…ak;
2.针对数据集中每个样本xi计算它到 k 个聚类中心的距离并将其分到距离最小的聚类中心所对应的类中;
3.针对每个类别aj ,重新计算它的聚类中心
(即属于该类的所有样本的质心);
4.重复上面 2 3 两步操作,直到达到某个中止条件(迭代次数、最小误差变化等)。
聚类结果:
New loop ...
Now the new centers are: [[6.017142857142856, 2.7971428571428567, 4.545714285714286, 1.5214285714285716], [6.964285714285715, 3.089285714285714, 5.932142857142857, 2.107142857142857], [5.005769230769231, 3.3807692307692316, 1.5288461538461537, 0.2749999999999999]]
New loop ...
Now the new centers are: [[6.022666666666666, 2.804, 4.544, 1.5333333333333332], [6.980000000000001, 3.0759999999999996, 5.991999999999998, 2.1039999999999996], [5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999]]
New loop ...
Now the new centers are: [[6.022666666666666, 2.804, 4.544, 1.5333333333333332], [6.980000000000001, 3.0759999999999996, 5.991999999999998, 2.1039999999999996], [5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999]]
The clusters are: [[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 110, 111, 112, 113, 114, 115, 116, 119, 121, 123, 126, 127, 133, 137, 138, 139, 141, 142, 145, 146, 147, 148, 149], [100, 102, 103, 104, 105, 107, 108, 109, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 140, 143, 144], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]
代码:
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.Instances;
public class KMeans {

    /** Code for the Manhattan distance measure. */
    public static final int MANHATTAN = 0;

    /** Code for the Euclidean distance measure (squared; see distance()). */
    public static final int EUCLIDEAN = 1;

    /** The distance measure in use; one of MANHATTAN / EUCLIDEAN. */
    public int distanceMeasure = EUCLIDEAN;

    /** A shared random number generator, used for data shuffling. */
    public static final Random random = new Random();

    /** The whole dataset. */
    Instances dataset;

    /** The number of clusters (k). */
    int numClusters = 2;

    /** The clusters; clusters[i] lists the instance indices of cluster i. */
    int[][] clusters;

    /**
     *******************************
     * The first constructor. Reads the dataset from an ARFF file; exits the
     * program if the file cannot be read.
     *
     * @param paraFilename
     *            The data filename.
     *******************************
     */
    public KMeans(String paraFilename) {
        dataset = null;
        // BUGFIX: try-with-resources closes the reader even when Instances()
        // throws; the original leaked the FileReader on failure.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            System.exit(0);
        }
    }

    /**
     *******************************
     * Set the number of clusters (k).
     *
     * @param paraNumClusters
     *            The new cluster count.
     *******************************
     */
    public void setNumClusters(int paraNumClusters) {
        numClusters = paraNumClusters;
    }

    /**
     *********************
     * Get random indices for data randomization. The identity permutation is
     * shuffled by paraLength random pair swaps.
     *
     * @param paraLength
     *            The length of the sequence.
     * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
     *********************
     */
    public static int[] getRandomIndices(int paraLength) {
        int[] resultIndices = new int[paraLength];

        // Step 1. Initialize with the identity permutation.
        for (int i = 0; i < paraLength; i++) {
            resultIndices[i] = i;
        }

        // Step 2. Randomly swap paraLength pairs.
        int tempFirst, tempSecond, tempValue;
        for (int i = 0; i < paraLength; i++) {
            tempFirst = random.nextInt(paraLength);
            tempSecond = random.nextInt(paraLength);

            tempValue = resultIndices[tempFirst];
            resultIndices[tempFirst] = resultIndices[tempSecond];
            resultIndices[tempSecond] = tempValue;
        }

        return resultIndices;
    }

    /**
     *********************
     * The distance between an instance and a point in attribute space,
     * computed over all attributes except the last one (the class).
     *
     * @param paraI
     *            The index of the instance.
     * @param paraArray
     *            The array representing a point in the space.
     * @return The distance.
     *********************
     */
    public double distance(int paraI, double[] paraArray) {
        // BUGFIX: the accumulator was declared int, truncating the fractional
        // part of every accumulated difference; use a double.
        double resultDistance = 0;
        double tempDifference;
        switch (distanceMeasure) {
        case MANHATTAN:
            for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                if (tempDifference < 0) {
                    resultDistance -= tempDifference;
                } else {
                    resultDistance += tempDifference;
                }
            }
            break;
        case EUCLIDEAN:
            // Squared Euclidean distance; the square root is omitted because
            // it does not change the nearest-center ranking.
            for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                resultDistance += tempDifference * tempDifference;
            }
            break;
        default:
            System.out.println("Unsupported distance measure: " + distanceMeasure);
        }

        return resultDistance;
    }

    /**
     *******************************
     * Cluster the dataset with Lloyd-style iterations: assign each instance
     * to its nearest center, recompute centers as cluster means, and stop
     * when the assignment no longer changes. The result is stored in
     * clusters.
     *******************************
     */
    public void clustering() {
        int[] tempOldClusterArray = new int[dataset.numInstances()];
        // Force at least one iteration: -1 can never equal a real cluster id.
        tempOldClusterArray[0] = -1;
        int[] tempClusterArray = new int[dataset.numInstances()];
        double[][] tempCenters = new double[numClusters][dataset.numAttributes() - 1];

        // Initialize centers with numClusters randomly chosen instances.
        int[] tempRandomOrders = getRandomIndices(dataset.numInstances());
        for (int i = 0; i < numClusters; i++) {
            for (int j = 0; j < tempCenters[0].length; j++) {
                tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
            }
        }

        int[] tempClusterLengths = null;
        while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
            System.out.println("New loop ...");
            tempOldClusterArray = tempClusterArray;
            tempClusterArray = new int[dataset.numInstances()];

            // Minimization step: assign each instance to its nearest center.
            int tempNearestCenter;
            double tempNearestDistance;
            double tempDistance;
            for (int i = 0; i < dataset.numInstances(); i++) {
                tempNearestCenter = -1;
                tempNearestDistance = Double.MAX_VALUE;
                for (int j = 0; j < numClusters; j++) {
                    tempDistance = distance(i, tempCenters[j]);
                    if (tempNearestDistance > tempDistance) {
                        tempNearestDistance = tempDistance;
                        tempNearestCenter = j;
                    }
                }
                tempClusterArray[i] = tempNearestCenter;
            }

            // Mean step: recompute each center as the mean of its members.
            tempClusterLengths = new int[numClusters];
            double[][] tempNewCenters = new double[numClusters][dataset.numAttributes() - 1];
            for (int i = 0; i < dataset.numInstances(); i++) {
                for (int j = 0; j < tempNewCenters[0].length; j++) {
                    tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j);
                }
                tempClusterLengths[tempClusterArray[i]]++;
            }

            // NOTE(review): an empty cluster divides by zero here and turns
            // its center into NaN — assumes no cluster ever empties out.
            for (int i = 0; i < tempNewCenters.length; i++) {
                for (int j = 0; j < tempNewCenters[0].length; j++) {
                    tempNewCenters[i][j] /= tempClusterLengths[i];
                }
            }

            System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
            tempCenters = tempNewCenters;
        }

        // Collect the final assignment into the clusters matrix.
        clusters = new int[numClusters][];
        int[] tempCounters = new int[numClusters];
        for (int i = 0; i < numClusters; i++) {
            clusters[i] = new int[tempClusterLengths[i]];
        }
        for (int i = 0; i < tempClusterArray.length; i++) {
            clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
            tempCounters[tempClusterArray[i]]++;
        }

        System.out.println("The clusters are: " + Arrays.deepToString(clusters));
    }

    /**
     *******************************
     * Clustering test on the iris dataset with k = 3.
     *******************************
     */
    public static void testClustering() {
        KMeans tempKMeans = new KMeans("D:/data/iris.arff");
        tempKMeans.setNumClusters(3);
        tempKMeans.clustering();
    }

    /**
     *******************************
     * The entrance of the program.
     *
     * @param args
     *            Not used now.
     *******************************
     */
    public static void main(String args[]) {
        testClustering();
    }
}
第 57 天: kMeans 聚类 (续)
获得虚拟中心后, 换成与其最近的点作为实际中心, 再聚类.
今天主要是想控制下节奏. 毕竟 kMeans 也值得两天的工作量.
找中心点结果:
代码:
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.Instances;
public class KMeans {

	/**
	 * Manhattan distance measure code.
	 */
	public static final int MANHATTAN = 0;

	/**
	 * Euclidean distance measure code.
	 */
	public static final int EUCLIDEAN = 1;

	/**
	 * The distance measure in use. Defaults to Euclidean.
	 */
	public int distanceMeasure = EUCLIDEAN;

	/**
	 * A shared random number generator.
	 */
	public static final Random random = new Random();

	/**
	 * The whole dataset.
	 */
	Instances dataset;

	/**
	 * The number of clusters.
	 */
	int numClusters = 2;

	/**
	 * The clusters. clusters[k] holds the indices of the instances assigned to
	 * cluster k.
	 */
	int[][] clusters;

	/**
	 *******************************
	 * The first constructor. Reads the dataset from an ARFF file.
	 *
	 * @param paraFilename
	 *            The data filename.
	 *******************************
	 */
	public KMeans(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try
	}// Of the constructor

	/**
	 *********************
	 * Set the number of clusters.
	 *
	 * @param paraNumClusters
	 *            The number of clusters.
	 *********************
	 */
	public void setNumClusters(int paraNumClusters) {
		numClusters = paraNumClusters;
	}// Of setNumClusters

	/**
	 *********************
	 * Get a random permutation of the indices 0 .. paraLength - 1, used for
	 * data randomization.
	 *
	 * @param paraLength
	 *            The length of the sequence.
	 * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
	 *********************
	 */
	public static int[] getRandomIndices(int paraLength) {
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize with the identity permutation.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Shuffle by randomly swapping pairs.
		int tempFirst, tempSecond, tempValue;
		for (int i = 0; i < paraLength; i++) {
			tempFirst = random.nextInt(paraLength);
			tempSecond = random.nextInt(paraLength);

			tempValue = resultIndices[tempFirst];
			resultIndices[tempFirst] = resultIndices[tempSecond];
			resultIndices[tempSecond] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices

	/**
	 *********************
	 * The distance between an instance and a point in attribute space. The
	 * class attribute (the last one) is excluded. For EUCLIDEAN the SQUARED
	 * distance is returned; this preserves the nearest-center ordering and
	 * avoids a sqrt.
	 *
	 * BUG FIX: the accumulator was declared int, so the compound assignment
	 * silently narrowed every (double) difference to an integer, truncating
	 * all fractional distance. It is now a double.
	 *
	 * @param paraI
	 *            The index of the instance.
	 * @param paraArray
	 *            The array representing a point in the space.
	 * @return The distance.
	 *********************
	 */
	public double distance(int paraI, double[] paraArray) {
		double resultDistance = 0;
		double tempDifference;
		switch (distanceMeasure) {
		case MANHATTAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
				resultDistance += Math.abs(tempDifference);
			} // Of for i
			break;
		case EUCLIDEAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
				resultDistance += tempDifference * tempDifference;
			} // Of for i
			break;
		default:
			System.out.println("Unsupported distance measure: " + distanceMeasure);
		}// Of switch

		return resultDistance;
	}// Of distance

	/**
	 *******************************
	 * Clustering. After each assignment step, the center of each cluster is
	 * replaced by the actual instance closest to the current center (the
	 * day-57 "actual center" variant), and the loop repeats until the
	 * assignment no longer changes.
	 *******************************
	 */
	public void clustering() {
		int[] tempOldClusterArray = new int[dataset.numInstances()];
		// Ensure the first comparison with the all-zero assignment fails.
		tempOldClusterArray[0] = -1;
		int[] tempClusterArray = new int[dataset.numInstances()];
		Arrays.fill(tempClusterArray, 0);
		double[][] tempCenters = new double[numClusters][dataset.numAttributes() - 1];

		// Initialize the centers with randomly chosen, distinct instances.
		int[] tempRandomOrders = getRandomIndices(dataset.numInstances());
		for (int i = 0; i < numClusters; i++) {
			for (int j = 0; j < tempCenters[0].length; j++) {
				tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
			} // Of for j
		} // Of for i

		while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
			System.out.println("New loop ...");
			tempOldClusterArray = tempClusterArray;
			tempClusterArray = new int[dataset.numInstances()];

			// Minimization step: assign each instance to its nearest center.
			int tempNearestCenter;
			double tempNearestDistance;
			double tempDistance;
			for (int i = 0; i < dataset.numInstances(); i++) {
				tempNearestCenter = -1;
				tempNearestDistance = Double.MAX_VALUE;
				for (int j = 0; j < numClusters; j++) {
					tempDistance = distance(i, tempCenters[j]);
					if (tempNearestDistance > tempDistance) {
						tempNearestDistance = tempDistance;
						tempNearestCenter = j;
					} // Of if
				} // Of for j
				tempClusterArray[i] = tempNearestCenter;
			} // Of for i

			// For each cluster, pick the actual instance closest to the
			// current center as the new center. Clusters that received no
			// instance keep an all-zero center row (as in the original).
			double[] tempNearestDistanceArray = new double[numClusters];
			double[][] tempNewCenters = new double[numClusters][dataset.numAttributes() - 1];
			Arrays.fill(tempNearestDistanceArray, Double.MAX_VALUE);
			for (int i = 0; i < dataset.numInstances(); i++) {
				double tempCurrentDistance = distance(i, tempCenters[tempClusterArray[i]]);
				if (tempNearestDistanceArray[tempClusterArray[i]] > tempCurrentDistance) {
					tempNearestDistanceArray[tempClusterArray[i]] = tempCurrentDistance;
					for (int j = 0; j < dataset.numAttributes() - 1; j++) {
						tempNewCenters[tempClusterArray[i]][j] = dataset.instance(i).value(j);
					} // Of for j
				} // Of if
			} // Of for i

			System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
			tempCenters = tempNewCenters;
		} // Of while

		// BUG FIX: the cluster sizes were never computed (the counting code
		// was commented out, leaving tempClusterLengths null), causing a
		// NullPointerException below. Compute them from the final assignment.
		int[] tempClusterLengths = new int[numClusters];
		for (int i = 0; i < tempClusterArray.length; i++) {
			tempClusterLengths[tempClusterArray[i]]++;
		} // Of for i

		// Form the clusters from the final assignment.
		clusters = new int[numClusters][];
		int[] tempCounters = new int[numClusters];
		for (int i = 0; i < numClusters; i++) {
			clusters[i] = new int[tempClusterLengths[i]];
		} // Of for i
		for (int i = 0; i < tempClusterArray.length; i++) {
			clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
			tempCounters[tempClusterArray[i]]++;
		} // Of for i

		System.out.println("The clusters are: " + Arrays.deepToString(clusters));
	}// Of clustering

	/**
	 *******************************
	 * Clustering test: 3 clusters on the iris dataset.
	 *******************************
	 */
	public static void testClustering() {
		KMeans tempKMeans = new KMeans("D:/data/iris.arff");
		tempKMeans.setNumClusters(3);
		tempKMeans.clustering();
	}// Of testClustering

	/**
	 *******************************
	 * The entry point.
	 *
	 * @param args
	 *            Not used.
	 *******************************
	 */
	public static void main(String[] args) {
		testClustering();
	}// Of main
}// Of class KMeans
第 58 天: 符号型数据的 NB 算法
NB(朴素贝叶斯)算法是一种分类算法。
分类任务是机器学习中最常见的任务。给定一个对象X,将X划分到预定好的某一类别y中。其中Y代表所有类别的一个有限集合,如新闻类别:{军事新闻,科技新闻,生活新闻}。y代表分类集合中的某一类别,如军事新闻。X代表待分类的对象,x代表该对象的特征,如X代表一篇待分类的文章则x就代表该文章中的单词。目标:输入X,输出y
设每个数据样本用一个n维特征向量来描述n个属性的值,即:X={x1,x2,…,xn},
假定有m个类,分别用C1, C2,…,Cm表示。
给定一个未知的数据样本X(即没有类标号),若朴素贝叶斯分类法将未知的样本X分配给类Ci,则一定是:
P(Ci|X)>P(Cj|X) 1≤j≤m,j≠i
根据贝叶斯定理
由于P(X)对于所有类为常数,最大化后验概率P(Ci|X)可转化为最大化P(X|Ci)P(Ci),即类条件概率(似然)与先验概率的乘积。
如果训练数据集有许多属性和元组,计算P(X|Ci)的开销可能非常大,为此,通常假设各属性的取值互相独立,这样
条件概率P(x1|Ci),P(x2|Ci),…,P(xn|Ci)可以从训练数据集求得。
根据此方法,对一个未知类别的样本X,可以先分别计算出X属于每一个类别Ci的概率P(X|Ci)P(Ci),然后选择其中概率最大的类别作为其类别。
朴素贝叶斯算法成立的前提是各属性之间互相独立。当数据集满足这种独立性假设时,分类的准确度较高,否则可能较低。另外,该算法没有分类规则输出。
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.*;
public class NaiveBayes {
	/**
	 *************************
	 * An inner class to store the parameters of a Gaussian distribution.
	 * Made static: it never uses the enclosing instance.
	 *************************
	 */
	private static class GaussianParamters {
		// The mean.
		double mu;
		// The standard deviation.
		double sigma;

		public GaussianParamters(double paraMu, double paraSigma) {
			mu = paraMu;
			sigma = paraSigma;
		}// Of the constructor

		public String toString() {
			return "(" + mu + ", " + sigma + ")";
		}// Of toString
	}// Of GaussianParamters

	/**
	 * The data.
	 */
	Instances dataset;

	/**
	 * The number of classes. For binary classification it is 2.
	 */
	int numClasses;

	/**
	 * The number of instances.
	 */
	int numInstances;

	/**
	 * The number of conditional attributes.
	 */
	int numConditions;

	/**
	 * The prediction, including queried and predicted labels.
	 */
	int[] predicts;

	/**
	 * Class distribution.
	 */
	double[] classDistribution;

	/**
	 * Class distribution with Laplacian smooth.
	 */
	double[] classDistributionLaplacian;

	/**
	 * Raw value COUNTS per (class, attribute, value). Despite the name these
	 * are counts, not probabilities; the smoothed probabilities live in
	 * conditionalProbabilitiesLaplacian.
	 */
	double[][][] conditionalProbabilities;

	/**
	 * The conditional probabilities with Laplacian smooth.
	 */
	double[][][] conditionalProbabilitiesLaplacian;

	/**
	 * The Gaussian parameters, one per (class, attribute) pair.
	 */
	GaussianParamters[][] gaussianParameters;

	/**
	 * Data type: NOMINAL or NUMERICAL.
	 */
	int dataType;

	/**
	 * Nominal.
	 */
	public static final int NOMINAL = 0;

	/**
	 * Numerical.
	 */
	public static final int NUMERICAL = 1;

	/**
	 ********************
	 * The constructor. Reads the data; the last attribute is the class.
	 *
	 * @param paraFilename
	 *            The given file.
	 ********************
	 */
	public NaiveBayes(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		dataset.setClassIndex(dataset.numAttributes() - 1);
		numConditions = dataset.numAttributes() - 1;
		numInstances = dataset.numInstances();
		numClasses = dataset.attribute(numConditions).numValues();
	}// Of the constructor

	/**
	 ********************
	 * Set the data type (NOMINAL or NUMERICAL).
	 ********************
	 */
	public void setDataType(int paraDataType) {
		dataType = paraDataType;
	}// Of setDataType

	/**
	 ********************
	 * Calculate the class distribution, both raw and with Laplacian smooth.
	 ********************
	 */
	public void calculateClassDistribution() {
		classDistribution = new double[numClasses];
		classDistributionLaplacian = new double[numClasses];

		double[] tempCounts = new double[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClassValue = (int) dataset.instance(i).classValue();
			tempCounts[tempClassValue]++;
		} // Of for i

		for (int i = 0; i < numClasses; i++) {
			classDistribution[i] = tempCounts[i] / numInstances;
			// Laplacian smooth: (count + 1) / (n + numClasses).
			classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
		} // Of for i

		System.out.println("Class distribution: " + Arrays.toString(classDistribution));
		System.out.println(
				"Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
	}// Of calculateClassDistribution

	/**
	 ********************
	 * Calculate the conditional counts and probabilities with Laplacian
	 * smooth. ONLY scans the dataset once.
	 ********************
	 */
	public void calculateConditionalProbabilities() {
		conditionalProbabilities = new double[numClasses][numConditions][];
		conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];

		// Allocate space: the third dimension depends on the attribute.
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = (int) dataset.attribute(j).numValues();
				conditionalProbabilities[i][j] = new double[tempNumValues];
				conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
			} // Of for j
		} // Of for i

		// Count the (class, attribute, value) co-occurrences.
		int[] tempClassCounts = new int[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClass = (int) dataset.instance(i).classValue();
			tempClassCounts[tempClass]++;
			for (int j = 0; j < numConditions; j++) {
				int tempValue = (int) dataset.instance(i).value(j);
				conditionalProbabilities[tempClass][j][tempValue]++;
			} // Of for j
		} // Of for i

		// Now for the real probability with Laplacian smooth.
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = (int) dataset.attribute(j).numValues();
				for (int k = 0; k < tempNumValues; k++) {
					// BUG FIX: the standard Laplacian denominator for a
					// conditional probability adds the number of VALUES of
					// attribute j, not the number of classes.
					conditionalProbabilitiesLaplacian[i][j][k] = (conditionalProbabilities[i][j][k]
							+ 1) / (tempClassCounts[i] + tempNumValues);
				} // Of for k
			} // Of for j
		} // Of for i

		System.out.println(Arrays.deepToString(conditionalProbabilities));
	}// Of calculateConditionalProbabilities

	/**
	 ********************
	 * Calculate the Gaussian parameters (mean, standard deviation) of each
	 * attribute within each class.
	 ********************
	 */
	public void calculateGausssianParameters() {
		gaussianParameters = new GaussianParamters[numClasses][numConditions];

		double[] tempValuesArray = new double[numInstances];
		int tempNumValues = 0;
		double tempSum = 0;

		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				tempSum = 0;

				// Collect the attribute values of instances in class i.
				tempNumValues = 0;
				for (int k = 0; k < numInstances; k++) {
					if ((int) dataset.instance(k).classValue() != i) {
						continue;
					} // Of if

					tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
					tempSum += tempValuesArray[tempNumValues];
					tempNumValues++;
				} // Of for k

				// Mean and (population) standard deviation.
				double tempMu = tempSum / tempNumValues;

				double tempSigma = 0;
				for (int k = 0; k < tempNumValues; k++) {
					tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
				} // Of for k
				tempSigma /= tempNumValues;
				tempSigma = Math.sqrt(tempSigma);

				gaussianParameters[i][j] = new GaussianParamters(tempMu, tempSigma);
			} // Of for j
		} // Of for i

		System.out.println(Arrays.deepToString(gaussianParameters));
	}// Of calculateGausssianParameters

	/**
	 ********************
	 * Classify all instances; the results are stored in predicts[].
	 ********************
	 */
	public void classify() {
		predicts = new int[numInstances];
		for (int i = 0; i < numInstances; i++) {
			predicts[i] = classify(dataset.instance(i));
		} // Of for i
	}// Of classify

	/**
	 ********************
	 * Classify an instance, dispatching on the data type.
	 *
	 * @return The predicted class index, or -1 for an unknown data type.
	 ********************
	 */
	public int classify(Instance paraInstance) {
		if (dataType == NOMINAL) {
			return classifyNominal(paraInstance);
		} else if (dataType == NUMERICAL) {
			return classifyNumerical(paraInstance);
		} // Of if

		return -1;
	}// Of classify

	/**
	 ********************
	 * Classify an instance with nominal data.
	 ********************
	 */
	public int classifyNominal(Instance paraInstance) {
		// Find the class with the biggest log pseudo-probability.
		// BUG FIX: -10000 is not a safe sentinel; use negative infinity.
		double tempBiggest = Double.NEGATIVE_INFINITY;
		int resultBestIndex = 0;
		for (int i = 0; i < numClasses; i++) {
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				int tempAttributeValue = (int) paraInstance.value(j);

				// BUG FIX: use the Laplacian-smoothed probabilities, as the
				// original comment intended; the raw array holds unnormalized
				// counts and may contain zeros (log(0) = -infinity).
				tempPseudoProbability += Math
						.log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
			} // Of for j

			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			} // Of if
		} // Of for i

		return resultBestIndex;
	}// Of classifyNominal

	/**
	 ********************
	 * Classify an instance with numerical data under the Gaussian assumption.
	 * Added here because classify(Instance) dispatches to it; without it the
	 * class does not compile.
	 ********************
	 */
	public int classifyNumerical(Instance paraInstance) {
		// Find the class with the biggest log pseudo-probability.
		double tempBiggest = Double.NEGATIVE_INFINITY;
		int resultBestIndex = 0;
		for (int i = 0; i < numClasses; i++) {
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				double tempAttributeValue = paraInstance.value(j);
				double tempSigma = gaussianParameters[i][j].sigma;
				double tempMu = gaussianParameters[i][j].mu;

				// Log Gaussian density with the constant term dropped.
				tempPseudoProbability += -Math.log(tempSigma) - (tempAttributeValue - tempMu)
						* (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
			} // Of for j

			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			} // Of if
		} // Of for i

		return resultBestIndex;
	}// Of classifyNumerical

	/**
	 ********************
	 * Compute accuracy over the whole dataset (training accuracy).
	 ********************
	 */
	public double computeAccuracy() {
		double tempCorrect = 0;
		for (int i = 0; i < numInstances; i++) {
			if (predicts[i] == (int) dataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double resultAccuracy = tempCorrect / numInstances;
		return resultAccuracy;
	}// Of computeAccuracy

	/**
	 *************************
	 * Test nominal data.
	 * NOTE(review): iris.arff has numerical attributes; a nominal dataset
	 * (e.g. mushroom/weather) seems intended here — confirm the data file.
	 *************************
	 */
	public static void testNominal() {
		System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
		String tempFilename = "D:/data/iris.arff";

		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NOMINAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateConditionalProbabilities();
		tempLearner.classify();

		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}// Of testNominal

	/**
	 *************************
	 * Test numerical data with the Gaussian assumption.
	 *************************
	 */
	public static void testNumerical() {
		System.out.println(
				"Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
		String tempFilename = "D:/data/iris.arff";

		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NUMERICAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateGausssianParameters();
		tempLearner.classify();

		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}// Of testNumerical

	/**
	 *************************
	 * The entry point.
	 *************************
	 */
	public static void main(String[] args) {
		testNominal();
		//testNumerical();
	}// Of main
}// Of class NaiveBayes
第 59 天: 数值型数据的 NB 算法
今天把数值型数据处理的代码加上去.
假设所有属性的属性值都服从高斯分布. 也可以做其它假设.
将概率密度当成概率值直接使用 Bayes 公式.
可以看到, 数值型数据的处理并不会比符号型的复杂.
代码:
/**
 ********************
 * Classify an instance with numerical data under the Gaussian assumption.
 * The log pseudo-probability is log P(c) plus, per attribute, the log
 * Gaussian density with the constant term dropped.
 *
 * @param paraInstance
 *            The instance to classify.
 * @return The predicted class index.
 ********************
 */
public int classifyNumerical(Instance paraInstance) {
	// Find the class with the biggest log pseudo-probability.
	// BUG FIX: -10000 is not a safe sentinel — with many attributes or tiny
	// sigmas every class can score below it, leaving class 0 returned by
	// default. Use negative infinity instead.
	double tempBiggest = Double.NEGATIVE_INFINITY;
	int resultBestIndex = 0;
	for (int i = 0; i < numClasses; i++) {
		double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
		for (int j = 0; j < numConditions; j++) {
			double tempAttributeValue = paraInstance.value(j);
			double tempSigma = gaussianParameters[i][j].sigma;
			double tempMu = gaussianParameters[i][j].mu;

			tempPseudoProbability += -Math.log(tempSigma) - (tempAttributeValue - tempMu)
					* (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
		} // Of for j

		if (tempBiggest < tempPseudoProbability) {
			tempBiggest = tempPseudoProbability;
			resultBestIndex = i;
		} // Of if
	} // Of for i

	return resultBestIndex;
}// Of classifyNumerical
/**
 *************************
 * Test numerical data with the Gaussian assumption: load the iris dataset,
 * fit per-class Gaussian parameters, classify, and print the accuracy.
 *************************
 */
public static void testNumerical() {
System.out.println(
"Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
String tempFilename = "D:/data/iris.arff";
NaiveBayes tempLearner = new NaiveBayes(tempFilename);
tempLearner.setDataType(NUMERICAL);
tempLearner.calculateClassDistribution();
// Gaussian parameters replace the nominal conditional probabilities here.
tempLearner.calculateGausssianParameters();
tempLearner.classify();
System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
}// Of testNumerical
/**
 * Program entry point: run the nominal test, then the numerical test.
 */
public static void main(String[] args) {
    testNominal();
    testNumerical();
}// Of main
第 60 天: 小结
1.机器学习主要分为监督学习、无监督学习、半监督学习、强化学习。
2.监督学习:
In:有标签
Out:有反馈
目的:预测结果
案例:学认字
算法:分类(类别),回归(数字)
3.无监督学习:
In:无标签
Out:无反馈
目的:发现潜在结构
案例:自动聚类
算法:聚类,降维
4.半监督学习:
已知:训练样本Data和待分类的类别
未知:训练样本有无标签均可
应用:训练数据量过少,
监督学习效果不能满足需求,因此用来增强效果。
5.强化学习:
In:决策流程及激励系统
Out:一系列行动
目的:长期利益最大化,回报函数(只会提示你是否在朝着目标方向前进的延迟反映)
案例:学下棋
算法:马尔科夫决策,动态规划
6.KNN属于分类算法, K值确定标准:
K值过小:
k值小,特征空间被划分为更多子空间(模型的项越多),整体模型变复杂,容易发生过拟合,k值越小,选择的范围就比较小,训练的时候命中率较高,近似误差小,而用test的时候就容易出错,估计误差大,容易过拟合。
K值=N:无论输入实例是什么,都将简单地预测它属于训练实例中最多的类。
7.所谓聚类算法是指将一堆没有标签的数据自动划分成几类的方法,属于无监督学习方法,这个方法要保证同一类的数据有相似的特征。
8.Kmeans是聚类算法,K值选取: 在实际应用中,由于Kmeans一般作为数据预处理,或者用于辅助分类贴标签,所以k一般不会设置很大。可以通过枚举,令k从2到一个固定值如10,在每个k值上重复运行数次kmeans(避免局部最优解),并计算当前k的平均轮廓系数,最后选取轮廓系数最大的值对应的k作为最终的集群数目。
9.朴素贝叶斯是基于概率论的算法,属于分类算法,是监督学习,由于用先验数据去预测分类,因此存在误差。
10.通过十天的入门学习,了解了机器学习的基础算法,但是代码还不熟练,数学原理的理解还不是很透彻,之后会继续加强练习。