目录
第 51 天: kNN 分类器
今天将代码抄写并运行
下载安装了weka.jar包,加载arff文件
代码:
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.*;
public class KnnClassification {
/**
* Manhattan distance.
*/
public static final int MANHATTAN = 0;
/**
* Euclidean distance.
*/
public static final int EUCLIDEAN = 1;
/**
* The distance measure.
*/
public int distanceMeasure = EUCLIDEAN;
/**
* A random instance;
*/
public static final Random random = new Random();
/**
* The number of neighbors.
*/
int numNeighbors = 7;
/**
* The whole dataset.
*/
Instances dataset;
/**
* The training set. Represented by the indices of the data.
*/
int[] trainingSet;
/**
* The testing set. Represented by the indices of the data.
*/
int[] testingSet;
/**
* The predictions.
*/
int[] predictions;
/**
*********************
* The first constructor.
*
* @param paraFilename
* The arff filename.
*********************
*/
public KnnClassification(String paraFilename) {
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
// The last attribute is the decision class.
dataset.setClassIndex(dataset.numAttributes() - 1);
fileReader.close();
} catch (Exception ee) {
System.out.println("Error occurred while trying to read \'" + paraFilename
+ "\' in KnnClassification constructor.\r\n" + ee);
System.exit(0);
}
}
/**
*********************
* Get a random indices for data randomization.
*
* @param paraLength
* The length of the sequence.
* @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
*********************
*/
public static int[] getRandomIndices(int paraLength) {
int[] resultIndices = new int[paraLength];
// Step 1. Initialize.
for (int i = 0; i < paraLength; i++) {
resultIndices[i] = i;
}
// Step 2. Randomly swap.
int tempFirst, tempSecond, tempValue;
for (int i = 0; i < paraLength; i++) {
// Generate two random indices.
tempFirst = random.nextInt(paraLength);
tempSecond = random.nextInt(paraLength);
// Swap.
tempValue = resultIndices[tempFirst];
resultIndices[tempFirst] = resultIndices[tempSecond];
resultIndices[tempSecond] = tempValue;
}
return resultIndices;
}
/**
*********************
* Split the data into training and testing parts.
*
* @param paraTrainingFraction
* The fraction of the training set.
*********************
*/
public void splitTrainingTesting(double paraTrainingFraction) {
int tempSize = dataset.numInstances();
int[] tempIndices = getRandomIndices(tempSize);
int tempTrainingSize = (int) (tempSize * paraTrainingFraction);
trainingSet = new int[tempTrainingSize];
testingSet = new int[tempSize - tempTrainingSize];
for (int i = 0; i < tempTrainingSize; i++) {
trainingSet[i] = tempIndices[i];
}
for (int i = 0; i < tempSize - tempTrainingSize; i++) {
testingSet[i] = tempIndices[tempTrainingSize + i];
}
}/
/**
*********************
* Predict for the whole testing set. The results are stored in predictions.
* #see predictions.
*********************
*/
public void predict() {
predictions = new int[testingSet.length];
for (int i = 0; i < predictions.length; i++) {
predictions[i] = predict(testingSet[i]);
}
}
/**
*********************
* Predict for given instance.
*
* @return The prediction.
*********************
*/
public int predict(int paraIndex) {
int[] tempNeighbors = computeNearests(paraIndex);
int resultPrediction = simpleVoting(tempNeighbors);
return resultPrediction;
}
/**
*********************
* The distance between two instances.
*
* @param paraI
* The index of the first instance.
* @param paraJ
* The index of the second instance.
* @return The distance.
*********************
*/
public double distance(int paraI, int paraJ) {
int resultDistance = 0;
double tempDifference;
switch (distanceMeasure) {
case MANHATTAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
if (tempDifference < 0) {
resultDistance -= tempDifference;
} else {
resultDistance += tempDifference;
}
}
break;
case EUCLIDEAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
resultDistance += tempDifference * tempDifference;
}
break;
default:
System.out.println("Unsupported distance measure: " + distanceMeasure);
}
return resultDistance;
}
/**
*********************
* Get the accuracy of the classifier.
*
* @return The accuracy.
*********************
*/
public double getAccuracy() {
// A double divides an int gets another double.
double tempCorrect = 0;
for (int i = 0; i < predictions.length; i++) {
if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
tempCorrect++;
}
}
return tempCorrect / testingSet.length;
}
/**
************************************
* Compute the nearest k neighbors. Select one neighbor in each scan. In
* fact we can scan only once. You may implement it by yourself.
*
* @param paraK
* the k value for kNN.
* @param paraCurrent
* current instance. We are comparing it with all others.
* @return the indices of the nearest instances.
************************************
*/
public int[] computeNearests(int paraCurrent) {
int[] resultNearests = new int[numNeighbors];
boolean[] tempSelected = new boolean[trainingSet.length];
double tempDistance;
double tempMinimalDistance;
int tempMinimalIndex = 0;
// Select the nearest paraK indices.
for (int i = 0; i < numNeighbors; i++) {
tempMinimalDistance = Double.MAX_VALUE;
for (int j = 0; j < trainingSet.length; j++) {
if (tempSelected[j]) {
continue;
}
tempDistance = distance(paraCurrent, trainingSet[j]);
if (tempDistance < tempMinimalDistance) {
tempMinimalDistance = tempDistance;
tempMinimalIndex = j;
}
}
resultNearests[i] = trainingSet[tempMinimalIndex];
tempSelected[tempMinimalIndex] = true;
}
System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
return resultNearests;
}
/**
************************************
* Voting using the instances.
*
* @param paraNeighbors
* The indices of the neighbors.
* @return The predicted label.
************************************
*/
public int simpleVoting(int[] paraNeighbors) {
int[] tempVotes = new int[dataset.numClasses()];
for (int i = 0; i < paraNeighbors.length; i++) {
tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
}
int tempMaximalVotingIndex = 0;
int tempMaximalVoting = 0;
for (int i = 0; i < dataset.numClasses(); i++) {
if (tempVotes[i] > tempMaximalVoting) {
tempMaximalVoting = tempVotes[i];
tempMaximalVotingIndex = i;
}
}
return tempMaximalVotingIndex;
}
/**
*********************
* The entrance of the program.
*
* @param args
* Not used now.
*********************
*/
public static void main(String args[]) {
KnnClassification tempClassifier = new KnnClassification("D:/data/iris.arff");
tempClassifier.splitTrainingTesting(0.8);
tempClassifier.predict();
System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
}
}
求邻居和精确度结果:
第 52 天: kNN 分类器 (续)
重新实现 computeNearests, 仅需要扫描一遍训练集, 即可获得 k 个邻居. 提示: 将原代码与插入排序思想相结合.
增加 setDistanceMeasure() 方法,选择距离计算的方法.
增加 setNumNeighbors() 方法,设置邻居的数量.
代码:
/**
 ************************************
 * Compute the nearest k neighbors with a single scan of the training set,
 * maintaining an (index, distance) table sorted ascending by distance via
 * straight insertion sort.
 *
 * @param paraCurrent
 *            The current instance, compared with all training instances.
 * @return The indices of the numNeighbors nearest training instances.
 ************************************
 */
public int[] computeNearests(int paraCurrent) {
    int[] resultNearests = new int[numNeighbors];
    double tempDistance;

    // Row layout: [0] = position within trainingSet, [1] = distance.
    double[][] tempDistanceArray = new double[trainingSet.length][2];
    tempDistanceArray[0][0] = 0;
    tempDistanceArray[0][1] = distance(paraCurrent, trainingSet[0]);
    int j;
    for (int i = 1; i < trainingSet.length; i++) {
        tempDistance = distance(paraCurrent, trainingSet[i]);
        // Shift larger entries one slot to the right.
        for (j = i - 1; j >= 0; j--) {
            if (tempDistance < tempDistanceArray[j][1]) {
                // BUGFIX: copy the element VALUES. The original assigned
                // tempDistanceArray[j + 1] = tempDistanceArray[j], which
                // copies the row reference; the later write into row j + 1
                // then overwrote the shifted entry as well.
                tempDistanceArray[j + 1][0] = tempDistanceArray[j][0];
                tempDistanceArray[j + 1][1] = tempDistanceArray[j][1];
            } else {
                break;
            }
        }
        // Insert the new element at the hole left by the shifting.
        tempDistanceArray[j + 1][0] = i;
        tempDistanceArray[j + 1][1] = tempDistance;
    }

    // The first numNeighbors rows now hold the nearest training positions.
    for (int i = 0; i < numNeighbors; i++) {
        resultNearests[i] = trainingSet[(int) tempDistanceArray[i][0]];
    }

    System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
    return resultNearests;
}
/**
 *********************
 * Set the distance measure.
 *
 * @param paraType
 *            0 for Manhattan, 1 for Euclidean; other values are rejected
 *            with a message and leave the current measure unchanged.
 *********************
 */
public void setDistanceMeasure(int paraType) {
    switch (paraType) {
    case 0:
        distanceMeasure = MANHATTAN;
        break;
    case 1:
        distanceMeasure = EUCLIDEAN;
        break;
    default:
        System.out.println("Wrong Distance Measure.");
    }
}
/**
 *********************
 * Set the number of neighbors (k).
 *
 * @param paraNumNeighbors
 *            The new k. Rejected with a message (leaving k unchanged) when
 *            non-positive or larger than the dataset size.
 *********************
 */
public void setNumNeighbors(int paraNumNeighbors) {
    // Guard added: a non-positive k is meaningless and would make
    // computeNearests return an empty neighbor set.
    if (paraNumNeighbors <= 0) {
        System.out.println("The number of neighbors must be positive.");
        return;
    }
    // Necessary (not sufficient) bound: the training set is even smaller
    // than the whole dataset.
    if (paraNumNeighbors > dataset.numInstances()) {
        System.out.println("The number of neighbors is too big.");
        return;
    }

    numNeighbors = paraNumNeighbors;
}
/**
 *********************
 * The entrance of the program. Builds the classifier, configures the
 * distance measure and k, then reports the testing accuracy.
 *
 * @param args
 *            Not used now.
 *********************
 */
public static void main(String args[]) {
    KnnClassification tempClassifier = new KnnClassification("D:/data/iris.arff");
    // 0 selects the Manhattan distance.
    tempClassifier.setDistanceMeasure(0);
    tempClassifier.setNumNeighbors(5);
    // 80% training, 20% testing.
    tempClassifier.splitTrainingTesting(0.8);
    tempClassifier.predict();
    System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
}
结果:
第 53 天: kNN 分类器 (续)
增加 weightedVoting() 方法, 距离越短话语权越大. 支持两种以上的加权方式.
实现 leave-one-out 测试.
留一法交叉验证是 k 折交叉验证在 k 等于样本总数 n 时的特例:每次只取一个样本作为测试集,其余 n-1 个样本作为训练集;轮流让每个样本各当一次测试样本,共进行 n 次,最后对 n 次结果取平均。
训练集的精确度:
测试集的精确度:
代码:
/**
 ************************************
 * Distance-weighted voting: each neighbor votes for its own class, and a
 * closer neighbor carries a larger weight (see getWeightedNum).
 *
 * @param paraCurrent
 *            The instance being classified.
 * @param paraNeighbors
 *            The indices of the neighbors.
 * @return The predicted label (class with the maximal accumulated weight).
 ************************************
 */
public int weightedVoting(int paraCurrent, int[] paraNeighbors) {
    double[] tempVotes = new double[dataset.numClasses()];
    // Weight-curve parameters; a larger a and smaller b flatten the curve.
    int a = 2, b = 1;

    // Accumulate one weighted vote per neighbor.
    for (int i = 0; i < paraNeighbors.length; i++) {
        double tempDistance = distance(paraCurrent, paraNeighbors[i]);
        int tempLabel = (int) dataset.instance(paraNeighbors[i]).classValue();
        tempVotes[tempLabel] += getWeightedNum(a, b, tempDistance);
    }

    // Pick the class with the heaviest total vote.
    int tempMaximalVotingIndex = 0;
    double tempMaximalVoting = 0;
    for (int i = 0; i < tempVotes.length; i++) {
        if (tempVotes[i] > tempMaximalVoting) {
            tempMaximalVoting = tempVotes[i];
            tempMaximalVotingIndex = i;
        }
    }

    return tempMaximalVotingIndex;
}
/**
 ************************************
 * The voting weight for one neighbor: b / (distance + a), so the weight
 * decays as the distance grows and is bounded by b / a at distance zero.
 *
 * @param a
 *            Additive smoothing constant in the denominator.
 * @param b
 *            Numerator scale.
 * @param paraDistance
 *            The distance between the neighbor and the query instance.
 * @return The voting weight.
 ************************************
 */
public double getWeightedNum(int a, int b, double paraDistance) {
    return b / (a + paraDistance);
}
/**
 ************************************
 * Leave-one-out cross validation: each instance serves as the one-element
 * testing set exactly once, with all other instances as the training set.
 * The overall accuracy is printed. Note: trainingSet/testingSet fields are
 * overwritten by this method.
 ************************************
 */
public void leave_one_out() {
    int tempSize = dataset.numInstances();
    int[] tempIndices = getRandomIndices(tempSize);
    double tempCorrect = 0;

    for (int i = 0; i < tempSize; i++) {
        // Instance i is the single test case; everything else trains.
        testingSet = new int[] { tempIndices[i] };
        trainingSet = new int[tempSize - 1];
        int tempIndex = 0;
        for (int j = 0; j < tempSize; j++) {
            if (j != i) {
                trainingSet[tempIndex++] = tempIndices[j];
            }
        }

        this.predict();
        if (predictions[0] == dataset.instance(testingSet[0]).classValue()) {
            tempCorrect++;
        }
    }

    System.out.println("The accuracy is:" + tempCorrect / tempSize);
}
/**
 * The entrance of the program: split-based evaluation followed by
 * leave-one-out evaluation.
 *
 * @param args
 *            Not used now.
 */
public static void main(String[] args) {
    KnnClassification tempClassifier = new KnnClassification("D:\\data\\iris.arff");
    // 0 selects the Manhattan distance.
    tempClassifier.setDistanceMeasure(0);
    tempClassifier.setNumNeighbors(5);
    tempClassifier.splitTrainingTesting(0.8);
    tempClassifier.predict();
    System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
    // Leave-one-out evaluation over the whole dataset.
    System.out.println("\r\n-------leave_one_out-------");
    tempClassifier.leave_one_out();
}
第 54 天: 基于 M-distance 的推荐
所谓 M-distance, 就是根据平均分来计算两个用户 (或项目) 之间的距离.
采用 item-based recommendation, 则第 j 个项目关于第 i 个用户的邻居项目集合为
第 i 个用户对 j 个项目的评分预测为:
邻居不用 k 控制. 距离小于 radius 的都是邻居. 使用 M-distance 时, 这种方式效果更好.
使用 leave-one-out的测试方式, 很高效的算法才能使用这种方式.
结果:
代码:
package xjx;
import java.io.*;
public class MBR {

    /** Default rating for the 1-5 point scale, used when no neighbor exists. */
    public static final double DEFAULT_RATING = 3.0;

    /** The total number of users. */
    private int numUsers;

    /** The total number of items. */
    private int numItems;

    /** The total number of ratings (non-zero values). */
    private int numRatings;

    /** Leave-one-out predictions, aligned with compressedRatingMatrix. */
    private double[] predictions;

    /** Compressed rating matrix: user-item-rating triples, sorted by user. */
    private int[][] compressedRatingMatrix;

    /** The degree of each user (how many items the user has rated). */
    private int[] userDegrees;

    /** The average rating of each user. */
    private double[] userAverageRatings;

    /** The degree of each item (how many users have rated it). */
    private int[] itemDegrees;

    /** The average rating of each item. */
    private double[] itemAverageRatings;

    /**
     * Index of each user's first rating. The first user starts from 0; if the
     * first user has x ratings, the second user starts from x.
     */
    private int[] userStartingIndices;

    /** Number of ratings for which no neighbor was found. */
    private int numNonNeighbors;

    /** The radius (delta) for determining the neighborhood. */
    private double radius;

    /**
     *************************
     * Construct the rating matrix.
     *
     * @param paraFilename
     *            the rating filename (CSV lines: user,item,rating).
     * @param paraNumUsers
     *            number of users.
     * @param paraNumItems
     *            number of items.
     * @param paraNumRatings
     *            number of ratings.
     * @throws Exception
     *             if the file cannot be read or parsed.
     *************************
     */
    public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        // Step 1. Initialize these arrays.
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;
        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];
        predictions = new double[numRatings];

        System.out.println("Reading " + paraFilename);

        // Step 2. Read the data file.
        File tempFile = new File(paraFilename);
        if (!tempFile.exists()) {
            System.out.println("File " + paraFilename + " does not exists.");
            System.exit(0);
        } // Of if

        // BUGFIX: try-with-resources guarantees the reader is closed even if
        // parsing throws; the original leaked it on failure.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            String[] tempStrArray;
            int tempIndex = 0;
            userStartingIndices[0] = 0;
            userStartingIndices[numUsers] = numRatings;
            while ((tempString = tempBufReader.readLine()) != null) {
                // Each line has three values: user, item, rating.
                tempStrArray = tempString.split(",");
                compressedRatingMatrix[tempIndex][0] = Integer.parseInt(tempStrArray[0]);
                compressedRatingMatrix[tempIndex][1] = Integer.parseInt(tempStrArray[1]);
                compressedRatingMatrix[tempIndex][2] = Integer.parseInt(tempStrArray[2]);

                userDegrees[compressedRatingMatrix[tempIndex][0]]++;
                itemDegrees[compressedRatingMatrix[tempIndex][1]]++;

                if (tempIndex > 0) {
                    // Starting to read the data of a new user.
                    if (compressedRatingMatrix[tempIndex][0] != compressedRatingMatrix[tempIndex - 1][0]) {
                        userStartingIndices[compressedRatingMatrix[tempIndex][0]] = tempIndex;
                    } // Of if
                } // Of if
                tempIndex++;
            } // Of while
        } // Of try

        // Step 3. Compute the user and item averages.
        // NOTE(review): a user/item with degree 0 yields NaN here — assumes
        // every user and item in range appears in the file.
        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        for (int i = 0; i < numRatings; i++) {
            tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
            tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
        } // Of for i

        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        } // Of for i
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        } // Of for i
    }// Of the first constructor

    /**
     *************************
     * Set the radius (delta). Non-positive values fall back to 0.1.
     *
     * @param paraRadius
     *            The given radius.
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        } // Of if
    }// Of setRadius

    /**
     *************************
     * Leave-one-out prediction (item-based). For each rating, the left-out
     * item's neighbors are the OTHER items the same user has rated whose
     * average rating is within radius of the item's (recomputed) average.
     * The predicted values are stored in predictions.
     *
     * @see #predictions
     *************************
     */
    public void leaveOneOutPrediction() {
        double tempItemAverageRating;
        // Make each line of the code shorter.
        int tempUser, tempItem, tempRating;
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        numNonNeighbors = 0;
        for (int i = 0; i < numRatings; i++) {
            tempUser = compressedRatingMatrix[i][0];
            tempItem = compressedRatingMatrix[i][1];
            tempRating = compressedRatingMatrix[i][2];

            // Step 1. Recompute the average rating of the current item with
            // the left-out rating removed.
            // NOTE(review): an item with degree 1 divides by zero here
            // (yields Infinity) — assumes every item has >= 2 ratings.
            tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
                    / (itemDegrees[tempItem] - 1);

            // Step 2. Find the neighbors, at the same time accumulating
            // their ratings.
            int tempNeighbors = 0;
            double tempTotal = 0;
            int tempComparedItem;
            for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
                tempComparedItem = compressedRatingMatrix[j][1];
                if (tempItem == tempComparedItem) {
                    continue;// Ignore itself.
                } // Of if

                if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
                    tempTotal += compressedRatingMatrix[j][2];
                    tempNeighbors++;
                } // Of if
            } // Of for j

            // Step 3. Predict as the average value of neighbors.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            } // Of if
        } // Of for i
    }// Of leaveOneOutPrediction

    /**
     *************************
     * Compute the MAE based on the deviation of each leave-one-out
     * prediction.
     *
     * @return The mean absolute error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        return tempTotalError / predictions.length;
    }// Of computeMAE

    /**
     *************************
     * Compute the RMSE based on the deviation of each leave-one-out
     * prediction. (The method name keeps the original spelling RSME for
     * caller compatibility.)
     *
     * @return The root mean squared error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += (predictions[i] - compressedRatingMatrix[i][2])
                    * (predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }// Of computeRSME

    /**
     *************************
     * The entrance of the program.
     *
     * @param args
     *            Not used now.
     *************************
     */
    public static void main(String[] args) {
        try {
            MBR tempRecommender = new MBR("D:/data/movielens943u1682m.txt", 943, 1682, 100000);

            for (double tempRadius = 0.2; tempRadius < 0.6; tempRadius += 0.1) {
                tempRecommender.setRadius(tempRadius);
                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            } // Of for tempRadius
        } catch (Exception ee) {
            System.out.println(ee);
        } // Of try
    }// Of main
}// Of class MBR
第 55 天: 基于 M-distance 的推荐 (续)
昨天实现的是 item-based recommendation. 今天自己来实现一下 user-based recommendation. 只需要在原有基础上增加即可.
代码:
package xjx;
import java.io.*;
public class MBR {

    /** Default rating for the 1-5 point scale, used when no neighbor exists. */
    public static final double DEFAULT_RATING = 3.0;

    /** The total number of users. */
    private int numUsers;

    /** The total number of items. */
    private int numItems;

    /** The total number of ratings (non-zero values). */
    private int numRatings;

    /** Leave-one-out predictions, aligned with compressedRatingMatrix. */
    private double[] predictions;

    /** Compressed rating matrix: user-item-rating triples, sorted by user. */
    private int[][] compressedRatingMatrix;

    /** The degree of each user (how many items the user has rated). */
    private int[] userDegrees;

    /** The average rating of each user. */
    private double[] userAverageRatings;

    /** The degree of each item (how many users have rated it). */
    private int[] itemDegrees;

    /** The average rating of each item. */
    private double[] itemAverageRatings;

    /**
     * Index of each user's first rating. The first user starts from 0; if the
     * first user has x ratings, the second user starts from x.
     */
    private int[] userStartingIndices;

    /** Number of ratings for which no neighbor was found. */
    private int numNonNeighbors;

    /** The radius (delta) for determining the neighborhood. */
    private double radius;

    /**
     *************************
     * Construct the rating matrix.
     *
     * @param paraFilename
     *            the rating filename (CSV lines: user,item,rating).
     * @param paraNumUsers
     *            number of users.
     * @param paraNumItems
     *            number of items.
     * @param paraNumRatings
     *            number of ratings.
     * @throws Exception
     *             if the file cannot be read or parsed.
     *************************
     */
    public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        // Step 1. Initialize these arrays.
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;
        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];
        predictions = new double[numRatings];

        System.out.println("Reading " + paraFilename);

        // Step 2. Read the data file.
        File tempFile = new File(paraFilename);
        if (!tempFile.exists()) {
            System.out.println("File " + paraFilename + " does not exists.");
            System.exit(0);
        }

        // try-with-resources closes the reader even if parsing throws.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            String[] tempStrArray;
            int tempIndex = 0;
            userStartingIndices[0] = 0;
            userStartingIndices[numUsers] = numRatings;
            while ((tempString = tempBufReader.readLine()) != null) {
                // Each line has three values: user, item, rating.
                tempStrArray = tempString.split(",");
                compressedRatingMatrix[tempIndex][0] = Integer.parseInt(tempStrArray[0]);
                compressedRatingMatrix[tempIndex][1] = Integer.parseInt(tempStrArray[1]);
                compressedRatingMatrix[tempIndex][2] = Integer.parseInt(tempStrArray[2]);

                userDegrees[compressedRatingMatrix[tempIndex][0]]++;
                itemDegrees[compressedRatingMatrix[tempIndex][1]]++;

                if (tempIndex > 0) {
                    // Starting to read the data of a new user.
                    if (compressedRatingMatrix[tempIndex][0] != compressedRatingMatrix[tempIndex - 1][0]) {
                        userStartingIndices[compressedRatingMatrix[tempIndex][0]] = tempIndex;
                    }
                }
                tempIndex++;
            }
        }

        // Step 3. Compute the user and item averages.
        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        for (int i = 0; i < numRatings; i++) {
            tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
            tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
        }

        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        }
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        }
    }

    /**
     *************************
     * Set the radius (delta). Non-positive values fall back to 0.1.
     *
     * @param paraRadius
     *            The given radius.
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        }
    }

    /**
     *************************
     * Leave-one-out prediction (user-based). For each rating (u, t, r), the
     * neighbors are the OTHER users who rated item t and whose average rating
     * is within radius of u's average (recomputed without r); the prediction
     * is the mean of the neighbors' ratings on t.
     *
     * @see #predictions
     *************************
     */
    public void leaveOneOutPrediction() {
        double tempUserAverageRating;
        int tempUser, tempItem, tempRating;
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        // BUGFIX: the original scanned only the current USER's own ratings,
        // so every candidate neighbor was the user itself and every
        // prediction fell back to DEFAULT_RATING. For user-based
        // recommendation we need the other users who rated the same item,
        // so first group the rating indices by item (O(numRatings)).
        int[][] tempItemRatingIndices = new int[numItems][];
        int[] tempFillCounts = new int[numItems];
        for (int i = 0; i < numItems; i++) {
            tempItemRatingIndices[i] = new int[itemDegrees[i]];
        }
        for (int i = 0; i < numRatings; i++) {
            int tempWhichItem = compressedRatingMatrix[i][1];
            tempItemRatingIndices[tempWhichItem][tempFillCounts[tempWhichItem]++] = i;
        }

        numNonNeighbors = 0;
        for (int i = 0; i < numRatings; i++) {
            tempUser = compressedRatingMatrix[i][0];
            tempItem = compressedRatingMatrix[i][1];
            tempRating = compressedRatingMatrix[i][2];

            // Step 1. Recompute the current user's average rating with the
            // left-out rating removed.
            tempUserAverageRating = (userAverageRatings[tempUser] * userDegrees[tempUser] - tempRating)
                    / (userDegrees[tempUser] - 1);

            // Step 2. Scan the other users who rated this item; those whose
            // average is within radius are neighbors.
            int tempNeighbors = 0;
            double tempTotal = 0;
            for (int j : tempItemRatingIndices[tempItem]) {
                int tempComparedUser = compressedRatingMatrix[j][0];
                if (tempComparedUser == tempUser) {
                    continue; // Ignore the left-out rating itself.
                }

                if (Math.abs(tempUserAverageRating - userAverageRatings[tempComparedUser]) < radius) {
                    tempTotal += compressedRatingMatrix[j][2];
                    tempNeighbors++;
                }
            }

            // Step 3. Predict as the average rating of the neighbors.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            }
        }
    }

    /**
     *************************
     * Compute the MAE based on the deviation of each leave-one-out
     * prediction.
     *
     * @return The mean absolute error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        }
        return tempTotalError / predictions.length;
    }

    /**
     *************************
     * Compute the RMSE based on the deviation of each leave-one-out
     * prediction. (The method name keeps the original spelling RSME for
     * caller compatibility.)
     *
     * @return The root mean squared error.
     * @throws Exception
     *             never thrown; kept for interface compatibility.
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += (predictions[i] - compressedRatingMatrix[i][2])
                    * (predictions[i] - compressedRatingMatrix[i][2]);
        }
        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }

    /**
     *************************
     * The entrance of the program.
     *
     * @param args
     *            Not used now.
     *************************
     */
    public static void main(String[] args) {
        try {
            // BUGFIX: the original passed 10000 users and 1000000 ratings for
            // a file that holds 943 users, 1682 items and 100000 ratings,
            // leaving most of the rating matrix zero-filled and producing
            // meaningless error measures.
            MBR tempRecommender = new MBR("D:/data/movielens943u1682m.txt", 943, 1682, 100000);

            for (double tempRadius = 0.2; tempRadius < 0.6; tempRadius += 0.1) {
                tempRecommender.setRadius(tempRadius);
                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            }
        } catch (Exception ee) {
            System.out.println(ee);
        }
    }
}
第 56 天: kMeans 聚类
K-means 的算法步骤为:
1.选择初始化的 k 个样本作为初始聚类中心 a=a1,a2,…ak;
2.针对数据集中每个样本xi计算它到 k 个聚类中心的距离并将其分到距离最小的聚类中心所对应的类中;
3.针对每个类别aj ,重新计算它的聚类中心
(即属于该类的所有样本的质心);
4.重复上面 2 3 两步操作,直到达到某个中止条件(迭代次数、最小误差变化等)。
聚类结果:
New loop ...
Now the new centers are: [[6.017142857142856, 2.7971428571428567, 4.545714285714286, 1.5214285714285716], [6.964285714285715, 3.089285714285714, 5.932142857142857, 2.107142857142857], [5.005769230769231, 3.3807692307692316, 1.5288461538461537, 0.2749999999999999]]
New loop ...
Now the new centers are: [[6.022666666666666, 2.804, 4.544, 1.5333333333333332], [6.980000000000001, 3.0759999999999996, 5.991999999999998, 2.1039999999999996], [5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999]]
New loop ...
Now the new centers are: [[6.022666666666666, 2.804, 4.544, 1.5333333333333332], [6.980000000000001, 3.0759999999999996, 5.991999999999998, 2.1039999999999996], [5.005999999999999, 3.4180000000000006, 1.464, 0.2439999999999999]]
The clusters are: [[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 106, 110, 111, 112, 113, 114, 115, 116, 119, 121, 123, 126, 127, 133, 137, 138, 139, 141, 142, 145, 146, 147, 148, 149], [100, 102, 103, 104, 105, 107, 108, 109, 117, 118, 120, 122, 124, 125, 128, 129, 130, 131, 132, 134, 135, 136, 140, 143, 144], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]
代码:
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.Instances;
public class KMeans {

    /** Code for the Manhattan distance measure. */
    public static final int MANHATTAN = 0;

    /** Code for the Euclidean distance measure (squared; see distance()). */
    public static final int EUCLIDEAN = 1;

    /** The distance measure in use; one of MANHATTAN / EUCLIDEAN. */
    public int distanceMeasure = EUCLIDEAN;

    /** A shared random number generator, used for data shuffling. */
    public static final Random random = new Random();

    /** The whole dataset. */
    Instances dataset;

    /** The number of clusters (k). */
    int numClusters = 2;

    /** The clusters; clusters[i] lists the instance indices of cluster i. */
    int[][] clusters;

    /**
     *******************************
     * The first constructor. Reads the dataset from an ARFF file; exits the
     * program if the file cannot be read.
     *
     * @param paraFilename
     *            The data filename.
     *******************************
     */
    public KMeans(String paraFilename) {
        dataset = null;
        // BUGFIX: try-with-resources closes the reader even when Instances()
        // throws; the original leaked the FileReader on failure.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            System.exit(0);
        }
    }

    /**
     *******************************
     * Set the number of clusters (k).
     *
     * @param paraNumClusters
     *            The new cluster count.
     *******************************
     */
    public void setNumClusters(int paraNumClusters) {
        numClusters = paraNumClusters;
    }

    /**
     *********************
     * Get random indices for data randomization. The identity permutation is
     * shuffled by paraLength random pair swaps.
     *
     * @param paraLength
     *            The length of the sequence.
     * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
     *********************
     */
    public static int[] getRandomIndices(int paraLength) {
        int[] resultIndices = new int[paraLength];

        // Step 1. Initialize with the identity permutation.
        for (int i = 0; i < paraLength; i++) {
            resultIndices[i] = i;
        }

        // Step 2. Randomly swap paraLength pairs.
        int tempFirst, tempSecond, tempValue;
        for (int i = 0; i < paraLength; i++) {
            tempFirst = random.nextInt(paraLength);
            tempSecond = random.nextInt(paraLength);

            tempValue = resultIndices[tempFirst];
            resultIndices[tempFirst] = resultIndices[tempSecond];
            resultIndices[tempSecond] = tempValue;
        }

        return resultIndices;
    }

    /**
     *********************
     * The distance between an instance and a point in attribute space,
     * computed over all attributes except the last one (the class).
     *
     * @param paraI
     *            The index of the instance.
     * @param paraArray
     *            The array representing a point in the space.
     * @return The distance.
     *********************
     */
    public double distance(int paraI, double[] paraArray) {
        // BUGFIX: the accumulator was declared int, truncating the fractional
        // part of every accumulated difference; use a double.
        double resultDistance = 0;
        double tempDifference;
        switch (distanceMeasure) {
        case MANHATTAN:
            for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                if (tempDifference < 0) {
                    resultDistance -= tempDifference;
                } else {
                    resultDistance += tempDifference;
                }
            }
            break;
        case EUCLIDEAN:
            // Squared Euclidean distance; the square root is omitted because
            // it does not change the nearest-center ranking.
            for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                resultDistance += tempDifference * tempDifference;
            }
            break;
        default:
            System.out.println("Unsupported distance measure: " + distanceMeasure);
        }

        return resultDistance;
    }

    /**
     *******************************
     * Cluster the dataset with Lloyd-style iterations: assign each instance
     * to its nearest center, recompute centers as cluster means, and stop
     * when the assignment no longer changes. The result is stored in
     * clusters.
     *******************************
     */
    public void clustering() {
        int[] tempOldClusterArray = new int[dataset.numInstances()];
        // Force at least one iteration: -1 can never equal a real cluster id.
        tempOldClusterArray[0] = -1;
        int[] tempClusterArray = new int[dataset.numInstances()];
        double[][] tempCenters = new double[numClusters][dataset.numAttributes() - 1];

        // Initialize centers with numClusters randomly chosen instances.
        int[] tempRandomOrders = getRandomIndices(dataset.numInstances());
        for (int i = 0; i < numClusters; i++) {
            for (int j = 0; j < tempCenters[0].length; j++) {
                tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
            }
        }

        int[] tempClusterLengths = null;
        while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
            System.out.println("New loop ...");
            tempOldClusterArray = tempClusterArray;
            tempClusterArray = new int[dataset.numInstances()];

            // Minimization step: assign each instance to its nearest center.
            int tempNearestCenter;
            double tempNearestDistance;
            double tempDistance;
            for (int i = 0; i < dataset.numInstances(); i++) {
                tempNearestCenter = -1;
                tempNearestDistance = Double.MAX_VALUE;
                for (int j = 0; j < numClusters; j++) {
                    tempDistance = distance(i, tempCenters[j]);
                    if (tempNearestDistance > tempDistance) {
                        tempNearestDistance = tempDistance;
                        tempNearestCenter = j;
                    }
                }
                tempClusterArray[i] = tempNearestCenter;
            }

            // Mean step: recompute each center as the mean of its members.
            tempClusterLengths = new int[numClusters];
            double[][] tempNewCenters = new double[numClusters][dataset.numAttributes() - 1];
            for (int i = 0; i < dataset.numInstances(); i++) {
                for (int j = 0; j < tempNewCenters[0].length; j++) {
                    tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j);
                }
                tempClusterLengths[tempClusterArray[i]]++;
            }

            // NOTE(review): an empty cluster divides by zero here and turns
            // its center into NaN — assumes no cluster ever empties out.
            for (int i = 0; i < tempNewCenters.length; i++) {
                for (int j = 0; j < tempNewCenters[0].length; j++) {
                    tempNewCenters[i][j] /= tempClusterLengths[i];
                }
            }

            System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
            tempCenters = tempNewCenters;
        }

        // Collect the final assignment into the clusters matrix.
        clusters = new int[numClusters][];
        int[] tempCounters = new int[numClusters];
        for (int i = 0; i < numClusters; i++) {
            clusters[i] = new int[tempClusterLengths[i]];
        }
        for (int i = 0; i < tempClusterArray.length; i++) {
            clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
            tempCounters[tempClusterArray[i]]++;
        }

        System.out.println("The clusters are: " + Arrays.deepToString(clusters));
    }

    /**
     *******************************
     * Clustering test on the iris dataset with k = 3.
     *******************************
     */
    public static void testClustering() {
        KMeans tempKMeans = new KMeans("D:/data/iris.arff");
        tempKMeans.setNumClusters(3);
        tempKMeans.clustering();
    }

    /**
     *******************************
     * The entrance of the program.
     *
     * @param args
     *            Not used now.
     *******************************
     */
    public static void main(String args[]) {
        testClustering();
    }
}
第 57 天: kMeans 聚类 (续)
获得虚拟中心后, 换成与其最近的点作为实际中心, 再聚类.
今天主要是想控制下节奏. 毕竟 kMeans 也值得两天的工作量.
找中心点结果:
代码:
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.Instances;
public class KMeans {

	/**
	 * Manhattan distance measure code.
	 */
	public static final int MANHATTAN = 0;

	/**
	 * Euclidean distance measure code.
	 */
	public static final int EUCLIDEAN = 1;

	/**
	 * The distance measure in use. Defaults to Euclidean.
	 */
	public int distanceMeasure = EUCLIDEAN;

	/**
	 * A shared random number generator.
	 */
	public static final Random random = new Random();

	/**
	 * The whole dataset.
	 */
	Instances dataset;

	/**
	 * The number of clusters.
	 */
	int numClusters = 2;

	/**
	 * The clusters. clusters[k] holds the indices of the instances assigned to
	 * cluster k.
	 */
	int[][] clusters;

	/**
	 *******************************
	 * The first constructor. Reads the dataset from an ARFF file.
	 *
	 * @param paraFilename
	 *            The data filename.
	 *******************************
	 */
	public KMeans(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try
	}// Of the constructor

	/**
	 *********************
	 * Set the number of clusters.
	 *
	 * @param paraNumClusters
	 *            The number of clusters.
	 *********************
	 */
	public void setNumClusters(int paraNumClusters) {
		numClusters = paraNumClusters;
	}// Of setNumClusters

	/**
	 *********************
	 * Get a random permutation of the indices 0 .. paraLength - 1, used for
	 * data randomization.
	 *
	 * @param paraLength
	 *            The length of the sequence.
	 * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
	 *********************
	 */
	public static int[] getRandomIndices(int paraLength) {
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize with the identity permutation.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Shuffle by randomly swapping pairs.
		int tempFirst, tempSecond, tempValue;
		for (int i = 0; i < paraLength; i++) {
			tempFirst = random.nextInt(paraLength);
			tempSecond = random.nextInt(paraLength);

			tempValue = resultIndices[tempFirst];
			resultIndices[tempFirst] = resultIndices[tempSecond];
			resultIndices[tempSecond] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices

	/**
	 *********************
	 * The distance between an instance and a point in attribute space. The
	 * class attribute (the last one) is excluded. For EUCLIDEAN the SQUARED
	 * distance is returned; this preserves the nearest-center ordering and
	 * avoids a sqrt.
	 *
	 * BUG FIX: the accumulator was declared int, so the compound assignment
	 * silently narrowed every (double) difference to an integer, truncating
	 * all fractional distance. It is now a double.
	 *
	 * @param paraI
	 *            The index of the instance.
	 * @param paraArray
	 *            The array representing a point in the space.
	 * @return The distance.
	 *********************
	 */
	public double distance(int paraI, double[] paraArray) {
		double resultDistance = 0;
		double tempDifference;
		switch (distanceMeasure) {
		case MANHATTAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
				resultDistance += Math.abs(tempDifference);
			} // Of for i
			break;
		case EUCLIDEAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
				resultDistance += tempDifference * tempDifference;
			} // Of for i
			break;
		default:
			System.out.println("Unsupported distance measure: " + distanceMeasure);
		}// Of switch

		return resultDistance;
	}// Of distance

	/**
	 *******************************
	 * Clustering. After each assignment step, the center of each cluster is
	 * replaced by the actual instance closest to the current center (the
	 * day-57 "actual center" variant), and the loop repeats until the
	 * assignment no longer changes.
	 *******************************
	 */
	public void clustering() {
		int[] tempOldClusterArray = new int[dataset.numInstances()];
		// Ensure the first comparison with the all-zero assignment fails.
		tempOldClusterArray[0] = -1;
		int[] tempClusterArray = new int[dataset.numInstances()];
		Arrays.fill(tempClusterArray, 0);
		double[][] tempCenters = new double[numClusters][dataset.numAttributes() - 1];

		// Initialize the centers with randomly chosen, distinct instances.
		int[] tempRandomOrders = getRandomIndices(dataset.numInstances());
		for (int i = 0; i < numClusters; i++) {
			for (int j = 0; j < tempCenters[0].length; j++) {
				tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
			} // Of for j
		} // Of for i

		while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
			System.out.println("New loop ...");
			tempOldClusterArray = tempClusterArray;
			tempClusterArray = new int[dataset.numInstances()];

			// Minimization step: assign each instance to its nearest center.
			int tempNearestCenter;
			double tempNearestDistance;
			double tempDistance;
			for (int i = 0; i < dataset.numInstances(); i++) {
				tempNearestCenter = -1;
				tempNearestDistance = Double.MAX_VALUE;
				for (int j = 0; j < numClusters; j++) {
					tempDistance = distance(i, tempCenters[j]);
					if (tempNearestDistance > tempDistance) {
						tempNearestDistance = tempDistance;
						tempNearestCenter = j;
					} // Of if
				} // Of for j
				tempClusterArray[i] = tempNearestCenter;
			} // Of for i

			// For each cluster, pick the actual instance closest to the
			// current center as the new center. Clusters that received no
			// instance keep an all-zero center row (as in the original).
			double[] tempNearestDistanceArray = new double[numClusters];
			double[][] tempNewCenters = new double[numClusters][dataset.numAttributes() - 1];
			Arrays.fill(tempNearestDistanceArray, Double.MAX_VALUE);
			for (int i = 0; i < dataset.numInstances(); i++) {
				double tempCurrentDistance = distance(i, tempCenters[tempClusterArray[i]]);
				if (tempNearestDistanceArray[tempClusterArray[i]] > tempCurrentDistance) {
					tempNearestDistanceArray[tempClusterArray[i]] = tempCurrentDistance;
					for (int j = 0; j < dataset.numAttributes() - 1; j++) {
						tempNewCenters[tempClusterArray[i]][j] = dataset.instance(i).value(j);
					} // Of for j
				} // Of if
			} // Of for i

			System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
			tempCenters = tempNewCenters;
		} // Of while

		// BUG FIX: the cluster sizes were never computed (the counting code
		// was commented out, leaving tempClusterLengths null), causing a
		// NullPointerException below. Compute them from the final assignment.
		int[] tempClusterLengths = new int[numClusters];
		for (int i = 0; i < tempClusterArray.length; i++) {
			tempClusterLengths[tempClusterArray[i]]++;
		} // Of for i

		// Form the clusters from the final assignment.
		clusters = new int[numClusters][];
		int[] tempCounters = new int[numClusters];
		for (int i = 0; i < numClusters; i++) {
			clusters[i] = new int[tempClusterLengths[i]];
		} // Of for i
		for (int i = 0; i < tempClusterArray.length; i++) {
			clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
			tempCounters[tempClusterArray[i]]++;
		} // Of for i

		System.out.println("The clusters are: " + Arrays.deepToString(clusters));
	}// Of clustering

	/**
	 *******************************
	 * Clustering test: 3 clusters on the iris dataset.
	 *******************************
	 */
	public static void testClustering() {
		KMeans tempKMeans = new KMeans("D:/data/iris.arff");
		tempKMeans.setNumClusters(3);
		tempKMeans.clustering();
	}// Of testClustering

	/**
	 *******************************
	 * The entry point.
	 *
	 * @param args
	 *            Not used.
	 *******************************
	 */
	public static void main(String[] args) {
		testClustering();
	}// Of main
}// Of class KMeans
第 58 天: 符号型数据的 NB 算法
NB(朴素贝叶斯)算法是一种分类算法。
分类任务是机器学习中最常见的任务。给定一个对象X,将X划分到预定好的某一类别y中。其中Y代表所有类别的一个有限集合,如新闻类别:{军事新闻,科技新闻,生活新闻}。y代表分类集合中的某一类别,如军事新闻。X代表待分类的对象,x代表该对象的特征,如X代表一篇待分类的文章则x就代表该文章中的单词。目标:输入X,输出y
设每个数据样本用一个n维特征向量来描述n个属性的值,即:X={x1,x2,…,xn},
假定有m个类,分别用C1, C2,…,Cm表示。
给定一个未知的数据样本X(即没有类标号),若朴素贝叶斯分类法将未知的样本X分配给类Ci,则一定是:
P(Ci|X)>P(Cj|X) 1≤j≤m,j≠i
根据贝叶斯定理
由于P(X)对于所有类为常数,最大化后验概率P(Ci|X)可转化为最大化P(X|Ci)P(Ci),即类条件概率(似然)与先验概率的乘积。
如果训练数据集有许多属性和元组,计算P(X|Ci)的开销可能非常大,为此,通常假设各属性的取值互相独立,这样
条件概率P(x1|Ci),P(x2|Ci),…,P(xn|Ci)可以从训练数据集求得。
根据此方法,对一个未知类别的样本X,可以先分别计算出X属于每一个类别Ci的概率P(X|Ci)P(Ci),然后选择其中概率最大的类别作为其类别。
朴素贝叶斯算法成立的前提是各属性之间互相独立。当数据集满足这种独立性假设时,分类的准确度较高,否则可能较低。另外,该算法没有分类规则输出。
package xjx;
import java.io.FileReader;
import java.util.Arrays;
import weka.core.*;
public class NaiveBayes {
	/**
	 *************************
	 * An inner class to store the parameters of a Gaussian distribution.
	 * Made static: it never uses the enclosing instance.
	 *************************
	 */
	private static class GaussianParamters {
		// The mean.
		double mu;
		// The standard deviation.
		double sigma;

		public GaussianParamters(double paraMu, double paraSigma) {
			mu = paraMu;
			sigma = paraSigma;
		}// Of the constructor

		public String toString() {
			return "(" + mu + ", " + sigma + ")";
		}// Of toString
	}// Of GaussianParamters

	/**
	 * The data.
	 */
	Instances dataset;

	/**
	 * The number of classes. For binary classification it is 2.
	 */
	int numClasses;

	/**
	 * The number of instances.
	 */
	int numInstances;

	/**
	 * The number of conditional attributes.
	 */
	int numConditions;

	/**
	 * The prediction, including queried and predicted labels.
	 */
	int[] predicts;

	/**
	 * Class distribution.
	 */
	double[] classDistribution;

	/**
	 * Class distribution with Laplacian smooth.
	 */
	double[] classDistributionLaplacian;

	/**
	 * Raw value COUNTS per (class, attribute, value). Despite the name these
	 * are counts, not probabilities; the smoothed probabilities live in
	 * conditionalProbabilitiesLaplacian.
	 */
	double[][][] conditionalProbabilities;

	/**
	 * The conditional probabilities with Laplacian smooth.
	 */
	double[][][] conditionalProbabilitiesLaplacian;

	/**
	 * The Gaussian parameters, one per (class, attribute) pair.
	 */
	GaussianParamters[][] gaussianParameters;

	/**
	 * Data type: NOMINAL or NUMERICAL.
	 */
	int dataType;

	/**
	 * Nominal.
	 */
	public static final int NOMINAL = 0;

	/**
	 * Numerical.
	 */
	public static final int NUMERICAL = 1;

	/**
	 ********************
	 * The constructor. Reads the data; the last attribute is the class.
	 *
	 * @param paraFilename
	 *            The given file.
	 ********************
	 */
	public NaiveBayes(String paraFilename) {
		dataset = null;
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
			System.exit(0);
		} // Of try

		dataset.setClassIndex(dataset.numAttributes() - 1);
		numConditions = dataset.numAttributes() - 1;
		numInstances = dataset.numInstances();
		numClasses = dataset.attribute(numConditions).numValues();
	}// Of the constructor

	/**
	 ********************
	 * Set the data type (NOMINAL or NUMERICAL).
	 ********************
	 */
	public void setDataType(int paraDataType) {
		dataType = paraDataType;
	}// Of setDataType

	/**
	 ********************
	 * Calculate the class distribution, both raw and with Laplacian smooth.
	 ********************
	 */
	public void calculateClassDistribution() {
		classDistribution = new double[numClasses];
		classDistributionLaplacian = new double[numClasses];

		double[] tempCounts = new double[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClassValue = (int) dataset.instance(i).classValue();
			tempCounts[tempClassValue]++;
		} // Of for i

		for (int i = 0; i < numClasses; i++) {
			classDistribution[i] = tempCounts[i] / numInstances;
			// Laplacian smooth: (count + 1) / (n + numClasses).
			classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
		} // Of for i

		System.out.println("Class distribution: " + Arrays.toString(classDistribution));
		System.out.println(
				"Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
	}// Of calculateClassDistribution

	/**
	 ********************
	 * Calculate the conditional counts and probabilities with Laplacian
	 * smooth. ONLY scans the dataset once.
	 ********************
	 */
	public void calculateConditionalProbabilities() {
		conditionalProbabilities = new double[numClasses][numConditions][];
		conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];

		// Allocate space: the third dimension depends on the attribute.
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = (int) dataset.attribute(j).numValues();
				conditionalProbabilities[i][j] = new double[tempNumValues];
				conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
			} // Of for j
		} // Of for i

		// Count the (class, attribute, value) co-occurrences.
		int[] tempClassCounts = new int[numClasses];
		for (int i = 0; i < numInstances; i++) {
			int tempClass = (int) dataset.instance(i).classValue();
			tempClassCounts[tempClass]++;
			for (int j = 0; j < numConditions; j++) {
				int tempValue = (int) dataset.instance(i).value(j);
				conditionalProbabilities[tempClass][j][tempValue]++;
			} // Of for j
		} // Of for i

		// Now for the real probability with Laplacian smooth.
		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				int tempNumValues = (int) dataset.attribute(j).numValues();
				for (int k = 0; k < tempNumValues; k++) {
					// BUG FIX: the standard Laplacian denominator for a
					// conditional probability adds the number of VALUES of
					// attribute j, not the number of classes.
					conditionalProbabilitiesLaplacian[i][j][k] = (conditionalProbabilities[i][j][k]
							+ 1) / (tempClassCounts[i] + tempNumValues);
				} // Of for k
			} // Of for j
		} // Of for i

		System.out.println(Arrays.deepToString(conditionalProbabilities));
	}// Of calculateConditionalProbabilities

	/**
	 ********************
	 * Calculate the Gaussian parameters (mean, standard deviation) of each
	 * attribute within each class.
	 ********************
	 */
	public void calculateGausssianParameters() {
		gaussianParameters = new GaussianParamters[numClasses][numConditions];

		double[] tempValuesArray = new double[numInstances];
		int tempNumValues = 0;
		double tempSum = 0;

		for (int i = 0; i < numClasses; i++) {
			for (int j = 0; j < numConditions; j++) {
				tempSum = 0;

				// Collect the attribute values of instances in class i.
				tempNumValues = 0;
				for (int k = 0; k < numInstances; k++) {
					if ((int) dataset.instance(k).classValue() != i) {
						continue;
					} // Of if

					tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
					tempSum += tempValuesArray[tempNumValues];
					tempNumValues++;
				} // Of for k

				// Mean and (population) standard deviation.
				double tempMu = tempSum / tempNumValues;

				double tempSigma = 0;
				for (int k = 0; k < tempNumValues; k++) {
					tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
				} // Of for k
				tempSigma /= tempNumValues;
				tempSigma = Math.sqrt(tempSigma);

				gaussianParameters[i][j] = new GaussianParamters(tempMu, tempSigma);
			} // Of for j
		} // Of for i

		System.out.println(Arrays.deepToString(gaussianParameters));
	}// Of calculateGausssianParameters

	/**
	 ********************
	 * Classify all instances; the results are stored in predicts[].
	 ********************
	 */
	public void classify() {
		predicts = new int[numInstances];
		for (int i = 0; i < numInstances; i++) {
			predicts[i] = classify(dataset.instance(i));
		} // Of for i
	}// Of classify

	/**
	 ********************
	 * Classify an instance, dispatching on the data type.
	 *
	 * @return The predicted class index, or -1 for an unknown data type.
	 ********************
	 */
	public int classify(Instance paraInstance) {
		if (dataType == NOMINAL) {
			return classifyNominal(paraInstance);
		} else if (dataType == NUMERICAL) {
			return classifyNumerical(paraInstance);
		} // Of if

		return -1;
	}// Of classify

	/**
	 ********************
	 * Classify an instance with nominal data.
	 ********************
	 */
	public int classifyNominal(Instance paraInstance) {
		// Find the class with the biggest log pseudo-probability.
		// BUG FIX: -10000 is not a safe sentinel; use negative infinity.
		double tempBiggest = Double.NEGATIVE_INFINITY;
		int resultBestIndex = 0;
		for (int i = 0; i < numClasses; i++) {
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				int tempAttributeValue = (int) paraInstance.value(j);

				// BUG FIX: use the Laplacian-smoothed probabilities, as the
				// original comment intended; the raw array holds unnormalized
				// counts and may contain zeros (log(0) = -infinity).
				tempPseudoProbability += Math
						.log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
			} // Of for j

			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			} // Of if
		} // Of for i

		return resultBestIndex;
	}// Of classifyNominal

	/**
	 ********************
	 * Classify an instance with numerical data under the Gaussian assumption.
	 * Added here because classify(Instance) dispatches to it; without it the
	 * class does not compile.
	 ********************
	 */
	public int classifyNumerical(Instance paraInstance) {
		// Find the class with the biggest log pseudo-probability.
		double tempBiggest = Double.NEGATIVE_INFINITY;
		int resultBestIndex = 0;
		for (int i = 0; i < numClasses; i++) {
			double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
			for (int j = 0; j < numConditions; j++) {
				double tempAttributeValue = paraInstance.value(j);
				double tempSigma = gaussianParameters[i][j].sigma;
				double tempMu = gaussianParameters[i][j].mu;

				// Log Gaussian density with the constant term dropped.
				tempPseudoProbability += -Math.log(tempSigma) - (tempAttributeValue - tempMu)
						* (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
			} // Of for j

			if (tempBiggest < tempPseudoProbability) {
				tempBiggest = tempPseudoProbability;
				resultBestIndex = i;
			} // Of if
		} // Of for i

		return resultBestIndex;
	}// Of classifyNumerical

	/**
	 ********************
	 * Compute accuracy over the whole dataset (training accuracy).
	 ********************
	 */
	public double computeAccuracy() {
		double tempCorrect = 0;
		for (int i = 0; i < numInstances; i++) {
			if (predicts[i] == (int) dataset.instance(i).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		double resultAccuracy = tempCorrect / numInstances;
		return resultAccuracy;
	}// Of computeAccuracy

	/**
	 *************************
	 * Test nominal data.
	 * NOTE(review): iris.arff has numerical attributes; a nominal dataset
	 * (e.g. mushroom/weather) seems intended here — confirm the data file.
	 *************************
	 */
	public static void testNominal() {
		System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
		String tempFilename = "D:/data/iris.arff";

		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NOMINAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateConditionalProbabilities();
		tempLearner.classify();

		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}// Of testNominal

	/**
	 *************************
	 * Test numerical data with the Gaussian assumption.
	 *************************
	 */
	public static void testNumerical() {
		System.out.println(
				"Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
		String tempFilename = "D:/data/iris.arff";

		NaiveBayes tempLearner = new NaiveBayes(tempFilename);
		tempLearner.setDataType(NUMERICAL);
		tempLearner.calculateClassDistribution();
		tempLearner.calculateGausssianParameters();
		tempLearner.classify();

		System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
	}// Of testNumerical

	/**
	 *************************
	 * The entry point.
	 *************************
	 */
	public static void main(String[] args) {
		testNominal();
		//testNumerical();
	}// Of main
}// Of class NaiveBayes
第 59 天: 数值型数据的 NB 算法
今天把数值型数据处理的代码加上去.
假设所有属性的属性值都服从高斯分布. 也可以做其它假设.
将概率密度当成概率值直接使用 Bayes 公式.
可以看到, 数值型数据的处理并不会比符号型的复杂.
代码:
/**
 ********************
 * Classify an instance with numerical data under the Gaussian assumption.
 * The log pseudo-probability is log P(c) plus, per attribute, the log
 * Gaussian density with the constant term dropped.
 *
 * @param paraInstance
 *            The instance to classify.
 * @return The predicted class index.
 ********************
 */
public int classifyNumerical(Instance paraInstance) {
	// Find the class with the biggest log pseudo-probability.
	// BUG FIX: -10000 is not a safe sentinel — with many attributes or tiny
	// sigmas every class can score below it, leaving class 0 returned by
	// default. Use negative infinity instead.
	double tempBiggest = Double.NEGATIVE_INFINITY;
	int resultBestIndex = 0;
	for (int i = 0; i < numClasses; i++) {
		double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
		for (int j = 0; j < numConditions; j++) {
			double tempAttributeValue = paraInstance.value(j);
			double tempSigma = gaussianParameters[i][j].sigma;
			double tempMu = gaussianParameters[i][j].mu;

			tempPseudoProbability += -Math.log(tempSigma) - (tempAttributeValue - tempMu)
					* (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
		} // Of for j

		if (tempBiggest < tempPseudoProbability) {
			tempBiggest = tempPseudoProbability;
			resultBestIndex = i;
		} // Of if
	} // Of for i

	return resultBestIndex;
}// Of classifyNumerical
/**
 *************************
 * Test numerical data with the Gaussian assumption: load the iris dataset,
 * fit per-class Gaussian parameters, classify, and print the accuracy.
 *************************
 */
public static void testNumerical() {
System.out.println(
"Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
String tempFilename = "D:/data/iris.arff";
NaiveBayes tempLearner = new NaiveBayes(tempFilename);
tempLearner.setDataType(NUMERICAL);
tempLearner.calculateClassDistribution();
// Gaussian parameters replace the nominal conditional probabilities here.
tempLearner.calculateGausssianParameters();
tempLearner.classify();
System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
}// Of testNumerical
/**
 * Program entry point: run the nominal test, then the numerical test.
 */
public static void main(String[] args) {
    testNominal();
    testNumerical();
}// Of main
第 60 天: 小结
1.机器学习主要分为监督学习、无监督学习、半监督学习、强化学习。
2.监督学习:
In:有标签
Out:有反馈
目的:预测结果
案例:学认字
算法:分类(类别),回归(数字)
3.无监督学习:
In:无标签
Out:无反馈
目的:发现潜在结构
案例:自动聚类
算法:聚类,降维
4.半监督学习:
已知:训练样本Data和待分类的类别
未知:训练样本有无标签均可
应用:训练数据量过少,
监督学习效果不能满足需求,因此用来增强效果。
5.强化学习:
In:决策流程及激励系统
Out:一系列行动
目的:长期利益最大化,回报函数(只会提示你是否在朝着目标方向前进的延迟反映)
案例:学下棋
算法:马尔科夫决策,动态规划
6.KNN属于分类算法, K值确定标准:
K值过小:
k值小,特征空间被划分为更多子空间(模型的项越多),整体模型变复杂,容易发生过拟合,k值越小,选择的范围就比较小,训练的时候命中率较高,近似误差小,而用test的时候就容易出错,估计误差大,容易过拟合。
K值=N:无论输入实例是什么,都将简单地预测它属于训练实例中最多的类。
7.所谓聚类算法是指将一堆没有标签的数据自动划分成几类的方法,属于无监督学习方法,这个方法要保证同一类的数据有相似的特征。
8.Kmeans是聚类算法,K值选取: 在实际应用中,由于Kmeans一般作为数据预处理,或者用于辅助分类贴标签,所以k一般不会设置很大。可以通过枚举,令k从2到一个固定值如10,在每个k值上重复运行数次kmeans(避免局部最优解),并计算当前k的平均轮廓系数,最后选取轮廓系数最大的值对应的k作为最终的集群数目。
9.朴素贝叶斯是基于概率论的算法,属于分类算法,是监督学习,由于用先验数据去预测分类,因此存在误差。
10.通过十天的入门学习,了解了机器学习的基础算法,但是代码还不熟练,数学原理的理解还不是很透彻,之后会继续加强练习。