Day 51 kNN分类器

51.1 代码

package dl;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;

import weka.core.*;

/**
 * A k-nearest-neighbor (kNN) classifier over a Weka {@code Instances} dataset.
 *
 * <p>
 * The dataset is read from an ARFF file, split at random into a training and a
 * testing part (both stored as arrays of instance indices), and every testing
 * instance is labeled by a majority vote among its nearest training instances.
 *
 * @author 86183
 */
public class KnnClassification {

    /**
     * Manhattan distance.
     */
    public static final int MANHATTAN = 0;

    /**
     * Euclidean distance.
     */
    public static final int EUCLIDEAN = 1;

    /**
     * The distance measure, either {@link #MANHATTAN} or {@link #EUCLIDEAN}.
     */
    public int distanceMeasure = EUCLIDEAN;

    /**
     * The shared random number generator used for shuffling.
     */
    public static final Random random = new Random();

    /**
     * The number of neighbors.
     */
    int numNeighbors = 7;

    /**
     * The whole dataset.
     */
    Instances dataset;

    /**
     * The training set. Represented by the indices of the data.
     */
    int[] trainingSet;

    /**
     * The testing set. Represented by the indices of the data.
     */
    int[] testingSet;

    /**
     * The predictions, one per entry of {@link #testingSet}.
     */
    int[] predictions;

    /**
     * Loads the dataset from an ARFF file and marks the last attribute as the
     * class attribute.
     *
     * <p>
     * On any failure the error is printed and the JVM is terminated with a
     * non-zero status.
     *
     * @param paraFilename The arff filename.
     */
    public KnnClassification(String paraFilename) {
        // try-with-resources closes the reader even when parsing fails.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
            // The last attribute is the decision class.
            dataset.setClassIndex(dataset.numAttributes() - 1);
        } catch (Exception ee) {
            System.out.println("Error occurred while trying to read \'" + paraFilename
                    + "\' in KnnClassification constructor.\r\n" + ee);
            // A non-zero status tells the caller (shell, build tool) that loading failed.
            System.exit(1);
        } // Of try
    }// Of the first constructor

    /**
     * Builds a uniformly random permutation of {@code 0 .. paraLength - 1}.
     *
     * @param paraLength The length of the sequence.
     * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
     */
    public static int[] getRandomIndices(int paraLength) {
        int[] resultIndices = new int[paraLength];

        // Step 1. Start from the identity permutation.
        for (int i = 0; i < paraLength; i++) {
            resultIndices[i] = i;
        } // Of for i

        // Step 2. Fisher-Yates shuffle: position i receives a value drawn uniformly
        // from the not-yet-fixed prefix [0, i], so every permutation is equally likely.
        for (int i = paraLength - 1; i > 0; i--) {
            int tempOther = random.nextInt(i + 1);
            int tempValue = resultIndices[i];
            resultIndices[i] = resultIndices[tempOther];
            resultIndices[tempOther] = tempValue;
        } // Of for i

        return resultIndices;
    }// Of getRandomIndices

    /**
     * Splits the data at random into training and testing parts. The first
     * {@code (int) (numInstances * paraTrainingFraction)} shuffled indices become
     * the training set, the remaining ones the testing set.
     *
     * @param paraTrainingFraction The fraction of the training set, expected in
     *                             [0, 1].
     * @throws IllegalArgumentException if the fraction yields a training size that
     *                                  is negative or exceeds the dataset size.
     */
    public void splitTrainingTesting(double paraTrainingFraction) {
        int tempSize = dataset.numInstances();
        int tempTrainingSize = (int) (tempSize * paraTrainingFraction);
        if (tempTrainingSize < 0 || tempTrainingSize > tempSize) {
            throw new IllegalArgumentException(
                    "The training fraction must be in [0, 1], but was " + paraTrainingFraction);
        } // Of if

        int[] tempIndices = getRandomIndices(tempSize);
        trainingSet = Arrays.copyOfRange(tempIndices, 0, tempTrainingSize);
        testingSet = Arrays.copyOfRange(tempIndices, tempTrainingSize, tempSize);
    }// Of splitTrainingTesting

    /**
     * Predicts the label of every instance in the testing set. The results are
     * stored in {@link #predictions}, aligned with {@link #testingSet}.
     */
    public void predict() {
        predictions = new int[testingSet.length];
        for (int i = 0; i < predictions.length; i++) {
            predictions[i] = predict(testingSet[i]);
        } // Of for i
    }// Of predict

    /**
     * Predicts the label of one instance by voting among its nearest training
     * instances.
     *
     * @param paraIndex The index of the instance in the dataset.
     * @return The prediction.
     */
    public int predict(int paraIndex) {
        int[] tempNeighbors = computeNearests(paraIndex);
        return simpleVoting(tempNeighbors);
    }// Of predict

    /**
     * The distance between two instances over all attributes except the last one
     * (the class attribute).
     *
     * <p>
     * For {@link #EUCLIDEAN} the value returned is the <em>squared</em> Euclidean
     * distance: the square root is not taken, which leaves the ordering of
     * neighbors unchanged. For an unsupported measure a message is printed and 0
     * is returned.
     *
     * @param paraI The index of the first instance.
     * @param paraJ The index of the second instance.
     * @return The distance.
     */
    public double distance(int paraI, int paraJ) {
        double resultDistance = 0;
        double tempDifference;
        // The last attribute is the class, so it is excluded from the distance.
        int tempNumConditions = dataset.numAttributes() - 1;
        switch (distanceMeasure) {
            case MANHATTAN:
                for (int i = 0; i < tempNumConditions; i++) {
                    tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
                    resultDistance += Math.abs(tempDifference);
                } // Of for i
                break;

            case EUCLIDEAN:
                for (int i = 0; i < tempNumConditions; i++) {
                    tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
                    resultDistance += tempDifference * tempDifference;
                } // Of for i
                break;
            default:
                System.out.println("Unsupported distance measure: " + distanceMeasure);
        }// Of switch

        return resultDistance;
    }// Of distance

    /**
     * Gets the accuracy of the classifier on the testing set, i.e., the fraction
     * of predictions equal to the actual class value. The result is NaN when the
     * testing set is empty.
     *
     * @return The accuracy.
     */
    public double getAccuracy() {
        // A double divides an int gets another double.
        double tempCorrect = 0;
        for (int i = 0; i < predictions.length; i++) {
            if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
                tempCorrect++;
            } // Of if
        } // Of for i

        return tempCorrect / testingSet.length;
    }// Of getAccuracy

    /**
     * Computes the nearest neighbors of an instance within the training set by
     * selecting one neighbor per scan (selection by repeated minimum).
     *
     * <p>
     * At most {@link #numNeighbors} neighbors are returned; when the training set
     * is smaller than that, every training instance is returned exactly once
     * instead of repeating a neighbor.
     *
     * @param paraCurrent current instance. We are comparing it with all others.
     * @return the indices of the nearest instances, closest first.
     */
    public int[] computeNearests(int paraCurrent) {
        // Never ask for more neighbors than there are training instances.
        int tempK = Math.max(0, Math.min(numNeighbors, trainingSet.length));
        int[] resultNearests = new int[tempK];
        boolean[] tempSelected = new boolean[trainingSet.length];

        // Compute all distances to avoid redundant computation.
        double[] tempDistances = new double[trainingSet.length];
        for (int i = 0; i < trainingSet.length; i++) {
            tempDistances[i] = distance(paraCurrent, trainingSet[i]);
        } // Of for i

        // Select the nearest tempK indices.
        for (int i = 0; i < tempK; i++) {
            // Reset per scan so that a stale index from the previous scan is never reused.
            int tempMinimalIndex = -1;
            double tempMinimalDistance = Double.MAX_VALUE;

            for (int j = 0; j < trainingSet.length; j++) {
                if (tempSelected[j]) {
                    continue;
                } // Of if

                // Double.compare orders NaN after every number, so NaN distances
                // are picked last instead of never. Ties keep the lowest position.
                if (tempMinimalIndex < 0 || Double.compare(tempDistances[j], tempMinimalDistance) < 0) {
                    tempMinimalDistance = tempDistances[j];
                    tempMinimalIndex = j;
                } // Of if
            } // Of for j

            resultNearests[i] = trainingSet[tempMinimalIndex];
            tempSelected[tempMinimalIndex] = true;
        } // Of for i

        System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
        return resultNearests;
    }// Of computeNearests

    /**
     * Majority voting among the given neighbors. Ties are resolved in favor of the
     * smallest class index; with no neighbors at all the result is 0.
     *
     * @param paraNeighbors The indices of the neighbors.
     * @return The predicted label.
     */
    public int simpleVoting(int[] paraNeighbors) {
        int[] tempVotes = new int[dataset.numClasses()];
        for (int i = 0; i < paraNeighbors.length; i++) {
            tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
        } // Of for i

        int tempMaximalVotingIndex = 0;
        int tempMaximalVoting = 0;
        for (int i = 0; i < tempVotes.length; i++) {
            // Strict comparison keeps the first class that reaches the maximum.
            if (tempVotes[i] > tempMaximalVoting) {
                tempMaximalVoting = tempVotes[i];
                tempMaximalVotingIndex = i;
            } // Of if
        } // Of for i

        return tempMaximalVotingIndex;
    }// Of simpleVoting

    /**
     * The entrance of the program: loads the iris data, uses 80% of it for
     * training, predicts the rest and prints the accuracy.
     *
     * @param args Not used now.
     */
    public static void main(String[] args) {
        KnnClassification tempClassifier = new KnnClassification("src/main/java/resources/iris.arff");
        tempClassifier.splitTrainingTesting(0.8);
        tempClassifier.predict();
        System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
    }// Of main

}// Of class KnnClassification

51.2 疑问

computeNearests() 函数里选择距离最小的 7 个值时,是否可以不用两层循环,而是只用一次循环把距离记录到 tempDistances[] 中,然后排序并取前 7 个呢?

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值