The KMeans++ Algorithm and Its Key Hadoop Implementation

Key Points of the Algorithm

The k-means algorithm itself is straightforward; the hard part is choosing the initial cluster centers. A poor initialization can produce empty clusters or keep the clustering from converging. KMeans++ addresses this by adding a principled method for selecting the initial centers.

Roulette Wheel Selection

Roulette wheel selection is a way of sampling from a non-uniform distribution using a single uniform random draw: given a weight (or probability) distribution, draw a random number uniformly between 0 and the total weight, then walk through the weights, accumulating them until the running sum reaches the drawn number; the element at which this happens is the one selected. In KMeans++ the goal is to spread the initial centers as far apart as possible, so a new center should be picked with probability proportional to its distance from the centers already chosen, which is exactly what roulette selection provides. In the code it is convenient to seed the center list with a zero vector, select k + 1 centers in total, and drop the zero vector at the end. And because the data set is large in practice, the per-point distances are written to a file and read back rather than held in memory. A standalone sketch of the roulette step follows.
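
To make the idea concrete, here is a minimal, self-contained sketch of roulette selection (RouletteDemo and its names are illustrative and not part of the Hadoop code below); the weights stand in for each point's distance to its nearest already-chosen center:

import java.util.Random;

public class RouletteDemo {
    // Pick an index with probability proportional to weights[i].
    static int spin(double[] weights, Random rng) {
        double total = 0;
        for (double w : weights) {
            total += w;
        }
        double r = rng.nextDouble() * total; // uniform draw in [0, total)
        for (int i = 0; i < weights.length; i++) {
            r -= weights[i];
            if (r < 0) {
                return i; // first index whose cumulative weight passes r
            }
        }
        return weights.length - 1; // guard against floating-point round-off
    }

    public static void main(String[] args) {
        double[] distances = {1.0, 4.0, 9.0}; // distances to nearest center
        System.out.println(spin(distances, new Random()));
    }
}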

Key Hadoop Code

Feature.java


package edu.bupt.kmeans;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;

public class Feature implements Writable {

    private ArrayList<Double> features;

    public Feature() {
        this.features = new ArrayList<Double>();
    }

    public Feature(ArrayList<Double> features) {
        this.features = features;
    }

    public Feature(int size){
        features = new ArrayList<Double>();
        for (int i=0;i<size;i++){
            features.add(0.0);
        }
    }

    public Feature(String line) {
        String[] valueString = line.split(" ");
        features = new ArrayList<Double>();
        for (String value : valueString) {
            features.add(Double.parseDouble(value));
        }
    }

    public double get(int i){
        if (i<getFeatureSize()){
            return this.features.get(i);
        }else{
            return 0;
        }

    }


    public void setFeatures(ArrayList<Double> features) {
        this.features = features;
    }

    public int getFeatureSize() {
        return features.size();
    }

    public ArrayList<Double> getFeatures() {
        return features;
    }

    @Override
    public String toString() {
        StringBuilder s = new StringBuilder();
        for (Double feature : features) {
            s.append(feature).append(" ");
        }
        return s.substring(0, s.length() - 1);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(features.size());
        for (Double feature : features) {
            out.writeDouble(feature);
        }

    }

    @Override
    public void readFields(DataInput in) throws IOException {
        int size = in.readInt();
        features = new ArrayList<Double>();
        for (int i=0;i<size;i++){
            features.add(in.readDouble());
        }

    }

    public static void main(String[] args) {
        // Quick sanity check: parse a line of space-separated doubles and print it back.
        Feature feature = new Feature("-1.246179058446342136e+00 2.288084176933262270e+00");
        System.out.println(feature);
    }
}

KMeans.java

package edu.bupt.kmeans;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.BufferedReader;
import java.io.IOException;

import java.io.InputStreamReader;
import java.util.ArrayList;

public class KMeans {

    public static class KmeansMapper extends Mapper<LongWritable, Text, IntWritable,Feature>{
        private ArrayList<Feature> kClusters = new ArrayList<Feature>(); // cluster centers from the previous iteration
        private Integer k;
        private Integer feature_size;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            k = Integer.parseInt(context.getConfiguration().get("cluster_k"));
            feature_size = Integer.parseInt(context.getConfiguration().get("feature_size"));
            FileSystem fileSystem = FileSystem.get(context.getConfiguration());
            FileStatus[] fileList = fileSystem.listStatus(new Path(context.getConfiguration().get("clusterPath")));
            String line;
            for (int i = 0; i < fileList.length; i++) {
                if (!fileList[i].isDirectory()) {
                    // Read the current cluster centers, one per line.
                    FSDataInputStream fsi = fileSystem.open(fileList[i].getPath());
                    BufferedReader in = new BufferedReader(new InputStreamReader(fsi, "UTF-8"));
                    while ((line = in.readLine()) != null) {
                        kClusters.add(new Feature(line));
                    }
                    // Close this file's streams before moving on to the next.
                    in.close();
                    fsi.close();
                }
            }
            // Pad with zero vectors so there are always k centers.
            while (kClusters.size() < k) {
                kClusters.add(new Feature(feature_size));
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Feature feature = new Feature(value.toString());
            int temp_k = 0;
            double minDis = Double.MAX_VALUE;
            // Assign the point to the nearest center (Euclidean distance).
            for (int i = 0; i < kClusters.size(); i++) {
                Feature point = kClusters.get(i);
                double temp = 0;
                for (int j = 0; j < feature_size; j++) {
                    temp += Math.pow(point.get(j) - feature.get(j), 2);
                }
                double newDis = Math.sqrt(temp);
                if (newDis < minDis) {
                    minDis = newDis;
                    temp_k = i;
                }
            }
            context.write(new IntWritable(temp_k), feature);
        }
    }
    public static class KmeansReducer extends Reducer<IntWritable,Feature, NullWritable,Feature>{
        private Integer k;
        private Integer feature_size;
        private ArrayList<Feature> kClusters = new ArrayList<Feature>(); // cluster centers from the previous iteration
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);
            k = Integer.parseInt(context.getConfiguration().get("cluster_k"));
            feature_size = Integer.parseInt(context.getConfiguration().get("feature_size"));
            FileSystem fileSystem = FileSystem.get(context.getConfiguration());
            FileStatus[] fileList = fileSystem.listStatus(new Path(context.getConfiguration().get("clusterPath")));
            String line;
            for (int i = 0; i < fileList.length; i++) {
                if (!fileList[i].isDirectory()) {
                    // Read the current cluster centers, one per line.
                    FSDataInputStream fsi = fileSystem.open(fileList[i].getPath());
                    BufferedReader in = new BufferedReader(new InputStreamReader(fsi, "UTF-8"));
                    while ((line = in.readLine()) != null) {
                        kClusters.add(new Feature(line));
                    }
                    // Close this file's streams before moving on to the next.
                    in.close();
                    fsi.close();
                }
            }
            // Pad with zero vectors so there are always k centers.
            while (kClusters.size() < k) {
                kClusters.add(new Feature(feature_size));
            }
        }

        @Override
        protected void reduce(IntWritable key, Iterable<Feature> values, Context context) throws IOException, InterruptedException {
            // Average all points assigned to this cluster to get the new center.
            ArrayList<Double> mean_feature = new ArrayList<Double>();
            for (int i = 0; i < feature_size; i++) {
                mean_feature.add(0.0);
            }
            int count = 0;
            for (Feature value : values) {
                count += 1;
                for (int j = 0; j < feature_size; j++) {
                    mean_feature.set(j, mean_feature.get(j) + value.get(j));
                }
            }
            if (count > 0) {
                for (int i = 0; i < feature_size; i++) {
                    mean_feature.set(i, mean_feature.get(i) / count);
                }
                context.write(NullWritable.get(), new Feature(mean_feature));
            } else {
                // Empty cluster: keep the previous center unchanged.
                context.write(NullWritable.get(), kClusters.get(key.get()));
            }
        }
    }
}
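
The mapper and reducer above still need a driver that sets cluster_k, feature_size, and clusterPath and runs one Job per iteration; the original post does not show one. A minimal sketch, assuming a fixed iteration count and illustrative KMeans_out/iter<n> paths (KMeansDriver itself is not from the original code):

package edu.bupt.kmeans;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeansDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://hadoop1:9000/");
        conf.set("cluster_k", "6");
        conf.set("feature_size", "2");
        for (int iter = 0; iter < 10; iter++) { // fixed iteration count for simplicity
            // The first iteration reads the KMeans++ seeds; later ones read the
            // centers written by the previous job.
            String centers = (iter == 0) ? "KMeans_out/123.txt" : "KMeans_out/iter" + iter;
            conf.set("clusterPath", centers);
            Job job = Job.getInstance(conf, "kmeans-iter-" + iter);
            job.setJarByClass(KMeansDriver.class);
            job.setMapperClass(KMeans.KmeansMapper.class);
            job.setReducerClass(KMeans.KmeansReducer.class);
            job.setMapOutputKeyClass(IntWritable.class);
            job.setMapOutputValueClass(Feature.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Feature.class);
            FileInputFormat.addInputPath(job, new Path("KMeans_in/"));
            FileOutputFormat.setOutputPath(job, new Path("KMeans_out/iter" + (iter + 1)));
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        }
    }
}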

KMeanspp.java


package edu.bupt.kmeans;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;

public final class KMeanspp {

    private int k; // number of clusters
    private int feature_size; // feature dimensionality
    private FileStatus[] fileList; // files under the source data directory
    private FileSystem fs; // handle for the source data file system
    private ArrayList<Feature> kClusters; // the set of initial cluster centers
    private Configuration conf; // Hadoop configuration
    private String filepath;


    public KMeanspp(Configuration conf,String filePath,int k,int feature_size){
        this.k = k;
        this.filepath = filePath;
        this.feature_size = feature_size;
        try {
            fs = FileSystem.get(conf);
            fileList = fs.listStatus((new Path(filePath)));
            // Seed the center list with a zero vector so the first roulette pass
            // has a reference point; only the k real centers are written out.
            kClusters = new ArrayList<Feature>();
            kClusters.add(new Feature(feature_size));
            this.conf = conf;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Euclidean distance between two features.
    public double getDis(Feature feature1, Feature feature2) {
        double newDis = 0.0;
        for (int i = 0; i < feature_size; i++) {
            newDis += Math.pow(feature1.get(i) - feature2.get(i), 2);
        }
        return Math.sqrt(newDis);
    }

    // Append one line of text to the given HDFS file, creating it if necessary.
    public void writeLineToTemp(String s, String temp){
        Path hdfsPath = new Path(temp);
        FSDataOutputStream fileOutputStream = null;
        try {
            if (!fs.exists(hdfsPath)) {
                fileOutputStream = fs.create(hdfsPath,false);
            }else{
                fileOutputStream = fs.append(hdfsPath);
            }
            fileOutputStream.writeBytes(s+"\n");
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            if(fileOutputStream!=null){
                try {
                    fileOutputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }


    public Feature getNextCluster() throws IOException {

        Text line = new Text();
        FSDataInputStream fsi = null;

        double count = 0;
        try {
            if (fs.exists(new Path(filepath+"temp.txt"))){
                fs.delete(new Path(filepath+"temp.txt"),false);
            }
            // Pass 1: for every point, find the distance to its nearest existing
            // center, accumulate the total, and append "point distance" to temp.txt.
            for (int i = 0; i < fileList.length; i++){
                if (!(fileList[i].isDirectory() || fileList[i].getPath().equals(new Path(filepath+"temp.txt")))){
                    fsi = fs.open(fileList[i].getPath());
                    LineReader lineReader = new LineReader(fsi,conf);
                    while (lineReader.readLine(line) > 0){
                        if (line.toString().length() == 0){
                            continue;
                        }
                        Feature point = new Feature(line.toString());
                        double disMin = Double.MAX_VALUE;
                        for (Feature point2 : kClusters){
                            double dis = getDis(point, point2);
                            if (dis < disMin){
                                disMin = dis;
                            }
                        }
                        count += disMin;
                        writeLineToTemp(point.toString()+" "+Double.toString(disMin), filepath+"temp.txt");
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (fsi != null) {
                    fsi.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Roulette wheel: draw a threshold uniformly in [0, count).
        int result = new Random().nextInt((int) count);
        Feature newcluster = new Feature();

        try {
            // Pass 2: walk temp.txt, subtracting each point's distance from the
            // threshold; the point that exhausts it becomes the new center.
            Path hdfsPath = new Path(filepath+"temp.txt");
            fsi = fs.open(hdfsPath);
            LineReader lineReader = new LineReader(fsi,conf);
            while (lineReader.readLine(line) > 0 && result > 0) {
                if (line.toString().length() == 0) {
                    continue;
                }
                String[] l1 = line.toString().split(" ");
                double dis = Double.parseDouble(l1[l1.length-1]);
                StringBuilder fea = new StringBuilder();
                for (int i = 0; i < l1.length-1; i++){
                    fea.append(l1[i]).append(" ");
                }
                result -= dis;
                newcluster = new Feature(fea.toString().trim());
            }
            return newcluster;

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                fsi.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    public void writeNextCluster(String firstClusterPath) throws IOException {
        // Keep selecting until there are k real centers plus the zero-vector
        // seed, appending each new center to firstClusterPath.
        while (kClusters.size() < k+1){
            Feature newCluster = getNextCluster();
            kClusters.add(newCluster);
            writeLineToTemp(newCluster.toString(), firstClusterPath);
        }
        if (fs.exists(new Path(filepath+"temp.txt"))){
            fs.delete(new Path(filepath+"temp.txt"),false);
        }
    }
    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://hadoop1:9000/");
        KMeanspp test = new KMeanspp(conf,"KMeans_in/",6,2);
        test.writeNextCluster("KMeans_out/123.txt");
    }

}
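
Putting the pieces together: KMeanspp runs first so that KMeans_out/123.txt holds the k initial centers; the clusterPath property of the MapReduce job is then pointed at that file, and the job is iterated, feeding each round's output centers back in as the next round's clusterPath until the centers stop moving.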
