K-Means算法的Hadoop实现

最新推荐文章于 2024-08-08 10:34:27 发布

想作会飞的鱼

最新推荐文章于 2024-08-08 10:34:27 发布

阅读量7.9k

点赞数 4

分类专栏：云计算Hadoop 推荐系统设计 文章标签：mapreduce 推荐算法 协同过滤

本文链接：https://blog.csdn.net/xiaokang123456kao/article/details/74896725

版权

本文介绍了如何使用MapReduce实现K-Means算法，包括基本流程：Map阶段根据中心分配数据，Reduce阶段计算新中心。通过代码演示，展示了辅助工具类、MapReduce任务代码，并提供了输入数据格式和测试结果。

摘要由CSDN通过智能技术生成

一、MapReduce实现K-Means算法的基本流程

1、Map每读取一条数据就与所有选定的中心做对比，求出该条记录对应的中心，然后以中心的ID为Key，该条数据为value将数据输出。

2、利用reduce的归并功能将相同的Key归并到一起（因为map把中心的ID作为key，所以在reduce端数据已经按照各自的中心分好了组，这是MapReduce框架自动完成的），汇集该Key对应的数据，再求出这些数据的平均值并输出，以便选取新的中心。

3、将reduce求出的平均值与原来（或上一次）的中心进行对比：如果不相同，则清空原中心的数据文件，并将reduce的结果写入中心文件（中心的值存放在HDFS的一个文件中），同时删掉reduce的输出目录以便下次输出；然后继续循环执行步骤1、2、3。

4、将reduce求出的平均值与原来的中心进行对比：如果相同，则删掉reduce的输出目录，并运行一个没有reduce的任务，将中心ID与值对应输出。

二、代码演示

1、辅助工具类

package com.kang.kmeans;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class Utils {

    //读取中心文件的数据
    /**
     * Loads cluster centers from a single file, or from every entry of a directory.
     *
     * <p>Each non-blank line of a file is parsed by {@code textToArray} into one
     * center, i.e. a list of the comma-separated numeric fields on that line.
     * Blank lines are skipped instead of failing the whole load.
     *
     * @param centersPath path of the centers file, or of the directory to scan
     *                    when {@code isDirectory} is {@code true}
     * @param isDirectory when {@code true}, every entry listed under
     *                    {@code centersPath} is read as a file and the results
     *                    are concatenated in listing order
     * @return the parsed centers, one inner list per line; empty if nothing was read
     * @throws IOException if the path cannot be listed, opened, read or closed
     * @throws NumberFormatException if a field is not a valid double
     */
    public static ArrayList<ArrayList<Double>> getCentersFromHDFS(String centersPath,boolean isDirectory) throws IOException{

        ArrayList<ArrayList<Double>> result = new ArrayList<ArrayList<Double>>();

        Path path = new Path(centersPath);
        Configuration conf = new Configuration();
        FileSystem fileSystem = path.getFileSystem(conf);

        if(isDirectory){
            // Read each listed entry as a plain file and append its centers.
            for (FileStatus entry : fileSystem.listStatus(path)) {
                result.addAll(getCentersFromHDFS(entry.getPath().toString(),false));
            }
            return result;
        }

        FSDataInputStream fsis = fileSystem.open(path);
        LineReader lineReader = null;
        try {
            lineReader = new LineReader(fsis, conf);
            Text line = new Text();
            // readLine returns the number of bytes consumed; 0 signals end of file.
            while(lineReader.readLine(line) > 0){
                if (line.toString().trim().isEmpty()) {
                    // A blank line carries no center; parsing it would throw.
                    continue;
                }
                result.add(textToArray(line));
            }
        } finally {
            // Release the stream even when reading or parsing fails.
            if (lineReader != null) {
                lineReader.close();
            } else {
                fsis.close();
            }
        }
        return result;
    }

    //删掉文件
    /**
     * Deletes the given path on the file system that owns it.
     *
     * <p>The delete is issued with the recursive flag set, and the boolean
     * result of the delete call is ignored.
     *
     * @param pathStr path of the file or directory to delete
     * @throws IOException if the file system cannot be resolved or the delete fails
     */
    public static void deletePath(String pathStr) throws IOException{
        final Path target = new Path(pathStr);
        final FileSystem owner = target.getFileSystem(new Configuration());
        final boolean recursive = true;
        owner.delete(target, recursive);
    }

    //文本string类型转为数组类型
    /**
     * Parses a comma-separated line of numbers into a list of doubles.
     *
     * @param text the line to parse; its string form is split on {@code ","}
     * @return the parsed values, in the order they appear on the line
     * @throws NumberFormatException if any field is not a valid double
     */
    public static ArrayList<Double> textToArray(Text text){
        final String[] tokens = text.toString().split(",");
        final ArrayList<Double> values = new ArrayList<Double>(tokens.length);
        for (String token : tokens) {
            values.add(Double.valueOf(token));
        }
        return values;
    }

    //比较新旧中心点的变化情况
    public static boolean compareCenters(String centerPath,String newPath) throws IOException{

        List<ArrayList<Double>> oldCenters = Utils.getCentersFromHDFS(centerPath,false);
        List<ArrayList<Double>> newCenters = Utils.getCentersFromHDFS(newPath,true);

        int size = oldCenters.size();
        int fildSize = oldCenters.get(0).size();
        double distance = 0;
        for(int i=0;i<size;i++){
            for(int j=1;j<fildSize;j++){
                double t1 = Math.abs(oldCenters.get(i).get(j));
                double t2 = Math.abs(newCenters.get(i).get(j));
                distance += Math.pow((t1 - t2) / (t1 + t2), 2);//根据具体情况选择相应的距离算法，这里简化处理
            }
        }

        if(distance == 0.0){
            //删掉新的中心文件以便最后依次归类输出
            Utils.deletePath(newPath);
            return true;
        }else{
            //先清空中心文件，将新的中心文件复制到中心文件中，再删掉中心文件

            Configuration conf = new Configuration();
            Path outPath = new Path(centerPath);
            FileSystem fileSystem = outPath.getFileSystem(conf);

            FSDataOutputStream overWrite = fileSystem.create(outPath,true);
            overWrite.writeChars("");
            overWrite.close();


            Path inPath = new Path(newPath);
            FileStatus[] listFiles = fileSystem.listStatus(inPath);
            for (int i = 0; i < listFiles.length; i++) {                
                FSDataOutputStream out = fileSystem.create(outPath);
                FSDataInputStream in = fileSystem.open(listFiles[i].getPath());
                IOUtils.copyBytes(in, out, 4096, true);
            }
            //删掉新的中心文件以