在Hadoop分布式环境下实现K-Means聚类算法的伪代码如下:
输入:参数0--存储样本数据的文本文件inputfile;
参数1--存储样本数据的SequenceFile文件inputPath;
参数2--存储质心数据的SequenceFile文件centerPath;
参数3--存储聚类结果文件(SequenceFile文件)所处的路径clusterPath;
参数4--类的数量k;
输出:k个类
Begin
读取inputPath,从中选取前k个点作为初始质心,将质心数据写入centerPath;
While 聚类终止条件不满足
在Mapper阶段,读取inputPath,对于key所对应的点,遍历所有的质心,选择最近的质心,将该质心的编号作为键,
该点的编号作为值传递给Reducer;
在Reducer阶段,将Mapper阶段传递过来的值根据键归并输出,结果写入clusterPath;
读取clusterPath,重新计算质心,将结果写入centerPath;
EndWhile
End
判断聚类效果好坏的常见指标是下述的准则函数(误差平方和)值:$E=\sum_{i=1}^{k}\sum_{p\in C_i}\lVert p-m_i\rVert^2$,其中 $C_i$ 是第 $i$ 个簇,$m_i$ 是 $C_i$ 的质心,$\lVert p-m_i\rVert^2$ 是点 $p$ 到质心 $m_i$ 的距离的平方。
有理由认为上述值越小,聚类效果越好,随着循环的不断进行,上述准则函数值会收敛到一个很小的值,所以可以用这个值不再明显变化作为聚类循环的终止条件。
以下是存储样本数据(总共200个点)的本地文件kmeans.txt的部分片段(10个点):
163 61 20
17 34 25
66 7 10
14 34 34
128 5 41
49 33 24
185 58 20
83 8 14
54 3 17
96 1 13
其中第一个字段为点的id,第二个字段是点的横坐标,第三个字段是点的纵坐标。
将上述点可视化,见下图:
为了便于访问待聚类的点的ID及其坐标,将输入样本数据存储在SequenceFile格式的文件中,
其中key是点的ID,数据类型为Text,点的坐标是一个double[]型的数组,将该数组封装在类DoubleArray中,这个类需要实现Writable接口,
类DoubleArray的定义如下:DoubleArray.java
package kmeans;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Writable wrapper around a double[] holding the coordinates of one point
 * (or one centroid). Used as the value type of the input SequenceFile.
 */
public class DoubleArray implements Writable {

    private double[] data; // coordinates; length = dimensionality of the space

    /** No-arg constructor required by the Writable deserialization contract. */
    public DoubleArray() { }

    public DoubleArray(double[] data) {
        set(data);
    }

    public void set(double[] data) {
        this.data = data;
    }

    public double[] get() {
        return data;
    }

    /** Serializes the element count followed by each element; a null array writes length 0. */
    public void write(DataOutput out) throws IOException {
        int length = (data == null) ? 0 : data.length;
        out.writeInt(length);
        for (int i = 0; i < length; i++) {
            out.writeDouble(data[i]);
        }
    }

    /** Deserializes into a freshly allocated array, mirroring write(). */
    public void readFields(DataInput in) throws IOException {
        int length = in.readInt();
        data = new double[length];
        for (int i = 0; i < length; i++) {
            data[i] = in.readDouble();
        }
    }

    /**
     * Returns the SQUARED Euclidean distance to another point. The square root
     * is deliberately never taken: squared distances preserve nearest-centroid
     * ordering, and the criterion function sums squared distances anyway.
     */
    public double distanceTo(DoubleArray point) {
        double[] other = point.get();
        double sum = 0;
        for (int i = 0; i < data.length; i++) {
            sum += Math.pow(data[i] - other[i], 2);
        }
        return sum;
    }

    /** Adds the other point's coordinates element-wise into this point. */
    public void plus(DoubleArray point) {
        double[] other = point.get();
        for (int i = 0; i < data.length; i++) {
            data[i] += other[i];
        }
    }

    /** Divides every coordinate by n — turns a coordinate sum into a mean. */
    public void averageN(int n) {
        for (int i = 0; i < data.length; i++) {
            data[i] /= n;
        }
    }
}
在Mapper阶段,为了便于计算准则函数的值,需要向Reducer传递隶属于某个质心的点的编号以及该点到该质心的距离的平方,为此将这两项数据封装在类IdAndDistance中,该类需要实现Writable接口,代码如下:IdAndDistance.java
package kmeans;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Mapper output value: a point's id paired with the squared distance from
 * that point to its nearest centroid. The distance is carried along so the
 * reducer can accumulate the per-cluster criterion-function contribution.
 */
public class IdAndDistance implements Writable {

    private String id;        // point id
    private double distance;  // squared distance to the assigned centroid

    /** No-arg constructor required by the Writable deserialization contract. */
    public IdAndDistance() { }

    public IdAndDistance(String id, double distance) {
        set(id, distance);
    }

    public void set(String id, double distance) {
        this.id = id;
        this.distance = distance;
    }

    public String getId() {
        return id;
    }

    public double getDistance() {
        return distance;
    }

    /** Writes id then distance; readFields must consume in the same order. */
    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeDouble(distance);
    }

    /** Reads id then distance, mirroring write(). */
    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        distance = in.readDouble();
    }
}
Mapper阶段代码:KMeansMapper.java
package kmeans;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.ReflectionUtils;
public class KMeansMapper extends Mapper<Text, DoubleArray, Text, IdAndDistance> {
private DoubleArray[] centers = null;
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
centers = new DoubleArray[conf.getInt("numberOfCenters", 4)];
String centerPath = conf.get("centerPath");
FileSystem fs = FileSystem.get(URI.create(centerPath), conf);
Path path = new Path(centerPath);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
Text key = (Text) ReflectionUtils.newInstance(Text.class, conf);
DoubleArray value = (DoubleArray) ReflectionUtils.newInstance(DoubleArray.class, conf);
try {
while (reader.next(key, value)) {
int index = Integer.parseInt(key.toString());
double[] shuzu = value.get();
centers[index] = new DoubleArray(shuzu);
}
} finally {
IOUtils.closeStream(reader);
}
}
public void map(Text key, DoubleArray value, Context context) throws IOException, InterruptedException { double minDistance = Double.MAX_VALUE;
int nearestCenter = 0;
for (int i = 0; i < centers.length; i++) {
if (value.distanceTo(centers[i]) < minDistance) {
nearestCenter = i;
minDistance = value.distanceTo(centers[i]);
}
}
context.write(new Text(String.valueOf(nearestCenter)), new IdAndDistance(key.toString(),minDistance)); } }
Reducer阶段代码:KMeansReducer.java
package kmeans;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Reducer;
public class KMeansReducer extends Reducer<Text, IdAndDistance, Text, Text>{
public void reduce(Text key, Iterable<IdAndDistance> values, Context context) throws IOException, InterruptedException {
double sumOfDistance = 0;
Iterator<IdAndDistance> ite = values.iterator(); String cluster = ""; while (ite.hasNext()) { IdAndDistance temp = WritableUtils.clone(ite.next(), context.getConfiguration()); if (cluster.length() > 0) cluster = cluster + ","; cluster = cluster + temp.getId(); sumOfDistance = sumOfDistance + temp.getDistance(); } cluster = cluster + "," + String.valueOf(sumOfDistance); context.write(key, new Text(cluster)); } }
驱动程序代码:KMeansDriver.java
package kmeans;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Driver for iterative K-Means on Hadoop. Converts the local sample file to a
 * SequenceFile, picks the first k points as initial centroids, then repeatedly
 * runs a map/reduce job and recomputes centroids until the criterion value
 * (sum of squared distances to assigned centroids) stops changing noticeably.
 *
 * Usage: KMeansDriver <localInputFile> <inputPath> <centerPath> <clusterPath> <k>
 */
public class KMeansDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.setInt("numberOfCenters", Integer.parseInt(args[4]));
        String inputfile = args[0];   // local text file, lines of "id\tx\ty"
        String inputPath = args[1];   // HDFS SequenceFile holding the sample points
        String centerPath = args[2];  // HDFS SequenceFile holding the centroids
        String clusterPath = args[3]; // HDFS output directory for the clustering
        conf.set("centerPath", centerPath);

        double s = 0;                 // |previous criterion - current criterion|
        double s1 = Double.MAX_VALUE; // criterion value of the previous iteration
        double shold = 0.1;           // convergence threshold
        int times = 0;

        writeToSeq(conf, inputPath, inputfile); // convert samples to SequenceFile
        System.out.println("Begin to generate centers");
        // Generate the initial centroids and learn the points' dimensionality.
        int dimention = centersInitial(conf, inputPath, centerPath);
        System.out.println("Generating centers for MRJob " + times + " successfully");
        conf.setInt("dimention", dimention);
        FileSystem fs = FileSystem.get(conf);
        Job job = null;
        do {
            System.out.println("MRJob-----------------------------" + times);
            fs.delete(new Path(clusterPath), true); // drop previous iteration's output
            job = new Job(conf);
            job.setJarByClass(KMeansDriver.class);
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setMapperClass(KMeansMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IdAndDistance.class);
            job.setReducerClass(KMeansReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
            SequenceFileOutputFormat.setOutputPath(job, new Path(clusterPath));
            // BUG FIX: call waitForCompletion exactly once. The original called it
            // twice in a row, which ran/logged every job a second time (visible as
            // duplicated counter blocks in the job output).
            if (job.waitForCompletion(true)) {
                fs.delete(new Path(centerPath), true);
                System.out.println("Begin to generate centers");
                // Recompute centroids from the clustering; returns the criterion value.
                double s2 = newCenters(conf, inputPath, centerPath, clusterPath);
                System.out.println("s2 = " + s2);
                times++;
                s = Math.abs(s1 - s2);
                System.out.println("Generating centers for MRJob " + times + " successfully");
                System.out.println("s = " + s);
                s1 = s2;
            }
        } while (s > shold); // stop once the criterion value no longer changes noticeably
        writeClustersToLocalFile(conf, clusterPath); // dump clustering to a local file
    }

    /**
     * Converts the local tab-separated sample file ("id\tx\ty" per line) into a
     * SequenceFile of (Text id, DoubleArray [x, y]) records at inputPath.
     */
    public static void writeToSeq(Configuration conf, String inputPath, String inputfile)
            throws IOException {
        FileSystem fs = FileSystem.get(URI.create(inputPath), conf);
        Path path = new Path(inputPath);
        SequenceFile.Writer writer =
                new SequenceFile.Writer(fs, conf, path, Text.class, DoubleArray.class);
        File file = new File(inputfile);
        Scanner input = new Scanner(file);
        try {
            // FIX: hasNextLine() matches the nextLine() consumption below.
            while (input.hasNextLine()) {
                // BUG FIX: fields are tab-separated; the original split on the letter 't'.
                String[] line = input.nextLine().split("\t");
                Text key = new Text(line[0]);
                double[] data = new double[2];
                data[0] = Double.parseDouble(line[1]);
                data[1] = Double.parseDouble(line[2]);
                writer.append(key, new DoubleArray(data));
            }
        } finally {
            input.close();
            writer.close();
        }
    }

    /**
     * Writes the first k input points to centerPath as the initial centroids,
     * keyed by centroid index, and returns the dimensionality of the points.
     */
    public static int centersInitial(Configuration conf, String inputPath, String centerPath)
            throws IOException {
        int dimention = 0;
        FileSystem inputPathFs = FileSystem.get(URI.create(inputPath), conf);
        Path path1 = new Path(inputPath);
        SequenceFile.Reader inputPathReader = new SequenceFile.Reader(inputPathFs, path1, conf);
        FileSystem centerPathFs = FileSystem.get(URI.create(centerPath), conf);
        Path path2 = new Path(centerPath);
        SequenceFile.Writer centerPathWriter =
                new SequenceFile.Writer(centerPathFs, conf, path2, Text.class, DoubleArray.class);
        Text key = (Text) ReflectionUtils.newInstance(Text.class, conf);
        DoubleArray value = (DoubleArray) ReflectionUtils.newInstance(DoubleArray.class, conf);
        try {
            int k = 0;
            // Improvement idea: sample k random points instead of taking the first k.
            while (inputPathReader.next(key, value)) {
                if (k < conf.getInt("numberOfCenters", 5)) {
                    centerPathWriter.append(new Text(String.valueOf(k)), value);
                    dimention = value.get().length;
                    // BUG FIX: print a real tab; the original printed the letter 't'.
                    System.out.println("center\t" + k + "\t("
                            + (value.get())[0] + "," + (value.get())[1] + ")");
                } else {
                    break;
                }
                k = k + 1;
            }
        } finally {
            IOUtils.closeStream(inputPathReader);
        }
        centerPathWriter.close();
        return dimention;
    }

    /**
     * Reads the clustering output (part-r-00000 under clusterPath), recomputes
     * each centroid as the mean of its member points, writes the new centroids
     * to centerPath, and returns the criterion value (sum over clusters of the
     * per-cluster squared-distance sums emitted by the reducer).
     */
    public static double newCenters(Configuration conf, String inputPath, String centerPath,
            String clusterPath) throws IOException {
        double s = 0;
        int numberOfCenters = conf.getInt("numberOfCenters", 4);
        String[] clusters = new String[numberOfCenters];
        DoubleArray[] centers = new DoubleArray[numberOfCenters];
        for (int i = 0; i < centers.length; i++) {
            // new double[] is zero-initialized; accumulate coordinate sums into it.
            centers[i] = new DoubleArray(new double[conf.getInt("dimention", 1)]);
        }
        // Pass 1: read each cluster line "id1,...,idn,distanceSum".
        FileSystem clusterPathFs = FileSystem.get(URI.create(clusterPath + "/part-r-00000"), conf);
        Path path = new Path(clusterPath + "/part-r-00000");
        SequenceFile.Reader clusterReader = new SequenceFile.Reader(clusterPathFs, path, conf);
        Text clusterKey = (Text) ReflectionUtils.newInstance(Text.class, conf);
        Text clusterValue = (Text) ReflectionUtils.newInstance(Text.class, conf);
        try {
            while (clusterReader.next(clusterKey, clusterValue)) {
                clusters[Integer.parseInt(clusterKey.toString())] = clusterValue.toString();
                // The final comma-separated field is this cluster's distance sum.
                int indexOfDistance = clusterValue.toString().lastIndexOf(",") + 1;
                s += Double.parseDouble(clusterValue.toString().substring(indexOfDistance));
            }
        } finally {
            IOUtils.closeStream(clusterReader);
        }
        // Pass 2: sum the coordinates of every point into its cluster's centroid.
        FileSystem inputPathFs = FileSystem.get(URI.create(inputPath), conf);
        Path path1 = new Path(inputPath);
        SequenceFile.Reader inputPathReader = new SequenceFile.Reader(inputPathFs, path1, conf);
        Text inputKey = (Text) ReflectionUtils.newInstance(Text.class, conf);
        DoubleArray inputValue = (DoubleArray) ReflectionUtils.newInstance(DoubleArray.class, conf);
        try {
            while (inputPathReader.next(inputKey, inputValue)) {
                for (int i = 0; i < numberOfCenters; i++) {
                    String id = inputKey.toString();
                    // Membership test: id appears as first, middle, or last
                    // comma-delimited token of the cluster string.
                    if (clusters[i].indexOf(id + ",") == 0
                            || clusters[i].indexOf("," + id + ",") > 0
                            || clusters[i].indexOf("," + id)
                                    == clusters[i].length() - ("," + id).length()) {
                        centers[i].plus(inputValue);
                    }
                }
            }
        } finally {
            IOUtils.closeStream(inputPathReader);
        }
        for (int i = 0; i < numberOfCenters; i++) {
            // BUG FIX: the cluster string is "id1,...,idn,distanceSum", so the number
            // of member points is (token count - 1); the original divided by the full
            // token count, biasing every centroid toward the origin.
            centers[i].averageN(clusters[i].split(",").length - 1);
            // BUG FIX: print a real tab; the original printed the letter 't'.
            System.out.println("center\t" + i + "\t("
                    + (centers[i].get())[0] + "," + (centers[i].get())[1] + ")");
        }
        FileSystem centerPathFs = FileSystem.get(URI.create(centerPath), conf);
        Path path2 = new Path(centerPath);
        SequenceFile.Writer centerPathWriter =
                new SequenceFile.Writer(centerPathFs, conf, path2, Text.class, DoubleArray.class);
        for (int i = 0; i < numberOfCenters; i++) {
            centerPathWriter.append(new Text(String.valueOf(i)), centers[i]);
        }
        centerPathWriter.close();
        return s;
    }

    /**
     * Writes the final clustering to a local text file, one "clusterId<TAB>pointId"
     * line per point. The trailing distance field of each cluster line is skipped.
     */
    public static void writeClustersToLocalFile(Configuration conf, String clusterPath)
            throws IOException {
        // NOTE(review): hard-coded local output path kept for compatibility with
        // the visualization script; consider making it a program argument.
        File file = new File("/home/liujun/kmeans_clusters.txt");
        PrintWriter output = new PrintWriter(file);
        FileSystem clusterPathFs = FileSystem.get(URI.create(clusterPath + "/part-r-00000"), conf);
        Path path = new Path(clusterPath + "/part-r-00000");
        SequenceFile.Reader clusterReader = new SequenceFile.Reader(clusterPathFs, path, conf);
        Text clusterKey = (Text) ReflectionUtils.newInstance(Text.class, conf);
        Text clusterValue = (Text) ReflectionUtils.newInstance(Text.class, conf);
        try {
            while (clusterReader.next(clusterKey, clusterValue)) {
                String[] line = clusterValue.toString().split(",");
                // line.length - 1 skips the distance-sum field at the end.
                for (int i = 0; i < line.length - 1; i++) {
                    // BUG FIX: write a real tab and newline; the original printed
                    // the letters 't' and 'n'.
                    output.print(clusterKey.toString() + "\t");
                    output.print(line[i] + "\n");
                }
            }
        } finally {
            IOUtils.closeStream(clusterReader);
        }
        output.close();
    }
}
代码运行步骤:
第一步:将代码编译打包,生成kmeans.jar;
第二步:在单机伪分布式环境下执行下述指令:
liujun@liujun-Rev-1-0:~$ hadoop jar ~/kmeans.jar kmeans.KMeansDriver ~/kmeans.txt /in/kmeans.seq /cen/centers.seq /out 4
经过3分钟的漫长等待,代码执行完毕,下面是最后一轮循环的执行片段:
MRJob-----------------------------6
14/11/30 22:14:36 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
14/11/30 22:14:36 INFO input.FileInputFormat: Total input paths to process : 1
14/11/30 22:14:36 INFO mapred.JobClient: Running job: job_201411301328_0050
14/11/30 22:14:37 INFO mapred.JobClient: map 0% reduce 0%
14/11/30 22:14:51 INFO mapred.JobClient: map 100% reduce 0%
14/11/30 22:15:03 INFO mapred.JobClient: map 100% reduce 100%
14/11/30 22:15:08 INFO mapred.JobClient: Job complete: job_201411301328_0050
14/11/30 22:15:08 INFO mapred.JobClient: Counters: 29
14/11/30 22:15:08 INFO mapred.JobClient: Map-Reduce Framework
14/11/30 22:15:08 INFO mapred.JobClient: Spilled Records=400
14/11/30 22:15:08 INFO mapred.JobClient: Map output materialized bytes=3296
14/11/30 22:15:08 INFO mapred.JobClient: Reduce input records=200
14/11/30 22:15:08 INFO mapred.JobClient: Virtual memory (bytes) snapshot=1066209280
14/11/30 22:15:08 INFO mapred.JobClient: Map input records=200
14/11/30 22:15:08 INFO mapred.JobClient: SPLIT_RAW_BYTES=100
14/11/30 22:15:08 INFO mapred.JobClient: Map output bytes=2890
14/11/30 22:15:08 INFO mapred.JobClient: Reduce shuffle bytes=3296
14/11/30 22:15:08 INFO mapred.JobClient: Physical memory (bytes) snapshot=239747072
14/11/30 22:15:08 INFO mapred.JobClient: Reduce input groups=4
14/11/30 22:15:08 INFO mapred.JobClient: Combine output records=0
14/11/30 22:15:08 INFO mapred.JobClient: Reduce output records=4
14/11/30 22:15:08 INFO mapred.JobClient: Map output records=200
14/11/30 22:15:08 INFO mapred.JobClient: Combine input records=0
14/11/30 22:15:08 INFO mapred.JobClient: CPU time spent (ms)=2500
14/11/30 22:15:08 INFO mapred.JobClient: Total committed heap usage (bytes)=197918720
14/11/30 22:15:08 INFO mapred.JobClient: File Input Format Counters
14/11/30 22:15:08 INFO mapred.JobClient: Bytes Read=6421
14/11/30 22:15:08 INFO mapred.JobClient: FileSystemCounters
14/11/30 22:15:08 INFO mapred.JobClient: HDFS_BYTES_READ=6712
14/11/30 22:15:08 INFO mapred.JobClient: FILE_BYTES_WRITTEN=50553
14/11/30 22:15:08 INFO mapred.JobClient: FILE_BYTES_READ=3296
14/11/30 22:15:08 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=885
14/11/30 22:15:08 INFO mapred.JobClient: Job Counters
14/11/30 22:15:08 INFO mapred.JobClient: Launched map tasks=1
14/11/30 22:15:08 INFO mapred.JobClient: Launched reduce tasks=1
14/11/30 22:15:08 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10625
14/11/30 22:15:08 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
14/11/30 22:15:08 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=13662
14/11/30 22:15:08 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
14/11/30 22:15:08 INFO mapred.JobClient: Data-local map tasks=1
14/11/30 22:15:08 INFO mapred.JobClient: File Output Format Counters
14/11/30 22:15:08 INFO mapred.JobClient: Bytes Written=885
14/11/30 22:15:08 INFO mapred.JobClient: Running job: job_201411301328_0050
14/11/30 22:15:08 INFO mapred.JobClient: Job complete: job_201411301328_0050
14/11/30 22:15:08 INFO mapred.JobClient: Counters: 29
14/11/30 22:15:08 INFO mapred.JobClient: Map-Reduce Framework
14/11/30 22:15:08 INFO mapred.JobClient: Spilled Records=400
14/11/30 22:15:08 INFO mapred.JobClient: Map output materialized bytes=3296
14/11/30 22:15:08 INFO mapred.JobClient: Reduce input records=200
14/11/30 22:15:08 INFO mapred.JobClient: Virtual memory (bytes) snapshot=1066209280
14/11/30 22:15:08 INFO mapred.JobClient: Map input records=200
14/11/30 22:15:08 INFO mapred.JobClient: SPLIT_RAW_BYTES=100
14/11/30 22:15:08 INFO mapred.JobClient: Map output bytes=2890
14/11/30 22:15:08 INFO mapred.JobClient: Reduce shuffle bytes=3296
14/11/30 22:15:08 INFO mapred.JobClient: Physical memory (bytes) snapshot=239747072
14/11/30 22:15:08 INFO mapred.JobClient: Reduce input groups=4
14/11/30 22:15:08 INFO mapred.JobClient: Combine output records=0
14/11/30 22:15:08 INFO mapred.JobClient: Reduce output records=4
14/11/30 22:15:08 INFO mapred.JobClient: Map output records=200
14/11/30 22:15:08 INFO mapred.JobClient: Combine input records=0
14/11/30 22:15:08 INFO mapred.JobClient: CPU time spent (ms)=2500
14/11/30 22:15:08 INFO mapred.JobClient: Total committed heap usage (bytes)=197918720
14/11/30 22:15:08 INFO mapred.JobClient: File Input Format Counters
14/11/30 22:15:08 INFO mapred.JobClient: Bytes Read=6421
14/11/30 22:15:08 INFO mapred.JobClient: FileSystemCounters
14/11/30 22:15:08 INFO mapred.JobClient: HDFS_BYTES_READ=6712
14/11/30 22:15:08 INFO mapred.JobClient: FILE_BYTES_WRITTEN=50553
14/11/30 22:15:08 INFO mapred.JobClient: FILE_BYTES_READ=3296
14/11/30 22:15:08 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=885
14/11/30 22:15:08 INFO mapred.JobClient: Job Counters
14/11/30 22:15:08 INFO mapred.JobClient: Launched map tasks=1
14/11/30 22:15:08 INFO mapred.JobClient: Launched reduce tasks=1
14/11/30 22:15:08 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10625
14/11/30 22:15:08 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
14/11/30 22:15:08 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=13662
14/11/30 22:15:08 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
14/11/30 22:15:08 INFO mapred.JobClient: Data-local map tasks=1
14/11/30 22:15:08 INFO mapred.JobClient: File Output Format Counters
14/11/30 22:15:08 INFO mapred.JobClient: Bytes Written=885
Begin to generate centers
center 0 (52.568627450980394,18.647058823529413)
center 1 (28.333333333333332,30.07843137254902)
center 2 (6.549019607843137,15.0)
center 3 (9.176470588235293,38.294117647058826)
s2 = 9856.603998462127
Generating centers for MRJob 7 successfully
s = 0.0
可以看出经过7次循环之后,准则函数的值不再发生明显变化,将聚类结果写入本地文本文件kmeans_clusters.txt,文件部分片段如下:
0 163
0 198
1 22
1 17
2 82
2 67
3 142
3 113
其中第一个字段为该点所属的类的编号,第二个字段是该点的id。
利用Python将数据可视化,代码如下:
import numpy as np
import matplotlib.pyplot as plt

# Load (clusterId, pointId) pairs written by the Hadoop job.
cluster = []
fileIn = open('/home/liujun/kmeans_clusters.txt')
for line in fileIn.readlines():
    # BUG FIX: fields are tab-separated; the original split on the letter 't'.
    lineArr = line.strip().split('\t')
    cluster.append([int(lineArr[0]), int(lineArr[1])])
fileIn.close()

# Load the original sample points: (id, x, y) per line.
dataSet = []
fileIn = open('/home/liujun/kmeans.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split('\t')
    dataSet.append([int(lineArr[0]), int(lineArr[1]), int(lineArr[2])])
fileIn.close()

dataArray = np.array(dataSet)
clusterArray = np.array(cluster)

# Generalized from the hard-coded range(200) to work for any sample count.
for i in range(len(dataArray)):
    index = dataArray[i, 0]
    x = dataArray[i, 1]
    y = dataArray[i, 2]
    # BUG FIX: the original wrote `clusterArray[id == index]`, comparing the
    # BUILTIN function `id` to an int (always False). Select the rows whose
    # point-id column (column 1) equals this point's id instead.
    temp = clusterArray[clusterArray[:, 1] == index]
    sign = temp[0, 0]  # cluster number of this point
    if sign == 0:
        plt.plot(x, y, 'or')
    elif sign == 1:
        plt.plot(x, y, 'Db')
    elif sign == 2:
        plt.plot(x, y, '*y')
    else:
        plt.plot(x, y, '+k')

# BUG FIX: without show() no figure appears when run as a plain script.
plt.show()
效果见下图:
可见对于输入数据来说,聚类效果还是很好的。
总结:在工作之余构思并编写上述代码,前前后后花了将近一周时间,虽然K-Means聚类算法的思想很简单,但在Hadoop环境下将代码实现还是有些麻烦,调试代码时碰到了很多陷阱,最终还是克服了。虽然单纯聚类完全可以调用Mahout中相应的模块,但这次实践中积累的经验和教训对于今后利用MapReduce实现复杂的数据挖掘算法还是很有用的。