Kmeans

Pseudocode for implementing the K-Means clustering algorithm in a distributed Hadoop environment:

Input:  arg 0 -- inputfile: local text file holding the sample data;
        arg 1 -- inputPath: SequenceFile holding the sample data;
        arg 2 -- centerPath: SequenceFile holding the centroid data;
        arg 3 -- clusterPath: directory under which the clustering result (a SequenceFile) is stored;
        arg 4 -- k: the number of clusters;

Output: k clusters

Begin
        Read inputPath, take the first k points as the initial centroids, and write them to centerPath;
        While the termination condition is not met
                Mapper phase: read inputPath; for the point belonging to each key, iterate over all centroids,
                pick the nearest one, and emit that centroid's index as the key and the point's id as the value
                to the Reducer;
                Reducer phase: merge the values received from the Mappers by key and write the result to clusterPath;
                Read clusterPath, recompute the centroids, and write them to centerPath;
        EndWhile
End

A common measure of clustering quality is the value of the following criterion function:

E = \sum_{i=1}^{k} \sum_{p \in C_i} \lVert p - m_i \rVert^2

where C_i denotes the i-th cluster, m_i its centroid, and \lVert p - m_i \rVert the Euclidean distance from point p to m_i.

It is reasonable to expect that the smaller this value, the better the clustering. As the iterations proceed, the criterion value converges to a small number, so the loop can be terminated once the value no longer changes noticeably.
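Concretely, writing E^{(t)} for the criterion value obtained in iteration t, the stopping rule implemented in the driver below amounts to

\lvert E^{(t)} - E^{(t-1)} \rvert < \varepsilon

with the threshold \varepsilon set to 0.1 in the driver code.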

Below is a fragment (10 points) of the local file kmeans.txt that stores the sample data (200 points in total):

163    61    20
17    34    25
66    7    10
14    34    34
128    5    41
49    33    24
185    58    20
83    8    14
54    3    17
96    1    13

The fields are tab-separated: the first field is the point's id, the second is its x coordinate, and the third is its y coordinate.

A visualization of these points is shown in the figure below:

[Figure: scatter plot of the 200 sample points]

To make the IDs and coordinates of the points easy to access, the input sample data is stored in a SequenceFile. The key is the point's ID, of type Text; the point's coordinates are a double[] array, wrapped in the class DoubleArray, which must implement the Writable interface. The class is defined as follows (DoubleArray.java):

package kmeans;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class DoubleArray implements Writable {

    private double[] data;

    public DoubleArray() {
    }

    public DoubleArray(double[] data) {
        set(data);
    }

    public void set(double[] data) {
        this.data = data;
    }

    public double[] get() {
        return data;
    }

    public void write(DataOutput out) throws IOException {
        int length = 0;
        if (data != null) {
            length = data.length;
        }
        out.writeInt(length);
        for (int i = 0; i < length; i++) {
            out.writeDouble(data[i]);
        }
    }

    public void readFields(DataInput in) throws IOException {
        int length = in.readInt();
        data = new double[length];
        for (int i = 0; i < length; i++) {
            data[i] = in.readDouble();
        }
    }

    // Squared Euclidean distance to another point (no square root is taken).
    public double distanceTo(DoubleArray point) {
        double[] data1 = point.get();
        double distance = 0;
        for (int i = 0; i < data.length; i++) {
            distance = distance + Math.pow(data[i] - data1[i], 2);
        }
        return distance;
    }

    // Component-wise addition, used when accumulating the points of a cluster.
    public void plus(DoubleArray point) {
        double[] data1 = point.get();
        for (int i = 0; i < data.length; i++) {
            data[i] = data[i] + data1[i];
        }
    }

    // Divide every component by n, turning an accumulated sum into a mean.
    public void averageN(int n) {
        for (int i = 0; i < data.length; i++) {
            data[i] = data[i] / n;
        }
    }
}
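A minimal usage sketch, not part of the original listing, showing how these helpers combine; the class name DoubleArrayDemo and the coordinate values are made up for illustration:

package kmeans;

// Hypothetical illustration only; assumes the DoubleArray class above is on the classpath.
public class DoubleArrayDemo {
    public static void main(String[] args) {
        DoubleArray p = new DoubleArray(new double[]{61, 20});
        DoubleArray q = new DoubleArray(new double[]{34, 25});
        // distanceTo returns the squared Euclidean distance: (61-34)^2 + (20-25)^2 = 754.0
        System.out.println(p.distanceTo(q));
        // A centroid is recomputed by summing the member points and dividing by their count.
        DoubleArray mean = new DoubleArray(new double[]{0, 0});
        mean.plus(p);
        mean.plus(q);
        mean.averageN(2);   // mean now holds (47.5, 22.5)
        System.out.println(mean.get()[0] + "," + mean.get()[1]);
    }
}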

In the Mapper phase, to make it easy to compute the criterion function value, each point's id is passed to the Reducer together with the square of its distance to the centroid it is assigned to. These two items are wrapped in the class IdAndDistance, which must also implement the Writable interface (IdAndDistance.java):

package kmeans;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class IdAndDistance implements Writable {

    private String id;
    private double distance;

    public void set(String id, double distance) {
        this.id = id;
        this.distance = distance;
    }

    public IdAndDistance() {
    }

    public IdAndDistance(String id, double distance) {
        set(id, distance);
    }

    public String getId() {
        return id;
    }

    public double getDistance() {
        return distance;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(id);
        out.writeDouble(distance);
    }

    public void readFields(DataInput in) throws IOException {
        id = in.readUTF();
        distance = in.readDouble();
    }
}
Mapper-phase code: KMeansMapper.java

package kmeans;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.ReflectionUtils;

public class KMeansMapper extends Mapper<Text, DoubleArray, Text, IdAndDistance> {

    private DoubleArray[] centers = null;

    // Load the current centers from centerPath once per map task.
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        Configuration conf = context.getConfiguration();
        centers = new DoubleArray[conf.getInt("numberOfCenters", 4)];
        String centerPath = conf.get("centerPath");
        FileSystem fs = FileSystem.get(URI.create(centerPath), conf);
        Path path = new Path(centerPath);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        Text key = (Text) ReflectionUtils.newInstance(Text.class, conf);
        DoubleArray value = (DoubleArray) ReflectionUtils.newInstance(DoubleArray.class, conf);
        try {
            while (reader.next(key, value)) {
                int index = Integer.parseInt(key.toString());
                double[] coords = value.get();
                centers[index] = new DoubleArray(coords);
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }

    // For each point, find the nearest center and emit
    // (center index, (point id, squared distance to that center)).
    public void map(Text key, DoubleArray value, Context context)
            throws IOException, InterruptedException {
        double minDistance = Double.MAX_VALUE;
        int nearestCenter = 0;
        for (int i = 0; i < centers.length; i++) {
            if (value.distanceTo(centers[i]) < minDistance) {
                nearestCenter = i;
                minDistance = value.distanceTo(centers[i]);
            }
        }
        context.write(new Text(String.valueOf(nearestCenter)),
                new IdAndDistance(key.toString(), minDistance));
    }
}
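A side note, not part of the original code: the loop in map() evaluates distanceTo() twice for each center. A behaviour-preserving alternative, sketched here as a hypothetical standalone helper, computes each squared distance once:

package kmeans;

// Hypothetical helper, not in the original post: returns the index of the nearest
// center, evaluating each squared distance exactly once.
public class NearestCenter {
    public static int indexOf(DoubleArray point, DoubleArray[] centers) {
        double minDistance = Double.MAX_VALUE;
        int nearest = 0;
        for (int i = 0; i < centers.length; i++) {
            double d = point.distanceTo(centers[i]);  // squared Euclidean distance
            if (d < minDistance) {
                minDistance = d;
                nearest = i;
            }
        }
        return nearest;
    }
}

In map() the loop could then be replaced by a call such as NearestCenter.indexOf(value, centers), although the minimum distance itself would still need to be returned alongside the index for the IdAndDistance value.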
Reducer-phase code: KMeansReducer.java

package kmeans;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Reducer;

public class KMeansReducer extends Reducer<Text, IdAndDistance, Text, Text> {

    // For each center, concatenate the ids of the points assigned to it and append
    // the sum of their squared distances as the final comma-separated field.
    public void reduce(Text key, Iterable<IdAndDistance> values, Context context)
            throws IOException, InterruptedException {
        double sumOfDistance = 0;
        Iterator<IdAndDistance> ite = values.iterator();
        String cluster = "";
        while (ite.hasNext()) {
            IdAndDistance temp = WritableUtils.clone(ite.next(), context.getConfiguration());
            if (cluster.length() > 0) cluster = cluster + ",";
            cluster = cluster + temp.getId();
            sumOfDistance = sumOfDistance + temp.getDistance();
        }
        cluster = cluster + "," + String.valueOf(sumOfDistance);
        context.write(key, new Text(cluster));
    }
}
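To illustrate the record format (with made-up numbers): if points 17, 49, and 83 were assigned to center 2 and their squared distances to it summed to 1234.5, the reducer would emit the key "2" with the value "17,49,83,1234.5". The last comma-separated field is always the distance sum; everything before it is the list of point ids. Both newCenters() and writeClustersToLocalFile() in the driver below rely on this layout.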
Driver code: KMeansDriver.java

package kmeans;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class KMeansDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.setInt("numberOfCenters", Integer.parseInt(args[4]));
        String inputfile = args[0];   // full path of the text file on the local Linux file system
        String inputPath = args[1];   // full path of the sample SequenceFile in HDFS
        String centerPath = args[2];  // full path of the centroid SequenceFile in HDFS
        String clusterPath = args[3]; // HDFS directory for the clustering result, no file name
        conf.set("centerPath", centerPath);
        double s = 0;
        double s1 = Double.MAX_VALUE;
        double shold = 0.1;
        int times = 0;
        writeToSeq(conf, inputPath, inputfile); // convert the sample data into a SequenceFile
        System.out.println("Begin to generate centers");
        int dimention = centersInitial(conf, inputPath, centerPath); // pick initial centers, return the point dimension
        System.out.println("Generating centers for MRJob " + times + " successfully");
        conf.setInt("dimention", dimention);
        FileSystem fs = FileSystem.get(conf);
        Job job = null;
        do {
            System.out.println("MRJob-----------------------------" + times);
            fs.delete(new Path(clusterPath), true); // remove the clustering result of the previous iteration
            job = new Job(conf);
            job.setJarByClass(KMeansDriver.class);
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setMapperClass(KMeansMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IdAndDistance.class);
            job.setReducerClass(KMeansReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
            SequenceFileOutputFormat.setOutputPath(job, new Path(clusterPath));
            if (job.waitForCompletion(true)) {
                fs.delete(new Path(centerPath), true);
                System.out.println("Begin to generate centers");
                // compute new centers from the clustering result and return the criterion function value
                double s2 = newCenters(conf, inputPath, centerPath, clusterPath);
                System.out.println("s2 = " + s2);
                times++;
                s = Math.abs(s1 - s2);
                System.out.println("Generating centers for MRJob " + times + " successfully");
                System.out.println("s = " + s);
                s1 = s2;
            }
        } while (s > shold); // stop once the criterion value no longer changes noticeably
        writeClustersToLocalFile(conf, clusterPath); // dump the clustering result to a local text file
    }

    // Convert the local text file (id \t x \t y per line) into a SequenceFile
    // keyed by point id, with a DoubleArray of coordinates as the value.
    public static void writeToSeq(Configuration conf, String inputPath, String inputfile) throws IOException {
        String uri = inputPath;
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class, DoubleArray.class);
        File file = new File(inputfile);
        Scanner input = new Scanner(file);
        while (input.hasNext()) {
            String[] line = input.nextLine().split("\t");
            Text key = new Text(line[0]);
            double[] data = new double[2];
            data[0] = Double.parseDouble(line[1]);
            data[1] = Double.parseDouble(line[2]);
            DoubleArray value = new DoubleArray(data);
            writer.append(key, value);
        }
        input.close();
        writer.close();
    }

    // Take the first k points of the input SequenceFile as the initial centers,
    // write them to centerPath, and return the dimension of the points.
    public static int centersInitial(Configuration conf, String inputPath, String centerPath)
            throws IOException {
        int dimention = 0;
        FileSystem inputPathFs = FileSystem.get(URI.create(inputPath), conf);
        Path path1 = new Path(inputPath);
        SequenceFile.Reader inputPathReader = new SequenceFile.Reader(inputPathFs, path1, conf);

        FileSystem centerPathFs = FileSystem.get(URI.create(centerPath), conf);
        Path path2 = new Path(centerPath);
        SequenceFile.Writer centerPathWriter =
                new SequenceFile.Writer(centerPathFs, conf, path2, Text.class, DoubleArray.class);
        Text key = (Text) ReflectionUtils.newInstance(Text.class, conf);
        DoubleArray value = (DoubleArray) ReflectionUtils.newInstance(DoubleArray.class, conf);
        try {
            int k = 0;
            while (inputPathReader.next(key, value)) {
                // possible improvement: choose the initial centers at random
                if (k < conf.getInt("numberOfCenters", 5)) {
                    centerPathWriter.append(new Text(String.valueOf(k)), value);
                    dimention = value.get().length;
                    System.out.println("center\t" + String.valueOf(k) + "\t"
                            + "(" + (value.get())[0] + "," + (value.get())[1] + ")");
                } else {
                    break;
                }
                k = k + 1;
            }
        } finally {
            IOUtils.closeStream(inputPathReader);
        }
        centerPathWriter.close();
        return dimention;
    }

    // Read the clustering result, recompute each center as the mean of its points,
    // write the new centers to centerPath, and return the criterion function value.
    public static double newCenters(Configuration conf, String inputPath, String centerPath, String clusterPath)
            throws IOException {
        double s = 0;
        String[] clusters = new String[conf.getInt("numberOfCenters", 4)];
        DoubleArray[] centers = new DoubleArray[conf.getInt("numberOfCenters", 4)];
        for (int i = 0; i < centers.length; i++) {
            double[] temp = new double[conf.getInt("dimention", 1)];
            for (int k = 0; k < temp.length; k++) temp[k] = 0;
            centers[i] = new DoubleArray(temp);
        }

        FileSystem clusterPathFs = FileSystem.get(URI.create(clusterPath + "/part-r-00000"), conf);
        Path path = new Path(clusterPath + "/part-r-00000");
        SequenceFile.Reader clusterReader = new SequenceFile.Reader(clusterPathFs, path, conf);
        Text clusterKey = (Text) ReflectionUtils.newInstance(Text.class, conf);
        Text clusterValue = (Text) ReflectionUtils.newInstance(Text.class, conf);
        int k = 0;
        try {
            while (clusterReader.next(clusterKey, clusterValue)) {
                clusters[Integer.parseInt(clusterKey.toString())] = clusterValue.toString();
                int indexOfDistance = clusterValue.toString().lastIndexOf(",") + 1;
                double sumOfDistance = Double.parseDouble(clusterValue.toString().substring(indexOfDistance));
                s = s + sumOfDistance;
                k = k + 1;
            }
        } finally {
            IOUtils.closeStream(clusterReader);
        }

        FileSystem inputPathFs = FileSystem.get(URI.create(inputPath), conf);
        Path path1 = new Path(inputPath);
        SequenceFile.Reader inputPathReader = new SequenceFile.Reader(inputPathFs, path1, conf);
        Text inputKey = (Text) ReflectionUtils.newInstance(Text.class, conf);
        DoubleArray inputValue = (DoubleArray) ReflectionUtils.newInstance(DoubleArray.class, conf);
        try {
            while (inputPathReader.next(inputKey, inputValue)) {
                for (int i = 0; i < conf.getInt("numberOfCenters", 5); i++) {
                    // the point belongs to cluster i if its id appears in that cluster's id list
                    if (clusters[i].indexOf(inputKey.toString() + ",") == 0
                            || clusters[i].indexOf("," + inputKey.toString() + ",") > 0
                            || clusters[i].indexOf("," + inputKey.toString())
                               == clusters[i].length() - ("," + inputKey.toString()).length()) {
                        centers[i].plus(inputValue);
                    }
                }
            }
        } finally {
            IOUtils.closeStream(inputPathReader);
        }
        for (int i = 0; i < conf.getInt("numberOfCenters", 5); i++) {
            // minus 1: the last comma-separated field is the distance sum, not a point id
            centers[i].averageN(clusters[i].split(",").length - 1);
            System.out.println("center\t" + String.valueOf(i) + "\t"
                    + "(" + (centers[i].get())[0] + "," + (centers[i].get())[1] + ")");
        }

        FileSystem centerPathFs = FileSystem.get(URI.create(centerPath), conf);
        Path path2 = new Path(centerPath);
        SequenceFile.Writer centerPathWriter =
                new SequenceFile.Writer(centerPathFs, conf, path2, Text.class, DoubleArray.class);
        for (int i = 0; i < conf.getInt("numberOfCenters", 5); i++) {
            centerPathWriter.append(new Text(String.valueOf(i)), centers[i]);
        }
        centerPathWriter.close();

        return s;
    }

    // Dump the clustering result from HDFS to a local text file:
    // one "clusterId \t pointId" pair per line.
    public static void writeClustersToLocalFile(Configuration conf, String clusterPath) throws IOException {
        File file = new File("/home/liujun/kmeans_clusters.txt");
        PrintWriter output = new PrintWriter(file);

        FileSystem clusterPathFs = FileSystem.get(URI.create(clusterPath + "/part-r-00000"), conf);
        Path path = new Path(clusterPath + "/part-r-00000");
        SequenceFile.Reader clusterReader = new SequenceFile.Reader(clusterPathFs, path, conf);
        Text clusterKey = (Text) ReflectionUtils.newInstance(Text.class, conf);
        Text clusterValue = (Text) ReflectionUtils.newInstance(Text.class, conf);
        try {
            while (clusterReader.next(clusterKey, clusterValue)) {
                String[] line = clusterValue.toString().split(",");
                // skip the last field, which holds the distance sum rather than a point id
                for (int i = 0; i < line.length - 1; i++) {
                    output.print(clusterKey.toString() + "\t");
                    output.print(line[i] + "\n");
                }
            }
        } finally {
            IOUtils.closeStream(clusterReader);
        }
        output.close();
    }
}
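The comment inside centersInitial() flags random selection of the initial centers as a possible improvement. A minimal sketch of that idea using reservoir sampling over the input SequenceFile is given below; the class RandomCenters is hypothetical and not part of the original post, and it reuses the same (old-API) SequenceFile reader/writer as the listings above:

package kmeans;

import java.io.IOException;
import java.net.URI;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Hypothetical alternative to centersInitial(): pick k centers uniformly at random
// from the input SequenceFile with reservoir sampling and write them to centerPath.
public class RandomCenters {
    public static void choose(Configuration conf, String inputPath, String centerPath, int k)
            throws IOException {
        DoubleArray[] reservoir = new DoubleArray[k];
        Random rand = new Random();
        FileSystem fs = FileSystem.get(URI.create(inputPath), conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(inputPath), conf);
        Text key = new Text();
        DoubleArray value = new DoubleArray();
        int seen = 0;
        try {
            while (reader.next(key, value)) {
                // copy the coordinates so each reservoir slot owns its own array
                DoubleArray copy = new DoubleArray(value.get().clone());
                if (seen < k) {
                    reservoir[seen] = copy;
                } else {
                    int j = rand.nextInt(seen + 1);
                    if (j < k) reservoir[j] = copy;
                }
                seen++;
            }
        } finally {
            IOUtils.closeStream(reader);
        }
        FileSystem centerFs = FileSystem.get(URI.create(centerPath), conf);
        SequenceFile.Writer writer =
                new SequenceFile.Writer(centerFs, conf, new Path(centerPath), Text.class, DoubleArray.class);
        for (int i = 0; i < k; i++) {
            writer.append(new Text(String.valueOf(i)), reservoir[i]);
        }
        writer.close();
    }
}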

Steps to run the code:

Step 1: compile and package the code into kmeans.jar;

Step 2: run the following command in a single-node pseudo-distributed environment:

liujun@liujun-Rev-1-0:~$ hadoop jar ~/kmeans.jar kmeans.KMeansDriver ~/kmeans.txt /in/kmeans.seq /cen/centers.seq /out 4
After a roughly three-minute wait the run completed. Below is an excerpt from the last iteration:

MRJob-----------------------------6
14/11/30 22:14:36 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
14/11/30 22:14:36 INFO input.FileInputFormat: Total input paths to process : 1
14/11/30 22:14:36 INFO mapred.JobClient: Running job: job_201411301328_0050
14/11/30 22:14:37 INFO mapred.JobClient:  map 0% reduce 0%
14/11/30 22:14:51 INFO mapred.JobClient:  map 100% reduce 0%
14/11/30 22:15:03 INFO mapred.JobClient:  map 100% reduce 100%
14/11/30 22:15:08 INFO mapred.JobClient: Job complete: job_201411301328_0050
14/11/30 22:15:08 INFO mapred.JobClient: Counters: 29
14/11/30 22:15:08 INFO mapred.JobClient:   Map-Reduce Framework
14/11/30 22:15:08 INFO mapred.JobClient:     Spilled Records=400
14/11/30 22:15:08 INFO mapred.JobClient:     Map output materialized bytes=3296
14/11/30 22:15:08 INFO mapred.JobClient:     Reduce input records=200
14/11/30 22:15:08 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=1066209280
14/11/30 22:15:08 INFO mapred.JobClient:     Map input records=200
14/11/30 22:15:08 INFO mapred.JobClient:     SPLIT_RAW_BYTES=100
14/11/30 22:15:08 INFO mapred.JobClient:     Map output bytes=2890
14/11/30 22:15:08 INFO mapred.JobClient:     Reduce shuffle bytes=3296
14/11/30 22:15:08 INFO mapred.JobClient:     Physical memory (bytes) snapshot=239747072
14/11/30 22:15:08 INFO mapred.JobClient:     Reduce input groups=4
14/11/30 22:15:08 INFO mapred.JobClient:     Combine output records=0
14/11/30 22:15:08 INFO mapred.JobClient:     Reduce output records=4
14/11/30 22:15:08 INFO mapred.JobClient:     Map output records=200
14/11/30 22:15:08 INFO mapred.JobClient:     Combine input records=0
14/11/30 22:15:08 INFO mapred.JobClient:     CPU time spent (ms)=2500
14/11/30 22:15:08 INFO mapred.JobClient:     Total committed heap usage (bytes)=197918720
14/11/30 22:15:08 INFO mapred.JobClient:   File Input Format Counters 
14/11/30 22:15:08 INFO mapred.JobClient:     Bytes Read=6421
14/11/30 22:15:08 INFO mapred.JobClient:   FileSystemCounters
14/11/30 22:15:08 INFO mapred.JobClient:     HDFS_BYTES_READ=6712
14/11/30 22:15:08 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=50553
14/11/30 22:15:08 INFO mapred.JobClient:     FILE_BYTES_READ=3296
14/11/30 22:15:08 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=885
14/11/30 22:15:08 INFO mapred.JobClient:   Job Counters 
14/11/30 22:15:08 INFO mapred.JobClient:     Launched map tasks=1
14/11/30 22:15:08 INFO mapred.JobClient:     Launched reduce tasks=1
14/11/30 22:15:08 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=10625
14/11/30 22:15:08 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
14/11/30 22:15:08 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=13662
14/11/30 22:15:08 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
14/11/30 22:15:08 INFO mapred.JobClient:     Data-local map tasks=1
14/11/30 22:15:08 INFO mapred.JobClient:   File Output Format Counters 
14/11/30 22:15:08 INFO mapred.JobClient:     Bytes Written=885
Begin to generate centers
center    0    (52.568627450980394,18.647058823529413)
center    1    (28.333333333333332,30.07843137254902)
center    2    (6.549019607843137,15.0)
center    3    (9.176470588235293,38.294117647058826)
s2 = 9856.603998462127
Generating centers for MRJob 7 successfully
s = 0.0
After 7 iterations the criterion function value no longer changes noticeably. The clustering result is written to the local text file kmeans_clusters.txt; a fragment of the file is shown below:

0    163
0    198
1    22
1    17

2    82
2    67
3    142
3    113

The first field is the index of the cluster the point belongs to, and the second field is the point's id.

The result is visualized with Python, using the following code:

import numpy as np
import matplotlib.pyplot as plt

# read the clustering result: (cluster id, point id) per line
cluster = []
fileIn = open('/home/liujun/kmeans_clusters.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split('\t')
    cluster.append([int(lineArr[0]), int(lineArr[1])])
fileIn.close()

# read the original points: (point id, x, y) per line
dataSet = []
fileIn = open('/home/liujun/kmeans.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split('\t')
    dataSet.append([int(lineArr[0]), int(lineArr[1]), int(lineArr[2])])
fileIn.close()

dataArray = np.array(dataSet)
clusterArray = np.array(cluster)
for i in range(200):
    index = dataArray[i, 0]
    x = dataArray[i, 1]
    y = dataArray[i, 2]
    # find the cluster record whose point id equals index
    temp = clusterArray[clusterArray[:, 1] == index]
    sign = temp[0, 0]
    if sign == 0:
        plt.plot(x, y, 'or')
    elif sign == 1:
        plt.plot(x, y, 'Db')
    elif sign == 2:
        plt.plot(x, y, '*y')
    else:
        plt.plot(x, y, '+k')
plt.show()   # display the scatter plot
The resulting plot is shown below:

It can be seen that, for this input data, the clustering result is quite good.

Summary: I designed and wrote the code above in my spare time over the course of roughly a week. Although the idea behind K-Means clustering is simple, implementing it on Hadoop was still somewhat troublesome, and debugging ran into quite a few pitfalls, which were eventually overcome. Plain clustering could of course be done by calling the corresponding module in Mahout, but the experience and lessons gained from this exercise will be useful for implementing more complex data mining algorithms with MapReduce in the future.
