MapReduce is the core of the Hadoop processing platform: it lets you process massive data sets in parallel. This post walks through a k-means implementation built on the MapReduce framework (based on http://blog.csdn.net/qll125596718/article/details/8243404). Each iteration runs as one MapReduce job: the mappers assign every sample to its nearest centroid, the reducer averages the samples of each cluster to produce new centroids, and the driver repeats until the centroids move less than a threshold or a maximum iteration count is reached.
First, Sample.java, which wraps one 60-dimensional data point as a Hadoop Writable so it can travel between mappers and reducers:
package kmeans;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Writable;
public class Sample implements Writable{
    private static final Log log=LogFactory.getLog(Sample.class);
    public static final int DIMENTION=60;
    public double arr[];
    public Sample(){
        arr=new double[DIMENTION];
    }
    //Euclidean distance between two samples
    public static double getEulerDist(Sample vec1,Sample vec2){
        if(!(vec1.arr.length==DIMENTION && vec2.arr.length==DIMENTION)){
            log.error("vector's dimension is not "+DIMENTION);
            System.exit(1);
        }
        double dist=0.0;
        for(int i=0;i<DIMENTION;++i)
            dist+=(vec1.arr[i]-vec2.arr[i])*(vec1.arr[i]-vec2.arr[i]);
        return Math.sqrt(dist);
    }
    //reset all coordinates to zero (the reducer reuses one Sample as an accumulator)
    public void clear(){
        for(int i=0;i<arr.length;i++)
            arr[i]=0.0;
    }
    //space-separated coordinates, which is the on-disk format the
    //convergence check in KMeans.java parses back in
    @Override
    public String toString(){
        StringBuilder sb=new StringBuilder();
        for(int i=0;i<arr.length;i++)
            sb.append(arr[i]).append(" ");
        return sb.toString().trim();
    }
    //Writable contract: (de)serialize the coordinate array
    @Override
    public void readFields(DataInput in) throws IOException{
        for(int i=0;i<arr.length;i++)
            arr[i]=in.readDouble();
    }
    @Override
    public void write(DataOutput out) throws IOException{
        for(int i=0;i<arr.length;i++)
            out.writeDouble(arr[i]);
    }
}
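Before moving on, a quick way to convince yourself the distance function behaves: a throwaway test class (my addition, not part of the original post) that compares two samples differing in only two dimensions:

package kmeans;
//Local sanity check for Sample.getEulerDist; the 3.0/4.0 values are made up.
public class SampleTest {
    public static void main(String[] args) {
        Sample a = new Sample();      //all 60 coordinates start at 0.0
        Sample b = new Sample();
        b.arr[0] = 3.0;               //differ in the first two dimensions only
        b.arr[1] = 4.0;
        //prints 5.0 = sqrt(3^2 + 4^2); the remaining 58 dimensions contribute 0
        System.out.println(Sample.getEulerDist(a, b));
    }
}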
Next comes the clustering job itself, KMeans.java:
package kmeans;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class KMeans extends Configured implements Tool{
    private static final Log log = LogFactory.getLog(KMeans.class);
    private static final int K = 2;
    private static final int MAXITERATIONS = 300;
    private static final double THRESHOLD = 0.01;
    //compare the previous and the current centroids; stop iterating once every
    //centroid has moved less than THRESHOLD
    public static boolean stopIteration(Configuration conf) throws IOException{
        FileSystem fs=FileSystem.get(conf);
        Path prevCenterFile=new Path("hdfs://192.168.7.119:9010/user/sn/Input/centers");
        Path currentCenterFile=new Path("hdfs://192.168.7.119:9010/user/sn/Output/part-r-00000");
        if(!(fs.exists(prevCenterFile) && fs.exists(currentCenterFile))){
            log.error("both centroid files must exist");
            System.exit(1);
        }
        boolean stop=true;
        String line1,line2;
        BufferedReader br1=new BufferedReader(new InputStreamReader(fs.open(prevCenterFile)));
        BufferedReader br2=new BufferedReader(new InputStreamReader(fs.open(currentCenterFile)));
        Sample prevCenter,currCenter;
        while((line1=br1.readLine())!=null && (line2=br2.readLine())!=null){
            prevCenter=new Sample();
            currCenter=new Sample();
            String []str1=line1.split("\\s+");
            String []str2=line2.split("\\s+");
            assert(str1[0].equals(str2[0]));
            for(int i=1;i<=Sample.DIMENTION;i++){
                prevCenter.arr[i-1]=Double.parseDouble(str1[i]);
                currCenter.arr[i-1]=Double.parseDouble(str2[i]);
            }
            if(Sample.getEulerDist(prevCenter, currCenter)>THRESHOLD){
                stop=false;
                break;
            }
        }
        br1.close();
        br2.close();
        //if another iteration is needed, the current centroids replace the previous ones
        if(stop==false){
            fs.delete(prevCenterFile,true);
            if(fs.rename(currentCenterFile, prevCenterFile)==false){
                log.error("failed to replace the centroid file");
                System.exit(1);
            }
        }
        return stop;
    }
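    //For reference, the layout both centroid files share: one centroid per line,
    //the cluster id first, then DIMENTION coordinates (the values below are
    //made-up examples, not from the original post):
    //  0  0.25 1.73 0.08 ... 0.91
    //  1  4.10 2.02 3.33 ... 3.57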
    public static class ClusterMapper extends Mapper<LongWritable, Text, IntWritable, Sample> {
        Vector<Sample> centers = new Vector<Sample>();
        @Override
        //start from K empty centers
        public void setup(Context context){
            for (int i = 0; i < K; i++) {
                centers.add(new Sample());
            }
        }
        @Override
        //the map input is the centers file, so map() only loads the centroids;
        //the actual clustering happens in cleanup() below
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String []str=value.toString().split("\\s+");
            if(str.length!=Sample.DIMENTION+1){
                log.error("wrong dimension while reading centers");
                System.exit(1);
            }
            int index=Integer.parseInt(str[0]);
            for(int i=1;i<str.length;i++)
                centers.get(index).arr[i-1]=Double.parseDouble(str[i]);
        }
        @Override
        //assign every data point to its nearest centroid and emit the pair;
        //the data file reaches each mapper through the DistributedCache
        public void cleanup(Context context) throws IOException,InterruptedException{
            Path []caches=DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if(caches==null || caches.length<=0){
                log.error("data file does not exist");
                System.exit(1);
            }
            BufferedReader br=new BufferedReader(new FileReader(caches[0].toString()));
            String line;
            while((line=br.readLine())!=null){
                Sample sample=new Sample();
                String []str=line.split("\\s+");
                for(int i=0;i<Sample.DIMENTION;i++)
                    sample.arr[i]=Double.parseDouble(str[i]);
                int index=-1;
                double minDist=Double.MAX_VALUE;
                for(int i=0;i<K;i++){
                    double dist=Sample.getEulerDist(sample, centers.get(i));
                    if(dist<minDist){
                        minDist=dist;
                        index=i;
                    }
                }
                context.write(new IntWritable(index), sample);
            }
            br.close();
        }
    }
    public static class UpdateCenterReducer extends Reducer<IntWritable, Sample, IntWritable, Sample> {
        int prev=-1;
        Sample center=new Sample();
        int count=0;
        @Override
        //average the samples of each cluster to get the new centroid; with a
        //single reduce task the keys arrive sorted, so a key change means the
        //previous cluster (except the last one) is complete and can be emitted
        public void reduce(IntWritable key,Iterable<Sample> values,Context context)
                throws IOException,InterruptedException{
            while(values.iterator().hasNext()){
                Sample value=values.iterator().next();
                if(key.get()!=prev){
                    if(prev!=-1){
                        for(int i=0;i<center.arr.length;i++)
                            center.arr[i]/=count;
                        context.write(new IntWritable(prev), center);
                    }
                    center.clear();
                    prev=key.get();
                    count=0;
                }
                for(int i=0;i<Sample.DIMENTION;i++)
                    center.arr[i]+=value.arr[i];
                count++;
            }
        }
        @Override
        //the last cluster is still buffered when reduce() ends; emit it here
        public void cleanup(Context context) throws IOException,InterruptedException{
            for(int i=0;i<center.arr.length;i++)
                center.arr[i]/=count;
            context.write(new IntWritable(prev), center);
        }
    }
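    //--------------------------------------------------------------------------
    //The original post breaks off here, so the driver below is a reconstructed
    //sketch rather than the author's code. It assumes the centers file is the
    //job input, the data file is shipped to the mappers via the DistributedCache
    //(the .../Input/data path is a guess), and a single reduce task is used so
    //that keys arrive sorted, as UpdateCenterReducer requires.
    //--------------------------------------------------------------------------
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf=getConf();
        FileSystem fs=FileSystem.get(conf);
        Job job=new Job(conf);
        job.setJarByClass(KMeans.class);
        //the centroid file is the map input; clear the previous iteration's output
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.7.119:9010/user/sn/Input/centers"));
        Path outDir=new Path("hdfs://192.168.7.119:9010/user/sn/Output");
        fs.delete(outDir, true);
        FileOutputFormat.setOutputPath(job, outDir);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(ClusterMapper.class);
        job.setReducerClass(UpdateCenterReducer.class);
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Sample.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Sample.class);
        return job.waitForCompletion(true)?0:1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf=new Configuration();
        //ship the raw data file to every mapper (path assumed, not in the post)
        DistributedCache.addCacheFile(new Path("hdfs://192.168.7.119:9010/user/sn/Input/data").toUri(), conf);
        int iteration=0;
        while(ToolRunner.run(conf, new KMeans(), args)==0
                && ++iteration<MAXITERATIONS
                && !stopIteration(conf)){
            log.info("iteration "+iteration+" finished, centers still moving");
        }
        log.info("k-means finished after "+iteration+" iteration(s)");
    }
}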
This was my first attempt, as a beginner, at implementing k-means on the MapReduce framework; my understanding of both the algorithm and MapReduce programming is still shallow and needs to deepen. Going forward, I hope to use Hadoop for data mining over truly massive data sets.