I am new to Hadoop and have already tested the hello-world-level WordCount program. Since I plan to use Hadoop for cluster analysis, this time I tested KMeans on two-dimensional data. The code comes from http://download.csdn.net/detail/tinycui/4384750#comment. Because commenters there noted the documentation was thin, I describe my test procedure (pseudo-distributed mode) in detail here for reference, with thanks to tinycui for providing the source download.
Step1: Set up Eclipse and Hadoop; there are plenty of guides for this online.
Step2: Create a new Project --> Map/Reduce Project named KMeans. Remember to choose Map/Reduce Project here, otherwise adding the code will produce import errors.
Step3: Download the KMeans source from the link above, copy its src and bin folders over those of your new project, and refresh the KMeans project in Eclipse.
Step4: In DFS create two folders, center and cluster. Into the center folder upload an empty file named center, which will hold the center values of each iteration; into the cluster folder upload a file named cluster containing the input data, formatted as:
(20,30) (50,61) (20,32) (50,64) (59,67) (24,34) (19,39) (20,32) (50,65) (50,77) (20,30) (20,31) (20,32) (50,64) (50,67)
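If you would rather script this step than click through the Eclipse DFS view, the same folders and files can be created with the HDFS Java API. This is only a minimal sketch, not part of the downloaded project; the NameNode address is the one used throughout this post, and the class name PrepareInput is made up:

import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrepareInput {   // hypothetical helper, not in the original project
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.56.171:9000"), conf);
        fs.create(new Path("/center/center")).close();   // empty file to hold the centers
        OutputStream out = fs.create(new Path("/cluster/cluster"));
        out.write("(20,30) (50,61) (20,32) (50,64) (59,67) (24,34) (19,39) (20,32) (50,65) (50,77) (20,30) (20,31) (20,32) (50,64) (50,67)".getBytes());
        out.close();
    }
}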
Step5: Configure the arguments to main. Under Run --> Run Configurations, in Arguments, supply the three parameters of main, separated by spaces: the input path, the path that stores the KMeans centers, and the output path.
They are, respectively:
hdfs://192.168.56.171:9000/cluster
hdfs://192.168.56.171:9000/center
hdfs://192.168.56.171:9000/ouput
The IP here can be your own machine's address or localhost.
Step6: Adjust parts of the configuration in the code, as follows:
Main program KMeans.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeans {
    public static void main(String[] args) throws Exception {
        CenterInitial centerInitial = new CenterInitial();
        centerInitial.run(args);                   // pick the initial centers
        int times = 0;
        double s = 0, shold = 0.0001;
        do {
            Configuration conf = new Configuration();
            conf.set("fs.default.name", "hdfs://192.168.56.171:9000");
            Job job = new Job(conf, "KMeans");
            job.setJarByClass(KMeans.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(KMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(KReducer.class);
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(args[2]), true);    // clear the output directory before each pass
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));
            if (job.waitForCompletion(true))       // one assignment + averaging pass
            {
                NewCenter newCenter = new NewCenter();
                s = newCenter.run(args);           // largest squared shift of any center
                times++;
            }
        } while (s > shold);                       // stop once the centers barely move
        System.out.println("Iterator: " + times);
    }
}
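One point worth spelling out: NewCenter.run (listed further below) returns the largest squared distance that any center moved between two iterations, and the loop exits once that value is no greater than shold = 0.0001. Stripped of the string parsing, the test amounts to the following sketch; the class name and the sample arrays are made up for illustration:

public class ShiftCheck {   // hypothetical illustration, not part of the original sources
    static float maxSquaredShift(float[][] oldC, float[][] newC) {
        float max = 0;
        for (int i = 0; i < oldC.length; i++) {
            float d = 0;
            for (int j = 0; j < oldC[i].length; j++) {
                float diff = oldC[i][j] - newC[i][j];
                d += diff * diff;   // squared Euclidean shift of center i
            }
            if (d > max)
                max = d;            // keep the largest shift, as NewCenter does via shold
        }
        return max;
    }
    public static void main(String[] args) {
        float[][] oldC = { {20.0f, 30.0f}, {50.0f, 61.0f} };
        float[][] newC = { {20.5f, 31.5f}, {51.2f, 66.4f} };
        System.out.println(maxSquaredShift(oldC, newC));   // ≈ 30.6, far above 0.0001, so iterate again
    }
}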
Center initialization CenterInitial.java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CenterInitial {
    public void run(String[] args) throws IOException {
        String[] clist;
        int k = 5;
        String string = "";
        String inpath = args[0] + "/cluster";    // input data file
        String outpath = args[1] + "/center";    // center file
        Configuration conf1 = new Configuration();   // load the Hadoop file-system configuration
        conf1.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(inpath), conf1);   // FileSystem is the core class for working with HDFS; this obtains the file system behind the URI
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(inpath));
            IOUtils.copyBytes(in, out, 50, false);   // copy the file's bytes into the buffer with Hadoop's IOUtils helper
            clist = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
        FileSystem filesystem = FileSystem.get(URI.create(outpath), conf1);
        for (int i = 0; i < k; i++) {
            int j = (int) (Math.random() * 100) % clist.length;
            if (string.contains(clist[j]))   // chose the same point again, retry
            {
                k++;
                continue;
            }
            string = string + clist[j].replace(" ", "") + " ";
        }
        OutputStream out2 = filesystem.create(new Path(outpath));
        IOUtils.copyBytes(new ByteArrayInputStream(string.getBytes()), out2, 4096, true);   // write the chosen centers
        System.out.println(string);
    }
}
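Note that the sampling loop above retries (via k++ and continue) whenever it draws a point it has already chosen, which can spin for a long time on small or duplicate-heavy data. A more robust alternative, shown here only as a sketch and not as a change to the original code, is to de-duplicate, shuffle, and take the first k points:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;

public class InitAlternative {   // hypothetical, not part of the original project
    public static void main(String[] args) {
        String data = "(20,30) (50,61) (20,32) (50,64) (59,67)";
        // de-duplicate first, then shuffle and take the first k distinct points
        List<String> points = new ArrayList<>(new LinkedHashSet<>(Arrays.asList(data.split(" "))));
        Collections.shuffle(points);
        int k = Math.min(3, points.size());
        System.out.println(String.join(" ", points.subList(0, k)));
    }
}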
KMapper.java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String[] center;

    protected void setup(Context context) throws IOException, InterruptedException   // read the center list and save it to center[]
    {
        String centerlist = "hdfs://192.168.56.171:9000/center/center";   // the center file
        Configuration conf1 = new Configuration();
        conf1.set("hadoop.job.ugi", "hadoop-user,hadoop-user");
        FileSystem fs = FileSystem.get(URI.create(centerlist), conf1);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(centerlist));
            IOUtils.copyBytes(in, out, 100, false);
            center = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
    }

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String outValue = new String(itr.nextToken());
            String[] list = outValue.replace("(", "").replace(")", "").split(",");
            String[] c = center[0].replace("(", "").replace(")", "").split(",");
            float min = 0;
            int pos = 0;
            for (int i = 0; i < list.length; i++) {
                min += (float) Math.pow((Float.parseFloat(list[i]) - Float.parseFloat(c[i])), 2);   // distance to the first center as the initial minimum
            }
            for (int i = 0; i < center.length; i++) {
                String[] centerStrings = center[i].replace("(", "").replace(")", "").split(",");
                float distance = 0;
                for (int j = 0; j < list.length; j++)
                    distance += (float) Math.pow((Float.parseFloat(list[j]) - Float.parseFloat(centerStrings[j])), 2);
                if (min > distance)   // found a closer center
                {
                    min = distance;
                    pos = i;
                }
            }
            context.write(new Text(center[pos]), new Text(outValue));   // emit (nearest center, point)
        }
    }
}
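To make the mapper's logic concrete: for each (x,y) token it computes the squared Euclidean distance to every current center and emits the point keyed by the nearest one. The same computation in isolation, with made-up centers and a made-up class name, looks like this:

public class NearestCenterDemo {   // hypothetical illustration, not part of the original project
    public static void main(String[] args) {
        String[] center = { "(20,31)", "(50,65)" };   // made-up current centers
        String point = "(24,34)";
        String[] p = point.replace("(", "").replace(")", "").split(",");
        int pos = 0;
        float min = Float.MAX_VALUE;
        for (int i = 0; i < center.length; i++) {
            String[] c = center[i].replace("(", "").replace(")", "").split(",");
            float d = 0;
            for (int j = 0; j < p.length; j++) {
                float diff = Float.parseFloat(p[j]) - Float.parseFloat(c[j]);
                d += diff * diff;   // squared Euclidean distance, same as the mapper
            }
            if (d < min) { min = d; pos = i; }
        }
        System.out.println(point + " -> " + center[pos]);   // (24,34) -> (20,31)
    }
}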
KReducer.java
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class KReducer extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
        String outVal = "";
        int count = 0;
        String center = "";
        int length = key.toString().replace("(", "").replace(")", "").replace(":", "").split(",").length;
        float[] ave = new float[length];
        for (int i = 0; i < length; i++)
            ave[i] = 0;
        for (Text val : value) {
            outVal += val.toString() + " ";
            String[] tmp = val.toString().replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < tmp.length; i++)
                ave[i] += Float.parseFloat(tmp[i]);   // coordinate-wise sum over the cluster
            count++;
        }
        for (int i = 0; i < length; i++) {
            ave[i] = ave[i] / count;                  // component-wise mean
            if (i == 0)
                center += "(" + ave[i] + ",";
            else if (i == length - 1)
                center += ave[i] + ")";
            else
                center += ave[i] + ",";
        }
        System.out.println(center);
        context.write(key, new Text(outVal + center));   // old center \t points ... new center
    }
}
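The reducer itself is plain arithmetic: sum each coordinate over a cluster's points and divide by the point count. As a sanity check against the output at the end of this post, averaging the seven points of the second cluster reproduces its printed center; the class name here is made up:

public class MeanDemo {   // hypothetical illustration of the reducer's averaging
    public static void main(String[] args) {
        String[] cluster = { "(20,30)", "(20,32)", "(24,34)", "(20,32)", "(20,30)", "(20,31)", "(20,32)" };
        float[] ave = new float[2];
        for (String s : cluster) {
            String[] t = s.replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < t.length; i++)
                ave[i] += Float.parseFloat(t[i]);   // coordinate-wise sum
        }
        for (int i = 0; i < ave.length; i++)
            ave[i] /= cluster.length;               // divide by the cluster size
        System.out.println("(" + ave[0] + "," + ave[1] + ")");   // ≈ (20.571428,31.571428)
    }
}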
NewCenter.java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class NewCenter {
    int k = 3;
    float shold = Integer.MIN_VALUE;
    String[] line;
    String newcenter = new String("");

    public float run(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(args[2] + "/part-r-00000"), conf);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(args[2] + "/part-r-00000"));
            IOUtils.copyBytes(in, out, 50, false);
            line = out.toString().split("\n");
        } finally {
            IOUtils.closeStream(in);
        }
        System.out.println(out.toString());
        for (int i = 0; i < k; i++) {
            String[] l = line[i].replace("\t", " ").split(" ");
            String[] startCenter = l[0].replace("(", "").replace(")", "").split(",");
            String[] finalCenter = l[l.length - 1].replace("(", "").replace(")", "").split(",");
            float tmp = 0;
            for (int j = 0; j < startCenter.length; j++)
                tmp += Math.pow(Float.parseFloat(startCenter[j]) - Float.parseFloat(finalCenter[j]), 2);   // squared shift of center i
            newcenter = newcenter + l[l.length - 1] + " ";
            if (shold <= tmp)
                shold = tmp;   // keep the largest shift over all k centers
        }
        OutputStream out2 = fs.create(new Path(args[1] + "/center"));   // overwrite the center file for the next iteration
        IOUtils.copyBytes(new ByteArrayInputStream(newcenter.getBytes()), out2, 4096, true);
        System.out.println(newcenter);
        return shold;
    }
}
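Since NewCenter re-parses the reduce output as text, it helps to see the line format it expects: the old center (the key), a tab, the cluster's points, and finally the new center that KReducer appended. A tiny made-up example of that parsing:

public class LineParseDemo {   // hypothetical: how one reduce output line is dissected
    public static void main(String[] args) {
        // key \t values + new center, as written by KReducer
        String line = "(20,31)\t(20,30) (20,32) (20.0,31.0)";
        String[] l = line.replace("\t", " ").split(" ");
        System.out.println("old center: " + l[0]);             // (20,31)
        System.out.println("new center: " + l[l.length - 1]);  // (20.0,31.0)
    }
}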
Output:
13/05/24 11:20:29 INFO mapred.Task: Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting
13/05/24 11:20:29 INFO mapred.LocalJobRunner:
13/05/24 11:20:29 INFO mapred.Task: Task attempt_local_0004_r_000000_0 is allowed to commit now
13/05/24 11:20:29 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0004_r_000000_0' to hdfs://192.168.56.171:9000/ouput
13/05/24 11:20:30 INFO mapred.JobClient:  map 100% reduce 0%
13/05/24 11:20:32 INFO mapred.LocalJobRunner: reduce > reduce
13/05/24 11:20:32 INFO mapred.Task: Task 'attempt_local_0004_r_000000_0' done.
13/05/24 11:20:33 INFO mapred.JobClient:  map 100% reduce 100%
13/05/24 11:20:33 INFO mapred.JobClient: Job complete: job_local_0004
13/05/24 11:20:33 INFO mapred.JobClient: Counters: 22
13/05/24 11:20:33 INFO mapred.JobClient:   File Output Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Written=230
13/05/24 11:20:33 INFO mapred.JobClient:   FileSystemCounters
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_READ=3843
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_READ=2896
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=326968
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=1916
13/05/24 11:20:33 INFO mapred.JobClient:   File Input Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Read=121
13/05/24 11:20:33 INFO mapred.JobClient:   Map-Reduce Framework
13/05/24 11:20:33 INFO mapred.JobClient:     Map output materialized bytes=469
13/05/24 11:20:33 INFO mapred.JobClient:     Map input records=1
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/05/24 11:20:33 INFO mapred.JobClient:     Spilled Records=30
13/05/24 11:20:33 INFO mapred.JobClient:     Map output bytes=433
13/05/24 11:20:33 INFO mapred.JobClient:     Total committed heap usage (bytes)=352845824
13/05/24 11:20:33 INFO mapred.JobClient:     CPU time spent (ms)=0
13/05/24 11:20:33 INFO mapred.JobClient:     SPLIT_RAW_BYTES=107
13/05/24 11:20:33 INFO mapred.JobClient:     Combine input records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input records=15
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input groups=3
13/05/24 11:20:33 INFO mapred.JobClient:     Combine output records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Physical memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce output records=3
13/05/24 11:20:33 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Map output records=15
(19.0,39.0)    (19,39) (19.0,39.0)
(20.571428,31.571428)    (20,30) (20,32) (24,34) (20,32) (20,30) (20,31) (20,32) (20.571428,31.571428)
(51.285713,66.42857)    (50,65) (50,77) (50,64) (59,67) (50,67) (50,61) (50,64) (51.285713,66.42857)
(19.0,39.0) (20.571428,31.571428) (51.285713,66.42857)
Iterator: 4