Testing KMeans on Hadoop (Part 1): Running the Source Code Example

I am new to Hadoop and have already tested the hello-world-level WordCount program. Since I plan to use Hadoop for clustering analysis, this time I tested KMeans on two-dimensional data. The code comes from http://download.csdn.net/detail/tinycui/4384750#comment. Commenters there noted that documentation is scarce, so here I describe my test procedure in detail (pseudo-distributed mode) for reference. Thanks to tinycui for providing the source code download.


Step 1: Set up Eclipse and Hadoop; for the details you can follow any of the guides available online.

Step 2: Create a new Project --> Map/Reduce Project and name it KMeans. Be sure to choose a Map/Reduce Project here, otherwise you will get import errors when you add the code.

Step 3: Download the KMeans source code from the URL provided by tinycui, copy the src and bin folders over the ones in your new project, and refresh the KMeans project in Eclipse.

Step 4: In DFS, create two folders, center and cluster. In the center folder upload an empty file named center, which will hold the center values of each iteration. In the cluster folder upload a file named cluster containing the input data, in this format: (20,30) (50,61) (20,32) (50,64) (59,67) (24,34) (19,39) (20,32) (50,65) (50,77) (20,30) (20,31) (20,32) (50,64) (50,67)
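If you prefer to prepare these HDFS files from Java rather than through the DFS view or the HDFS shell, a minimal sketch is shown below. PrepareInput is a hypothetical helper class, not part of tinycui's source; it assumes the same pseudo-distributed namenode address and file paths used in this walkthrough.

import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: creates the empty /center/center file and uploads
// the sample points to /cluster/cluster on the pseudo-distributed HDFS.
public class PrepareInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://192.168.56.171:9000");
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.56.171:9000"), conf);

        // Empty file that will hold the centers of each iteration
        fs.create(new Path("/center/center")).close();

        // Input points, one space-separated "(x,y)" pair per token
        String points = "(20,30) (50,61) (20,32) (50,64) (59,67) (24,34) (19,39) "
                + "(20,32) (50,65) (50,77) (20,30) (20,31) (20,32) (50,64) (50,67)";
        OutputStream out = fs.create(new Path("/cluster/cluster"));
        out.write(points.getBytes());
        out.close();
    }
}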

Step 5: Configure the arguments for main. Under Run --> Run Configurations, in the Arguments tab, supply the three parameters of main: the input path, the path where the KMeans centers are stored, and the output path, separated by spaces.

They are, respectively:

hdfs://192.168.56.171:9000/cluster

hdfs://192.168.56.171:9000/center

hdfs://192.168.56.171:9000/ouput

You can use your own IP address here, or localhost. A small argument-handling sketch follows below.
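These three paths map onto args[0] (input), args[1] (center path) and args[2] (output) in the driver listed in Step 6. As a purely hypothetical sanity check, not part of the original source, you could put something like this at the top of main so a missing argument fails fast:

// Hypothetical argument check, assuming the three paths described above
if (args.length != 3) {
    System.err.println("Usage: KMeans <input path> <center path> <output path>");
    System.exit(1);
}
System.out.println("input  = " + args[0]);   // e.g. hdfs://192.168.56.171:9000/cluster
System.out.println("center = " + args[1]);   // e.g. hdfs://192.168.56.171:9000/center
System.out.println("output = " + args[2]);   // e.g. hdfs://192.168.56.171:9000/ouput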

Step 6: Modify part of the configuration code; the full code is listed below for reference.

Main program: KMeans.java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeans {

    public static void main(String[] args) throws Exception {
        // Pick the initial centers and write them to the center file
        CenterInitial centerInitial = new CenterInitial();
        centerInitial.run(args);

        int times = 0;
        double s = 0, shold = 0.0001;  // s: largest center shift of the last iteration; shold: convergence threshold
        do {
            Configuration conf = new Configuration();
            conf.set("fs.default.name", "hdfs://192.168.56.171:9000");

            Job job = new Job(conf, "KMeans");
            job.setJarByClass(KMeans.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(KMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(KReducer.class);

            // Clear the output directory of the previous iteration
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(args[2]), true);

            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Run one iteration of the MapReduce job
            if (job.waitForCompletion(true)) {
                // Compute the new centers and how far they moved
                NewCenter newCenter = new NewCenter();
                s = newCenter.run(args);
                times++;
            }
        } while (s > shold);
        System.out.println("Iterator:" + times);
    }
}

Center initialization: CenterInitial.java

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CenterInitial {

    public void run(String[] args) throws IOException {
        String[] clist;
        int k = 5;                              // number of initial centers to pick
        String string = "";
        String inpath = args[0] + "/cluster";   // input points
        String outpath = args[1] + "/center";   // center file
        Configuration conf1 = new Configuration();       // read the Hadoop file system configuration
        conf1.set("hadoop.job.ugi", "hadoop,hadoop");

        // FileSystem is the core class for working with HDFS; it is obtained from the file's URI
        FileSystem fs = FileSystem.get(URI.create(inpath), conf1);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(inpath));
            IOUtils.copyBytes(in, out, 50, false);   // copy the file's bytes into the buffer with Hadoop's IOUtils
            clist = out.toString().split(" ");       // split into "(x,y)" tokens
        } finally {
            IOUtils.closeStream(in);
        }

        // Randomly pick k distinct points as the initial centers
        FileSystem filesystem = FileSystem.get(URI.create(outpath), conf1);
        for (int i = 0; i < k; i++) {
            int j = (int) (Math.random() * 100) % clist.length;
            if (string.contains(clist[j])) {   // already chosen, pick another one
                k++;
                continue;
            }
            string = string + clist[j].replace(" ", "") + " ";
        }

        // Write the initial centers to the center file
        OutputStream out2 = filesystem.create(new Path(outpath));
        IOUtils.copyBytes(new ByteArrayInputStream(string.getBytes()), out2, 4096, true);
        System.out.println(string);
    }
}

KMapper.java

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KMapper extends Mapper<LongWritable, Text, Text, Text> {

    private String[] center;

    // Read the current center list from HDFS and save it into center[]
    protected void setup(Context context) throws IOException, InterruptedException {
        String centerlist = "hdfs://192.168.56.171:9000/center/center";   // center file
        Configuration conf1 = new Configuration();
        conf1.set("hadoop.job.ugi", "hadoop-user,hadoop-user");
        FileSystem fs = FileSystem.get(URI.create(centerlist), conf1);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(centerlist));
            IOUtils.copyBytes(in, out, 100, false);
            center = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
    }

    // For every input point, emit (nearest center, point)
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String outValue = new String(itr.nextToken());
            String[] list = outValue.replace("(", "").replace(")", "").split(",");

            // Squared Euclidean distance to the first center as the starting minimum
            String[] c = center[0].replace("(", "").replace(")", "").split(",");
            float min = 0;
            int pos = 0;
            for (int i = 0; i < list.length; i++) {
                min += (float) Math.pow((Float.parseFloat(list[i]) - Float.parseFloat(c[i])), 2);
            }

            // Find the closest center
            for (int i = 0; i < center.length; i++) {
                String[] centerStrings = center[i].replace("(", "").replace(")", "").split(",");
                float distance = 0;
                for (int j = 0; j < list.length; j++)
                    distance += (float) Math.pow((Float.parseFloat(list[j]) - Float.parseFloat(centerStrings[j])), 2);
                if (min > distance) {
                    min = distance;
                    pos = i;
                }
            }
            context.write(new Text(center[pos]), new Text(outValue));
        }
    }
}

KReducer.java

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class KReducer extends Reducer<Text, Text, Text, Text> {

    // For each center, average the points assigned to it to get the new center
    public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
        String outVal = "";
        int count = 0;
        String center = "";
        int length = key.toString().replace("(", "").replace(")", "").replace(":", "").split(",").length;
        float[] ave = new float[length];   // per-dimension sum, then the mean
        for (int i = 0; i < length; i++)
            ave[i] = 0;

        for (Text val : value) {
            outVal += val.toString() + " ";
            String[] tmp = val.toString().replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < tmp.length; i++)
                ave[i] += Float.parseFloat(tmp[i]);
            count++;
        }

        // Build the "(x,y)" string for the new center
        for (int i = 0; i < length; i++) {
            ave[i] = ave[i] / count;
            if (i == 0)
                center += "(" + ave[i] + ",";
            else if (i == length - 1)
                center += ave[i] + ")";
            else
                center += ave[i] + ",";
        }
        System.out.println(center);
        context.write(key, new Text(outVal + center));
    }
}

NewCenter.java

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class NewCenter {

    int k = 3;                          // number of clusters
    float shold = Integer.MIN_VALUE;    // largest squared shift of any center in this iteration
    String[] line;
    String newcenter = new String("");

    // Read the reducer output, extract the new centers, overwrite the center file,
    // and return the largest center shift so the driver can test for convergence
    public float run(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(args[2] + "/part-r-00000"), conf);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(args[2] + "/part-r-00000"));
            IOUtils.copyBytes(in, out, 50, false);
            line = out.toString().split("\n");
        } finally {
            IOUtils.closeStream(in);
        }
        System.out.println(out.toString());

        for (int i = 0; i < k; i++) {
            // Each line is: old center, tab, the assigned points, then the new center
            String[] l = line[i].replace("\t", " ").split(" ");
            String[] startCenter = l[0].replace("(", "").replace(")", "").split(",");
            String[] finalCenter = l[l.length - 1].replace("(", "").replace(")", "").split(",");

            // Squared distance between the old and the new center
            float tmp = 0;
            for (int j = 0; j < startCenter.length; j++)
                tmp += Math.pow(Float.parseFloat(startCenter[j]) - Float.parseFloat(finalCenter[j]), 2);

            newcenter = newcenter + l[l.length - 1].replace("\t", "") + " ";
            if (shold <= tmp)
                shold = tmp;
        }

        // Overwrite the center file with the new centers
        OutputStream out2 = fs.create(new Path(args[1] + "/center"));
        IOUtils.copyBytes(new ByteArrayInputStream(newcenter.getBytes()), out2, 4096, true);
        System.out.println(newcenter);
        return shold;
    }
}

Output:

13/05/24 11:20:29 INFO mapred.Task: Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting
13/05/24 11:20:29 INFO mapred.LocalJobRunner:
13/05/24 11:20:29 INFO mapred.Task: Task attempt_local_0004_r_000000_0 is allowed to commit now
13/05/24 11:20:29 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0004_r_000000_0' to hdfs://192.168.56.171:9000/ouput
13/05/24 11:20:30 INFO mapred.JobClient:  map 100% reduce 0%
13/05/24 11:20:32 INFO mapred.LocalJobRunner: reduce > reduce
13/05/24 11:20:32 INFO mapred.Task: Task 'attempt_local_0004_r_000000_0' done.
13/05/24 11:20:33 INFO mapred.JobClient:  map 100% reduce 100%
13/05/24 11:20:33 INFO mapred.JobClient: Job complete: job_local_0004
13/05/24 11:20:33 INFO mapred.JobClient: Counters: 22
13/05/24 11:20:33 INFO mapred.JobClient:   File Output Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Written=230
13/05/24 11:20:33 INFO mapred.JobClient:   FileSystemCounters
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_READ=3843
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_READ=2896
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=326968
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=1916
13/05/24 11:20:33 INFO mapred.JobClient:   File Input Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Read=121
13/05/24 11:20:33 INFO mapred.JobClient:   Map-Reduce Framework
13/05/24 11:20:33 INFO mapred.JobClient:     Map output materialized bytes=469
13/05/24 11:20:33 INFO mapred.JobClient:     Map input records=1
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/05/24 11:20:33 INFO mapred.JobClient:     Spilled Records=30
13/05/24 11:20:33 INFO mapred.JobClient:     Map output bytes=433
13/05/24 11:20:33 INFO mapred.JobClient:     Total committed heap usage (bytes)=352845824
13/05/24 11:20:33 INFO mapred.JobClient:     CPU time spent (ms)=0
13/05/24 11:20:33 INFO mapred.JobClient:     SPLIT_RAW_BYTES=107
13/05/24 11:20:33 INFO mapred.JobClient:     Combine input records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input records=15
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input groups=3
13/05/24 11:20:33 INFO mapred.JobClient:     Combine output records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Physical memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce output records=3
13/05/24 11:20:33 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Map output records=15
13/05/24 11:20:33 INFO mapred.JobClient: Running job: job_local_0004
13/05/24 11:20:33 INFO mapred.JobClient: Job complete: job_local_0004
13/05/24 11:20:33 INFO mapred.JobClient: Counters: 22
13/05/24 11:20:33 INFO mapred.JobClient:   File Output Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Written=230
13/05/24 11:20:33 INFO mapred.JobClient:   FileSystemCounters
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_READ=3843
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_READ=2896
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=326968
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=1916
13/05/24 11:20:33 INFO mapred.JobClient:   File Input Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Read=121
13/05/24 11:20:33 INFO mapred.JobClient:   Map-Reduce Framework
13/05/24 11:20:33 INFO mapred.JobClient:     Map output materialized bytes=469
13/05/24 11:20:33 INFO mapred.JobClient:     Map input records=1
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/05/24 11:20:33 INFO mapred.JobClient:     Spilled Records=30
13/05/24 11:20:33 INFO mapred.JobClient:     Map output bytes=433
13/05/24 11:20:33 INFO mapred.JobClient:     Total committed heap usage (bytes)=352845824
13/05/24 11:20:33 INFO mapred.JobClient:     CPU time spent (ms)=0
13/05/24 11:20:33 INFO mapred.JobClient:     SPLIT_RAW_BYTES=107
13/05/24 11:20:33 INFO mapred.JobClient:     Combine input records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input records=15
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input groups=3
13/05/24 11:20:33 INFO mapred.JobClient:     Combine output records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Physical memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce output records=3
13/05/24 11:20:33 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Map output records=15
(19.0,39.0)  (19,39) (19.0,39.0)
(20.571428,31.571428)  (20,30) (20,32) (24,34) (20,32) (20,30) (20,31) (20,32) (20.571428,31.571428)
(51.285713,66.42857)  (50,65) (50,77) (50,64) (59,67) (50,67) (50,61) (50,64) (51.285713,66.42857)
(19.0,39.0) (20.571428,31.571428) (51.285713,66.42857)
Iterator:4
