I am new to Hadoop and have already tested the hello-world-level WordCount program. Since I plan to use Hadoop for cluster analysis, this time I tested KMeans on two-dimensional data. The code comes from http://download.csdn.net/detail/tinycui/4384750#comment. Because commenters there noted the documentation was thin, I describe my test procedure (pseudo-distributed mode) in detail here for reference, with thanks to tinycui for providing the source download.
Step1: Set up Eclipse and Hadoop; there are plenty of guides for this online.
Step2: Create a new Project --> Map/Reduce Project named KMeans. Remember to choose Map/Reduce Project here, otherwise adding the code will produce import errors.
Step3: Download the KMeans source from the link above, copy its src and bin folders over those of your new project, and refresh the KMeans project in Eclipse.
Step4: In DFS create two folders, center and cluster. Into the center folder upload an empty file named center, which will hold the center values of each iteration; into the cluster folder upload a file named cluster containing the input data, formatted as:
(20,30) (50,61) (20,32) (50,64) (59,67) (24,34) (19,39) (20,32) (50,65) (50,77) (20,30) (20,31) (20,32) (50,64) (50,67)
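If you would rather script this step than click through the Eclipse DFS view, the same folders and files can be created with the HDFS Java API. This is only a minimal sketch, not part of the downloaded project; the NameNode address is the one used throughout this post, and the class name PrepareInput is made up:

import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PrepareInput {   // hypothetical helper, not in the original project
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.56.171:9000"), conf);
        fs.create(new Path("/center/center")).close();   // empty file to hold the centers
        OutputStream out = fs.create(new Path("/cluster/cluster"));
        out.write("(20,30) (50,61) (20,32) (50,64) (59,67) (24,34) (19,39) (20,32) (50,65) (50,77) (20,30) (20,31) (20,32) (50,64) (50,67)".getBytes());
        out.close();
    }
}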
Step5: Configure the arguments to main. Under Run --> Run Configurations, in Arguments, supply the three parameters of main, separated by spaces: the input path, the path that stores the KMeans centers, and the output path.
They are, respectively:
hdfs://192.168.56.171:9000/cluster
hdfs://192.168.56.171:9000/center
hdfs://192.168.56.171:9000/ouput
The IP here can be your own machine's address or localhost.
Step6: Adjust parts of the configuration in the code, as follows:
Main program KMeans.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KMeans {
    public static void main(String[] args) throws Exception {
        CenterInitial centerInitial = new CenterInitial();
        centerInitial.run(args);                   // pick the initial centers
        int times = 0;
        double s = 0, shold = 0.0001;
        do {
            Configuration conf = new Configuration();
            conf.set("fs.default.name", "hdfs://192.168.56.171:9000");
            Job job = new Job(conf, "KMeans");
            job.setJarByClass(KMeans.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setMapperClass(KMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(KReducer.class);
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(args[2]), true);    // clear the output directory before each pass
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));
            if (job.waitForCompletion(true))       // one assignment + averaging pass
            {
                NewCenter newCenter = new NewCenter();
                s = newCenter.run(args);           // largest squared shift of any center
                times++;
            }
        } while (s > shold);                       // stop once the centers barely move
        System.out.println("Iterator: " + times);
    }
}
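One point worth spelling out: NewCenter.run (listed further below) returns the largest squared distance that any center moved between two iterations, and the loop exits once that value is no greater than shold = 0.0001. Stripped of the string parsing, the test amounts to the following sketch; the class name and the sample arrays are made up for illustration:

public class ShiftCheck {   // hypothetical illustration, not part of the original sources
    static float maxSquaredShift(float[][] oldC, float[][] newC) {
        float max = 0;
        for (int i = 0; i < oldC.length; i++) {
            float d = 0;
            for (int j = 0; j < oldC[i].length; j++) {
                float diff = oldC[i][j] - newC[i][j];
                d += diff * diff;   // squared Euclidean shift of center i
            }
            if (d > max)
                max = d;            // keep the largest shift, as NewCenter does via shold
        }
        return max;
    }
    public static void main(String[] args) {
        float[][] oldC = { {20.0f, 30.0f}, {50.0f, 61.0f} };
        float[][] newC = { {20.5f, 31.5f}, {51.2f, 66.4f} };
        System.out.println(maxSquaredShift(oldC, newC));   // ≈ 30.6, far above 0.0001, so iterate again
    }
}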
Center initialization CenterInitial.java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CenterInitial {
    public void run(String[] args) throws IOException {
        String[] clist;
        int k = 5;
        String string = "";
        String inpath = args[0] + "/cluster";    // input data file
        String outpath = args[1] + "/center";    // center file
        Configuration conf1 = new Configuration();   // load the Hadoop file-system configuration
        conf1.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(inpath), conf1);   // FileSystem is the core class for working with HDFS; this obtains the file system behind the URI
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(inpath));
            IOUtils.copyBytes(in, out, 50, false);   // copy the file's bytes into the buffer with Hadoop's IOUtils helper
            clist = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
        FileSystem filesystem = FileSystem.get(URI.create(outpath), conf1);
        for (int i = 0; i < k; i++) {
            int j = (int) (Math.random() * 100) % clist.length;
            if (string.contains(clist[j]))   // chose the same point again, retry
            {
                k++;
                continue;
            }
            string = string + clist[j].replace(" ", "") + " ";
        }
        OutputStream out2 = filesystem.create(new Path(outpath));
        IOUtils.copyBytes(new ByteArrayInputStream(string.getBytes()), out2, 4096, true);   // write the chosen centers
        System.out.println(string);
    }
}
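Note that the sampling loop above retries (via k++ and continue) whenever it draws a point it has already chosen, which can spin for a long time on small or duplicate-heavy data. A more robust alternative, shown here only as a sketch and not as a change to the original code, is to de-duplicate, shuffle, and take the first k points:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;

public class InitAlternative {   // hypothetical, not part of the original project
    public static void main(String[] args) {
        String data = "(20,30) (50,61) (20,32) (50,64) (59,67)";
        // de-duplicate first, then shuffle and take the first k distinct points
        List<String> points = new ArrayList<>(new LinkedHashSet<>(Arrays.asList(data.split(" "))));
        Collections.shuffle(points);
        int k = Math.min(3, points.size());
        System.out.println(String.join(" ", points.subList(0, k)));
    }
}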
KMapper.java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URI;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String[] center;

    protected void setup(Context context) throws IOException, InterruptedException   // read the center list and save it to center[]
    {
        String centerlist = "hdfs://192.168.56.171:9000/center/center";   // the center file
        Configuration conf1 = new Configuration();
        conf1.set("hadoop.job.ugi", "hadoop-user,hadoop-user");
        FileSystem fs = FileSystem.get(URI.create(centerlist), conf1);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(centerlist));
            IOUtils.copyBytes(in, out, 100, false);
            center = out.toString().split(" ");
        } finally {
            IOUtils.closeStream(in);
        }
    }

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            String outValue = new String(itr.nextToken());
            String[] list = outValue.replace("(", "").replace(")", "").split(",");
            String[] c = center[0].replace("(", "").replace(")", "").split(",");
            float min = 0;
            int pos = 0;
            for (int i = 0; i < list.length; i++) {
                min += (float) Math.pow((Float.parseFloat(list[i]) - Float.parseFloat(c[i])), 2);   // distance to the first center as the initial minimum
            }
            for (int i = 0; i < center.length; i++) {
                String[] centerStrings = center[i].replace("(", "").replace(")", "").split(",");
                float distance = 0;
                for (int j = 0; j < list.length; j++)
                    distance += (float) Math.pow((Float.parseFloat(list[j]) - Float.parseFloat(centerStrings[j])), 2);
                if (min > distance)   // found a closer center
                {
                    min = distance;
                    pos = i;
                }
            }
            context.write(new Text(center[pos]), new Text(outValue));   // emit (nearest center, point)
        }
    }
}
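To make the mapper's logic concrete: for each (x,y) token it computes the squared Euclidean distance to every current center and emits the point keyed by the nearest one. The same computation in isolation, with made-up centers and a made-up class name, looks like this:

public class NearestCenterDemo {   // hypothetical illustration, not part of the original project
    public static void main(String[] args) {
        String[] center = { "(20,31)", "(50,65)" };   // made-up current centers
        String point = "(24,34)";
        String[] p = point.replace("(", "").replace(")", "").split(",");
        int pos = 0;
        float min = Float.MAX_VALUE;
        for (int i = 0; i < center.length; i++) {
            String[] c = center[i].replace("(", "").replace(")", "").split(",");
            float d = 0;
            for (int j = 0; j < p.length; j++) {
                float diff = Float.parseFloat(p[j]) - Float.parseFloat(c[j]);
                d += diff * diff;   // squared Euclidean distance, same as the mapper
            }
            if (d < min) { min = d; pos = i; }
        }
        System.out.println(point + " -> " + center[pos]);   // (24,34) -> (20,31)
    }
}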
KReducer.java
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class KReducer extends Reducer<Text, Text, Text, Text> {
    public void reduce(Text key, Iterable<Text> value, Context context) throws IOException, InterruptedException {
        String outVal = "";
        int count = 0;
        String center = "";
        int length = key.toString().replace("(", "").replace(")", "").replace(":", "").split(",").length;
        float[] ave = new float[length];
        for (int i = 0; i < length; i++)
            ave[i] = 0;
        for (Text val : value) {
            outVal += val.toString() + " ";
            String[] tmp = val.toString().replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < tmp.length; i++)
                ave[i] += Float.parseFloat(tmp[i]);   // coordinate-wise sum over the cluster
            count++;
        }
        for (int i = 0; i < length; i++) {
            ave[i] = ave[i] / count;                  // component-wise mean
            if (i == 0)
                center += "(" + ave[i] + ",";
            else if (i == length - 1)
                center += ave[i] + ")";
            else
                center += ave[i] + ",";
        }
        System.out.println(center);
        context.write(key, new Text(outVal + center));   // old center \t points ... new center
    }
}
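The reducer itself is plain arithmetic: sum each coordinate over a cluster's points and divide by the point count. As a sanity check against the output at the end of this post, averaging the seven points of the second cluster reproduces its printed center; the class name here is made up:

public class MeanDemo {   // hypothetical illustration of the reducer's averaging
    public static void main(String[] args) {
        String[] cluster = { "(20,30)", "(20,32)", "(24,34)", "(20,32)", "(20,30)", "(20,31)", "(20,32)" };
        float[] ave = new float[2];
        for (String s : cluster) {
            String[] t = s.replace("(", "").replace(")", "").split(",");
            for (int i = 0; i < t.length; i++)
                ave[i] += Float.parseFloat(t[i]);   // coordinate-wise sum
        }
        for (int i = 0; i < ave.length; i++)
            ave[i] /= cluster.length;               // divide by the cluster size
        System.out.println("(" + ave[0] + "," + ave[1] + ")");   // ≈ (20.571428,31.571428)
    }
}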
NewCenter.java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class NewCenter {
    int k = 3;
    float shold = Integer.MIN_VALUE;
    String[] line;
    String newcenter = new String("");

    public float run(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("hadoop.job.ugi", "hadoop,hadoop");
        FileSystem fs = FileSystem.get(URI.create(args[2] + "/part-r-00000"), conf);
        FSDataInputStream in = null;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try {
            in = fs.open(new Path(args[2] + "/part-r-00000"));
            IOUtils.copyBytes(in, out, 50, false);
            line = out.toString().split("\n");
        } finally {
            IOUtils.closeStream(in);
        }
        System.out.println(out.toString());
        for (int i = 0; i < k; i++) {
            String[] l = line[i].replace("\t", " ").split(" ");
            String[] startCenter = l[0].replace("(", "").replace(")", "").split(",");
            String[] finalCenter = l[l.length - 1].replace("(", "").replace(")", "").split(",");
            float tmp = 0;
            for (int j = 0; j < startCenter.length; j++)
                tmp += Math.pow(Float.parseFloat(startCenter[j]) - Float.parseFloat(finalCenter[j]), 2);   // squared shift of center i
            newcenter = newcenter + l[l.length - 1] + " ";
            if (shold <= tmp)
                shold = tmp;   // keep the largest shift over all k centers
        }
        OutputStream out2 = fs.create(new Path(args[1] + "/center"));   // overwrite the center file for the next iteration
        IOUtils.copyBytes(new ByteArrayInputStream(newcenter.getBytes()), out2, 4096, true);
        System.out.println(newcenter);
        return shold;
    }
}
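Since NewCenter re-parses the reduce output as text, it helps to see the line format it expects: the old center (the key), a tab, the cluster's points, and finally the new center that KReducer appended. A tiny made-up example of that parsing:

public class LineParseDemo {   // hypothetical: how one reduce output line is dissected
    public static void main(String[] args) {
        // key \t values + new center, as written by KReducer
        String line = "(20,31)\t(20,30) (20,32) (20.0,31.0)";
        String[] l = line.replace("\t", " ").split(" ");
        System.out.println("old center: " + l[0]);             // (20,31)
        System.out.println("new center: " + l[l.length - 1]);  // (20.0,31.0)
    }
}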
Output:
13/05/24 11:20:29 INFO mapred.Task: Task:attempt_local_0004_r_000000_0 is done. And is in the process of commiting
13/05/24 11:20:29 INFO mapred.LocalJobRunner:
13/05/24 11:20:29 INFO mapred.Task: Task attempt_local_0004_r_000000_0 is allowed to commit now
13/05/24 11:20:29 INFO output.FileOutputCommitter: Saved output of task 'attempt_local_0004_r_000000_0' to hdfs://192.168.56.171:9000/ouput
13/05/24 11:20:30 INFO mapred.JobClient:  map 100% reduce 0%
13/05/24 11:20:32 INFO mapred.LocalJobRunner: reduce > reduce
13/05/24 11:20:32 INFO mapred.Task: Task 'attempt_local_0004_r_000000_0' done.
13/05/24 11:20:33 INFO mapred.JobClient:  map 100% reduce 100%
13/05/24 11:20:33 INFO mapred.JobClient: Job complete: job_local_0004
13/05/24 11:20:33 INFO mapred.JobClient: Counters: 22
13/05/24 11:20:33 INFO mapred.JobClient:   File Output Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Written=230
13/05/24 11:20:33 INFO mapred.JobClient:   FileSystemCounters
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_READ=3843
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_READ=2896
13/05/24 11:20:33 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=326968
13/05/24 11:20:33 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=1916
13/05/24 11:20:33 INFO mapred.JobClient:   File Input Format Counters
13/05/24 11:20:33 INFO mapred.JobClient:     Bytes Read=121
13/05/24 11:20:33 INFO mapred.JobClient:   Map-Reduce Framework
13/05/24 11:20:33 INFO mapred.JobClient:     Map output materialized bytes=469
13/05/24 11:20:33 INFO mapred.JobClient:     Map input records=1
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/05/24 11:20:33 INFO mapred.JobClient:     Spilled Records=30
13/05/24 11:20:33 INFO mapred.JobClient:     Map output bytes=433
13/05/24 11:20:33 INFO mapred.JobClient:     Total committed heap usage (bytes)=352845824
13/05/24 11:20:33 INFO mapred.JobClient:     CPU time spent (ms)=0
13/05/24 11:20:33 INFO mapred.JobClient:     SPLIT_RAW_BYTES=107
13/05/24 11:20:33 INFO mapred.JobClient:     Combine input records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input records=15
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce input groups=3
13/05/24 11:20:33 INFO mapred.JobClient:     Combine output records=0
13/05/24 11:20:33 INFO mapred.JobClient:     Physical memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Reduce output records=3
13/05/24 11:20:33 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=0
13/05/24 11:20:33 INFO mapred.JobClient:     Map output records=15
(19.0,39.0)    (19,39) (19.0,39.0)
(20.571428,31.571428)    (20,30) (20,32) (24,34) (20,32) (20,30) (20,31) (20,32) (20.571428,31.571428)
(51.285713,66.42857)    (50,65) (50,77) (50,64) (59,67) (50,67) (50,61) (50,64) (51.285713,66.42857)
(19.0,39.0) (20.571428,31.571428) (51.285713,66.42857)
Iterator: 4