WordCount程序详解

最新推荐文章于 2023-10-09 15:06:54 发布

小飞侠-2

最新推荐文章于 2023-10-09 15:06:54 发布

阅读量274

点赞数

文章标签：大数据 java

[html]view plaincopyprint? 
   
 输入：hello hadoop

[html]view plaincopyprint? 
   
 hello word

下面是map函数

[java]view plaincopyprint? 
   
 import java.io.IOException;
 import java.util.StringTokenizer;

 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
  
 publicclassMapperClassextendsMapper<Object,Text,Text,IntWritable>{//四个参数的含义，前两个输入，后两个输出 
 ，类型需一致 
 publicTextkeytext=newText("text");//定义一个text对象，用来充当中间变量，存储词 
 publicIntWritableintvalue=newIntWritable(1);//词的个数，刚开始都为1，也可以不定义，直接context.write(keytext,1); 
 @Override 
 protectedvoidmap(Objectkey,Textvalue, 
 Contextcontext)//key即行偏移量，作用不大，主要是value，根据value进行拆分 
 throwsIOException,InterruptedException{ 
 //获取值 
 Stringstr=value.toString(); 
 //分隔 
 StringTokenizerstringTokenizer=newStringTokenizer(str);//StringTokenizer根据空格等分隔字符串到stringTokenizer 
 while(stringTokenizer.hasMoreElements()){//返回是否还有分隔符，判断是否还有单词 
 keytext.set(stringTokenizer.nextToken());//nextToken()：返回从当前位置到下一个分隔符的字符串。 
 context.write(keytext,intvalue);//context.write("hello",1) 
  
 } 
  
 } 
 } 

reduce函数

[java]view plaincopyprint? 
   
 import java.io.IOException;
 import java.util.Iterator;

 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
  
 publicclassReducerClassextendsReducer<Text,IntWritable,Text,IntWritable>{//前两个输入：例：（hello，1），后两个输出（hello，2） 
  
 publicIntWritableintValue=newIntWritable(0); 
  
 @Override 
 protectedvoidreduce(Textkey,Iterable<IntWritable>values,//这里声明了一个实现Iterator接口的匿名内部类，并返回了内部类的实例 
 Contextcontext)//它用来与MapReduce系统进行通信，如把map的结果传给reduce处理 
 throwsIOException,InterruptedException{ 
 //step1 
 intsum=0; 
 Iterator<IntWritable>itr=values.iterator();//迭代器，访问容器中的元素，为容器而生 
 while(itr.hasNext()){ 
  
 sum+=itr.next().get();//如果有，则加入迭代器中的个数 
  
 } 
 intValue.set(sum);//对于hello，sum是2 
 context.write(key,intValue);//hello，2 
 } 
 } 

主函数：

[java]view plaincopyprint? 
   
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.GenericOptionsParser;
  
 publicclassWordCount{ 
 publicstaticvoidmain(String[]args)throwsException{ 
 Configurationconf=newConfiguration();//指定作业执行规范 
 String[]otherArgs=newGenericOptionsParser(conf,args).getRemainingArgs(); 
 if(otherArgs.length!=2) 
 { 
 System.err.println("Usage:wordcount<in><out>"); 
 System.exit(2); 
 } 
 Jobjob=newJob(conf,"wordcount");//指定job名称，及运行对象 
 job.setJarByClass(WordCount.class); 
 job.setMapperClass(MapperClass.class);//指定map函数 
 job.setCombinerClass(ReducerClass.class);//是否需要conbiner整合 
 job.setReducerClass(ReducerClass.class);//指定reduce函数 
 job.setOutputKeyClass(Text.class);//输出key格式 
 job.setOutputValueClass(IntWritable.class);//输出value格式 
 FileInputFormat.addInputPath(job,newPath(otherArgs[0]));//处理文件路径 
 FileOutputFormat.setOutputPath(job,newPath(otherArgs[1]));//结果输出路径 
 System.exit(job.waitForCompletion(true)?0:1); 
 } 
 }