Hadoop Programming Assignment Part 1: Computing with MapReduce

Map takes one input key-value record <ik, iv> and emits multiple key-value records <mk, mv>; Reduce takes an mk together with all of its associated mv values and emits multiple key-value records <ok, ov>. As an example, the instructor gave the WordCount program, and there is a blog post online that documents it particularly well, shown below.
[Figure: WordCount MapReduce data flow, from the referenced blog post]
Here is a brief introduction to how the Mapper and Reducer code is written.
[Figures: the WordCount Mapper and Reducer code]
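
For readers who cannot see the figures, here is a minimal sketch of the classic WordCount program, essentially the example from the Hadoop MapReduce tutorial (the code in the figures may differ in minor details). The Mapper turns each input line into <word, 1> pairs, and the Reducer sums the counts for each word:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  // Mapper: <ik, iv> = <byte offset, line of text>  ->  <mk, mv> = <word, 1>
  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  // Reducer: <mk, list of mv> = <word, [1, 1, ...]>  ->  <ok, ov> = <word, total count>
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  // Driver: configures and submits the job
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}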
Building on this example, the code is modified to accomplish the target task shown below. The idea is simple and direct: a plain map + reduce with two classes, one extending Mapper and one extending Reducer.
[Figure: target task description]
Below is the relevant part of the README, with my modifications to its content.

1. start hadoop
   $ service ssh start
   $ start-dfs.sh
   $ start-yarn.sh
   // Note: the input file must be put into HDFS before the job can use it
   put input into hdfs
   $ hadoop fs -mkdir /hw2
   $ hadoop fs -put input_0 /hw2
   $ hadoop fs -ls -R /hw2

2. Example: Hw2part1.java

   (1) edit Hw2part1.java  (have a look at the code)

   (2) edit Hw2part1-manifest.txt (have a look at this file; a sketch of its likely content follows step (5) below)

   (3) compile and generate jar
   $ rm -f *.class *.jar
   $ javac Hw2part1.java
   $ jar cfm Hw2part1.jar Hw2part1-manifest.txt Hw2part1*.class
  // jar usage: jar cvf classes.jar Foo.class Bar.class
  // packages Foo.class and Bar.class into classes.jar and prints verbose output to standard output
   (4) remove output hdfs directory then run MapReduce job
   $ hdfs dfs -rm -f -r /hw2/output
   $ hadoop jar ./Hw2part1.jar /hw2/input_0 /hw2/output  
   // Note: the output is in HDFS and can be listed with hadoop fs -ls -R /hw2;
   // at first I thought it would be generated in the current local directory

   (5) display output
   $ hdfs dfs -cat '/hw2/output/part-*'
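
   As a side note, the manifest file edited in step (2) is not reproduced in this post. Assuming it follows
   the usual format for a runnable jar, it would contain little more than a Main-Class entry such as the
   following (the actual course-provided file may differ); note that the manifest must end with a newline:

   Main-Class: Hw2part1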

3. Homework 2 part 1 specification

  (1) java class name: Hw2Part1

  (2) command line:

  $ hadoop jar ./Hw2Part1.jar <input file> <output directory>

  <input file> : on hdfs
  <output directory> : on hdfs, it is removed before running the command

  (3) input file format
  every line consists of 3 fields separated by spaces:

     <source> <destination> <duration>

  (4) output file format
  every line should consist of four fields:

     <source> <destination> <count> <average duration>

  the four fields are separated by either space or tab
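
  As a concrete illustration of the two formats (the numbers here are made up and are not from the real input_0),
  three well-formed records with the same source and destination collapse into a single output line:

     input:
        a b 1.0
        a b 2.0
        a b 3.5

     output:
        a b 3 2.167      (count = 3, average duration = (1.0 + 2.0 + 3.5) / 3 ≈ 2.167)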

The figure below shows the jar generated with the commands above once the program is written.
[Figure: generated jar file]
The figure below shows the files put into and produced in HDFS.
[Figure: HDFS listing of /hw2]
Once the workflow is clear, it is time to implement the functionality required by the task.

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified by Shimin Chen to demonstrate functionality for Homework 2
// April-May 2015

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.text.DecimalFormat;

public class Hw2part1 {

  // This is the Mapper class
  // reference: http://hadoop.apache.org/docs/r2.6.0/api/org/apache/hadoop/mapreduce/Mapper.html
  //
  public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {
        Text newKey = new Text();
        Text newValue = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
              int count = 1;
              // I find this part quite neat: Hadoop's run() keeps calling map() for each input record,
              // so a malformed line can simply be skipped by returning early. I first tried continue,
              // but leaving a loop iteration and leaving a function are different things: loops use
              // continue or break, while a function exits with return.
              StringTokenizer itr = new StringTokenizer(value.toString());
              if (itr.countTokens() != 3) {
                  return;
              }
              String source = itr.nextToken();
              String destination = itr.nextToken();
              String time_str = itr.nextToken();
              newKey.set(source + " " + destination);
              newValue.set(Integer.toString(count) + " " + time_str);
              context.write(newKey, newValue);
        }
  }


  public static class FloatAvgReducer extends Reducer<Text,Text,Text,Text> {

        private Text result_key = new Text();
        private Text result_value = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
              float sum = 0;
              int count = 0;
              for (Text val : values) {
                String val_tmp = val.toString();
                StringTokenizer str = new StringTokenizer(val_tmp);
                int cnum = Integer.valueOf(str.nextToken());
                float time = Float.valueOf(str.nextToken());
                sum += time;
                count += cnum;
              }
              // generate result key
              result_key.set(key);
              // generate result value: round the average to 3 decimal places and emit it as a string
              double avg_value = (double)(sum / count);
              avg_value = (double)(Math.round(avg_value * 1000) / 1000.0);
              DecimalFormat df = new DecimalFormat("#.000");
              String avg_result = df.format(avg_value);
              result_value.set(Integer.toString(count) + " " + avg_result);
              context.write(result_key, result_value);
        }
  }

  public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
          System.err.println("Usage: Hw2part1 <in> [<in>...] <out>");
          System.exit(2);
        }

        Job job = Job.getInstance(conf, "FloatAvgcount");

        job.setJarByClass(Hw2part1.class);

        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(FloatAvgReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // add the input paths as given by command line
        for (int i = 0; i < otherArgs.length - 1; ++i) {
          FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }

        // add the output path as given by the command line
        FileOutputFormat.setOutputPath(job,
          new Path(otherArgs[otherArgs.length - 1]));
        // waitForCompletion in main is what makes Hadoop call run()
        System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
    
    
}
// Note: shuffle and the like are implemented by the framework under the hood; besides calling the map and reduce you wrote, the framework also runs other stages.
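
A quick aside on the three-decimal output in the reducer: Math.round(avg_value * 1000) / 1000.0 rounds the average, and DecimalFormat("#.000") turns it into a fixed three-decimal string. A tiny stand-alone sketch (not part of the submitted job, and assuming a locale that uses '.' as the decimal separator) of how that pattern behaves:

import java.text.DecimalFormat;

public class FormatDemo {
    public static void main(String[] args) {
        DecimalFormat df = new DecimalFormat("#.000");
        System.out.println(df.format(10.0 / 3)); // prints 3.333
        System.out.println(df.format(2.5));      // prints 2.500
        // With the "#" integer digit, values below 1 lose the leading zero:
        System.out.println(df.format(0.75));     // prints .750 (a "0.000" pattern would print 0.750)
    }
}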

OK, with the code above written, it is time to verify it with input_0. I tweaked input_0 as shown below; the job has to pick out the well-formed records and compute the average duration rounded to three decimal places.
[Figure: modified input_0]
The test result is shown below, and the test passed for the full 2 points.
[Figure: test output]
Pretty much perfect. Time for bed!
