mapreduce几个例子

最新推荐文章于 2024-03-19 20:00:00 发布

珍惜每分每秒

最新推荐文章于 2024-03-19 20:00:00 发布

阅读量6.9k

点赞数 5

本文链接：https://blog.csdn.net/qq_36238595/article/details/78276950

版权

http://www.cnblogs.com/xia520pi/archive/2012/06/04/2534533.html 原文地址，动手敲敲实践下

1. 文本去重

要求：将文本中重复的行去掉，输出结果为所有无重复的行
运行大致流程：
map函数读取每行数据，输出k是这行数据，v是空的， shuffle后，自动完成去重，进入reduce函数，得到的key就是我们想要的数据

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Dedup {
    //map将输入中的value复制到输出数据的key上，并直接输出

    public static class Map extends Mapper<Object,Text,Text,Text>{

        private static Text line=new Text();//每行数据
        //实现map函数
        public void map(Object key,Text value,Context context)
                throws IOException,InterruptedException{
            line=value;
            context.write(line, new Text(""));
        }
    }
    //reduce将输入中的key复制到输出数据的key上，并直接输出

    public static class Reduce extends Reducer<Text,Text,Text,Text>{

        //实现reduce函数

        public void reduce(Text key,Iterable<Text> values,Context context)

                throws IOException,InterruptedException{

            context.write(key, new Text(""));
        }
    }



    public static void main(String[] args) throws Exception{
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "dedup");

     job.setJarByClass(Dedup.class);

     //设置Map、Combine和Reduce处理类

     job.setMapperClass(Map.class);

     job.setCombinerClass(Reduce.class);

     job.setReducerClass(Reduce.class);

     //设置输出类型
     job.setOutputKeyClass(Text.class);

     job.setOutputValueClass(Text.class);

     //设置输入和输出目录

     FileInputFormat.addInputPath(job, new Path(args[0]));

     FileOutputFormat.setOutputPath(job, new Path(args[1]));

     System.out.println(args[0]+"  "+args[1]);
     System.exit(job.waitForCompletion(true) ? 0 : 1);

     }

}

数据排序

要求：在文本中的数据如下

要对这些数据进行排序处理，得到如下结果

大致流程：
map函数收到一个数字，是字符串格式的，要先转成int，再转成IntWritable，把这个数字作为k，1作为v写出
shuffle过程将数字排序，并将可能相同数字形成value_list
reduce函数接收到的k为数字，v为该数字对应的value_list（数字每出现一次，列表中就有一个1），定义一个计数变量作为序号，按顺序输出即可实现sort


import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;
public class Sort {
    //map将输入中的value化成IntWritable类型，作为输出的key

    public static class Map extends Mapper<Object,Text,IntWritable,IntWritable>{

        private static IntWritable data=new IntWritable();
        //实现map函数

        public void map(Object key,Text value,Context context)

                throws IOException,InterruptedException{

            String line=value.toString();

            data.set(Integer.parseInt(line));

            context.write(data, new IntWritable(1));
        }
    }
    //reduce将输入中的key复制到输出数据的key上，

    //然后根据输入的value-list中元素的个数决定key的输出次数

    //用全局linenum来代表key的位次

    public static class Reduce extends

            Reducer<IntWritable,IntWritable,IntWritable,IntWritable>{

        private static IntWritable linenum = new IntWritable(1);

        //实现reduce函数

        public void reduce(IntWritable key,Iterable<IntWritable> values,Context context)

                throws IOException,InterruptedException{

            for(IntWritable val:values){

                context.write(linenum, key);

                linenum = new IntWritable(linenum.get()+1);

            }
        }
    }

    public static void main(String[] args) throws Exception{

        Configuration conf = new Configuration();

        Job job = new Job(conf, "Data Sort");

     job.setJarByClass(Sort.class);



     //设置Map和Reduce处理类
     job.setMapperClass(Map.class);
     job.setReducerClass(Reduce.class);


     //设置输出类型
     job.setOutputKeyClass(IntWritable.class);
     job.setOutputValueClass(IntWritable.class);

     //设置输入和输出目录
     FileInputFormat.addInputPath(job, new Path(args[0]));
     FileOutputFormat.setOutputPath(job, new Path(args[1]));
     System.exit(job.waitForCompletion(true) ? 0 : 1);
     }

}

统计平均分

分别有3个文件
file1:

张三    88

李四    99

王五    66

赵六    77

file2:

张三    78

李四    89

王五    96

赵六    67

file3:

张三    80

李四    82

王五    84

赵六    86

要求：对每个人求平均分

方法: 用FileInputFormat.addInputPath() 读取多个文件
map函数：将每行的name和score筛出来，用name作为key ，score作为value
shuffle：把name相同的key进行合并，形成<张三，<88,78,80>>
reduce函数：遍历value_list，求平均

import java.io.IOException;

import java.util.Iterator;

import java.util.StringTokenizer;



import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

public class Score {

    public static class Map extends

            Mapper<LongWritable, Text, Text, IntWritable> {

        // 实现map函数
        public void map(LongWritable key, Text value, Context context)

                throws IOException, InterruptedException {

            // 将输入的纯文本文件的数据转化成String

            String line = value.toString();



            // 将输入的数据首先按行进行分割

            StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");



            // 分别对每一行进行处理

            while (tokenizerArticle.hasMoreElements()) {

                // 每行按空格划分

                StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());



                String strName = tokenizerLine.nextToken();// 学生姓名部分

                String strScore = tokenizerLine.nextToken();// 成绩部分

                Text name = new Text(strName);
                int scoreInt = Integer.parseInt(strScore);
                // 输出姓名和成绩
                context.write(name, new IntWritable(scoreInt));
            }
        }
    }



    public static class Reduce extends

            Reducer<Text, IntWritable, Text, IntWritable> {

        // 实现reduce函数

        public void reduce(Text key, Iterable<IntWritable> values,

                Context context) throws IOException, InterruptedException {
            int sum = 0;
            int count = 0;

            Iterator<IntWritable> iterator = values.iterator();
            while (iterator.hasNext()) {

                sum += iterator.next().get();// 计算总分

                count++;// 统计总的科目数

            }



            int average = (int) sum / count;// 计算平均成绩

            context.write(key, new IntWritable(average));

        }
    }



    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();


        Job job = new Job(conf, "Score Average");

        job.setJarByClass(Score.class);



        // 设置Map、Combine和Reduce处理类

        job.setMapperClass(Map.class);

        job.setCombinerClass(Reduce.class);

        job.setReducerClass(Reduce.class);



        // 设置输出类型

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(IntWritable.class);



        // 将输入的数据集分割成小数据块splites，提供一个RecordReder的实现

        job.setInputFormatClass(TextInputFormat.class);

        // 提供一个RecordWriter的实现，负责数据输出

        job.setOutputFormatClass(TextOutputFormat.class);



        // 设置输入和输出目录
        for (int i = 0; i<args.length-1; i++){
            FileInputFormat.addInputPath(job, new Path(args[i]));
        }

        FileOutputFormat.setOutputPath(job, new Path(args[args.length-1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

}

珍惜每分每秒

关注

5
点赞
踩
23

收藏

觉得还不错? 一键收藏
0
评论
mapreduce几个例子

http://www.cnblogs.com/xia520pi/archive/2012/06/04/2534533.html 原文地址，动手敲敲实践下1. 文本去重要求：将文本中重复的行去掉，输出结果为所有无重复的行运行大致流程： map函数读取每行数据，输出k是这行数据，v是空的， shuffle后，自动完成去重，进入reduce函数，得到的key就是我们想要的数据impo
复制链接

扫一扫