1. Experiment Objectives
Understand the MapReduce workflow;
Master basic MapReduce programming.
2. Experiment Platform
Operating system: Linux (Ubuntu 16.04 recommended);
Hadoop version: 2.7.1;
JDK version: 1.7 or later;
Java IDE: IDEA
3. Experiment Content
Part 1. Word deduplication: remove duplicate words from a file and output the distinct words.
(1) Write the MapReduce code
(2) Compile and package the project
(3) Run the program with the hadoop jar command
(4) Check the output file from the console
Input:
one two three four five
one two three four
one two three
one two
hello world
hello China
hello fuzhou
hello hi
Output:
China
five
four
fuzhou
hello
hi
one
three
two
world
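Before turning to the Hadoop job itself, the transformation can be sanity-checked with a small plain-Java sketch (illustrative only, not part of the assignment code; the class and method names here are made up): the mapper's tokenization plus the shuffle's sorted key grouping amount to collecting every word into a sorted set.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;

public class DedupSketch {
    // Mimics the MapReduce job locally: tokenize each line on spaces,
    // then let a TreeSet de-duplicate and sort the words, the same way
    // the shuffle phase groups and orders the mapper's output keys.
    static List<String> dedup(List<String> lines) {
        TreeSet<String> words = new TreeSet<>();
        for (String line : lines) {
            for (String tok : line.split(" ")) {
                words.add(tok);
            }
        }
        return new ArrayList<>(words);
    }

    public static void main(String[] args) {
        System.out.println(dedup(Arrays.asList("hello world", "hello China")));
        // prints [China, hello, world]
    }
}
```

Note that Hadoop sorts `Text` keys byte-wise, so uppercase letters sort before lowercase ones; that is why "China" precedes "five" in the expected output above, and the `TreeSet` ordering of Java strings matches it.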
(1) Create an input folder to hold the files to be processed.
(2) Package the project_one project containing ex1.java into a jar and copy it to the working directory, as shown in the figure below:
(3) Run project_one.jar and write the results to a new output folder, as shown in the figure below:
(4) View the results, as shown in the figure below:
Program code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ex1 {
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emit each space-delimited word with a placeholder count of 1.
            String[] itr = value.toString().split(" ");
            for (String it : itr) {
                word.set(it);
                context.write(word, one);
            }
        }
    }

    public static class DedupReducer extends Reducer<Text, IntWritable, Text, Text> {
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Each distinct word arrives exactly once as a key, so writing
            // the key with an empty value yields the de-duplicated word list.
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        String[] otherArgs = new String[]{"input", "output"};
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "ex1");     // user-defined job name
        job.setJarByClass(ex1.class);
        job.setMapperClass(TokenizerMapper.class);  // Mapper class for the job
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(DedupReducer.class);    // Reducer class for the job
        job.setOutputKeyClass(Text.class);          // key class of the final output
        job.setOutputValueClass(Text.class);        // value class of the final output
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);            // run the job
    }
}
Part 2. Computing capital gains and losses on stocks
Compute the net gain or loss for each traded stock. (Use the stock name as the key; when the operation is Buy, record the value as the negative of the price, and when it is Sell, as the positive price. These key/value pairs are the map output and the reduce input.)
(1) Write the MapReduce code
(2) Compile and package the project
(3) Run the program with the hadoop jar command
(4) Check the output file from the console
Input:
Leetcode Buy 1000
Corona Buy 10
Leetcode Sell 9000
Handbags Buy 30000
Corona Sell 1010
Corona Buy 1000
Corona Sell 500
Corona Buy 1000
Handbags Sell 7000
Corona Sell 10000
Output:
Corona 9500
Handbags -23000
Leetcode 8000
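The signed-sum logic can be sanity-checked locally with a plain-Java sketch (illustrative only, not part of the Hadoop job; the class and method names are made up): Buy contributes the negative price, Sell the positive price, summed per stock name.

```java
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class GainSketch {
    // Mimics the MapReduce job locally: parse "name operation price"
    // records, negate Buy amounts, and accumulate a signed sum per stock.
    static Map<String, Integer> gains(List<String> records) {
        Map<String, Integer> sums = new TreeMap<>();
        for (String r : records) {
            String[] f = r.split(" ");          // name, Buy|Sell, price
            int v = Integer.parseInt(f[2]);
            if (f[1].equals("Buy")) {
                v = -v;                          // a purchase is an outflow
            }
            sums.merge(f[0], v, Integer::sum);   // accumulate per stock name
        }
        return sums;
    }

    public static void main(String[] args) {
        System.out.println(gains(Arrays.asList("Leetcode Buy 1000", "Leetcode Sell 9000")));
        // prints {Leetcode=8000}
    }
}
```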
The basic steps are the same as above.
Program code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ex2 {
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each record is "name operation price"; Buy counts as a
            // negative amount, Sell as a positive one.
            String[] itr = value.toString().split(" ");
            word.set(itr[0]);
            if (itr[1].equals("Buy")) {
                context.write(word, new IntWritable(-Integer.parseInt(itr[2])));
            } else {
                context.write(word, new IntWritable(Integer.parseInt(itr[2])));
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the signed amounts per stock to get the net gain or loss.
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        String[] otherArgs = new String[]{"input", "output"};
        if (otherArgs.length != 2) {
            System.err.println("Usage: Stocks <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "ex2");     // user-defined job name
        job.setJarByClass(ex2.class);
        job.setMapperClass(TokenizerMapper.class);  // Mapper class for the job
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(IntSumReducer.class);   // Reducer class for the job
        job.setOutputKeyClass(Text.class);          // key class of the final output
        job.setOutputValueClass(IntWritable.class); // value class of the final output
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);            // run the job
    }
}
Run result:
Part 3. Finding users who follow each other
Given, for each user, the list of users who follow them, find every pair of users who follow each other. (Map each follow relation to a normalized pair key such as A<->B with value 1; in the reduce phase, a key whose values sum to 2 was emitted from both sides and therefore represents a mutual follow.)
(1) Write the MapReduce code
(2) Compile and package the project
(3) Run the program with the hadoop jar command
(4) Check the output file from the console
The input data format is as follows:
A<B,C,D,F,E,O
B<A,C,E,K
C<F,A,D,I
D<A,E,F,L
E<B,C,D,M,L
F<A,B,C,D,E,O,M
G<A,C,D,E,F
H<A,C,D,E,O
I<A,O
J<B,O
K<A,C,D
L<D,E,F
M<E,F,G
O<A,H,I,J,K
For example, the first line means that users B, C, D, F, E, and O all follow A. Find all pairs of users who follow each other; the output must not contain duplicates (once A<->B is output, B<->A must not appear). The output format is as follows:
A<->B
A<->C
A<->D
A<->F
A<->O
B<->E
C<->F
D<->E
D<->F
D<->L
E<->L
E<->M
F<->M
H<->O
I<->O
J<->O
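The pair-normalization-and-count idea can be sanity-checked locally with a plain-Java sketch (illustrative only, not part of the Hadoop job; the class and method names are made up): every follow relation is normalized to a "small<->large" pair, and a pair produced by both sides is counted twice.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class MutualSketch {
    // Mimics the MapReduce job locally: each "user<f1,f2,..." line emits
    // one normalized pair per follower, so A<->B and B<->A collapse to
    // the same key; a count of 2 means the two users follow each other.
    static List<String> mutual(List<String> lines) {
        Map<String, Integer> count = new TreeMap<>();
        for (String line : lines) {
            String[] parts = line.split("<");
            String user = parts[0];
            for (String follower : parts[1].split(",")) {
                String pair = user.compareTo(follower) < 0
                        ? user + "<->" + follower
                        : follower + "<->" + user;
                count.merge(pair, 1, Integer::sum);
            }
        }
        List<String> result = new ArrayList<>();
        for (Map.Entry<String, Integer> e : count.entrySet()) {
            if (e.getValue() == 2) {    // seen from both directions
                result.add(e.getKey());
            }
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(mutual(Arrays.asList("A<B,C", "B<A", "C<D")));
        // prints [A<->B]
    }
}
```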
The basic steps are the same as above.
Program code:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ex3 {
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each record is "user<f1,f2,..."; emit one normalized pair key
            // per follower so that A<->B and B<->A collapse to the same key.
            String[] itrs = value.toString().split("<");
            char p = itrs[0].charAt(0);
            for (String str : itrs[1].split(",")) {
                char f = str.charAt(0);
                String pair = (p > f) ? f + "<->" + p : p + "<->" + f;
                context.write(new Text(pair), new IntWritable(1));
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // A pair emitted from both sides appears exactly twice,
            // meaning the two users follow each other.
            int num = 0;
            for (IntWritable it : values) {
                num++;
            }
            if (num == 2) {
                context.write(key, new Text(""));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        String[] otherArgs = new String[]{"input", "output"};
        if (otherArgs.length != 2) {
            System.err.println("Usage: ex3 <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "ex3");     // user-defined job name
        job.setJarByClass(ex3.class);
        job.setMapperClass(TokenizerMapper.class);  // Mapper class for the job
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(IntSumReducer.class);   // Reducer class for the job
        job.setOutputKeyClass(Text.class);          // key class of the final output
        job.setOutputValueClass(Text.class);        // value class of the final output
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}