实验五 MapReduce初级编程实战

最新推荐文章于 2024-05-07 08:39:08 发布

DT-熊

最新推荐文章于 2024-05-07 08:39:08 发布

阅读量635

点赞数

文章标签： mapreduce hadoop hdfs

本文链接：https://blog.csdn.net/qq_62907049/article/details/134230737

版权

（二）编程实现对输入文件的排序
现在有多个输入文件，每个文件中的每行内容均为一个整数。要求读取所有文件中的整数，进行升序排序后，输出到一个新的文件中，输出的数据格式为每行两个整数，第一个数字为第二个整数的排序位次，第二个整数为原待排列的整数。下面是输入文件和输出文件的一个样例供参考。
输入文件 1 的样例如下：
33
37
12
40
输入文件 2 的样例如下：
4
16
39
5
输入文件 3 的样例如下：
1
45
25
根据输入文件 1、2 和 3 得到的输出文件如下：
1 1
2 4
3 5
4 12
5 16
6 25
7 33
8 37
9 39
10 40
11 45

package mapreduce.sortNum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * MapReduce job that reads integers (one per line) from every input file and
 * writes them in ascending order, each prefixed with its 1-based rank.
 *
 * <p>The ranking relies on the framework delivering map output keys to the
 * reducer in sorted order and on the job running with exactly one reduce task
 * (configured in {@link #main(String[])}), so a single counter sees every value.
 */
public class sortNum {

    /**
     * Parses each input line as an integer and emits it as both key and value,
     * so the number itself becomes the sort key.
     */
    public static class sortNumMap extends Mapper<LongWritable, Text,IntWritable,IntWritable>{
        // Reused for every record to avoid allocating a new writable per line.
        private final IntWritable intValue = new IntWritable();

        /**
         * Emits {@code (n, n)} for the integer {@code n} found on the line.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context sink for the map output
         * @throws NumberFormatException if a non-blank line is not a valid integer
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException {
            // Trim so that surrounding whitespace (e.g. a trailing '\r') does not
            // break parsing, and skip blank lines instead of failing the task.
            String line = value.toString().trim();
            if (line.isEmpty()) {
                return;
            }
            intValue.set(Integer.parseInt(line));
            context.write(intValue, intValue);
        }
    }

    /**
     * Assigns consecutive ranks, starting at 1, to the values in the order they
     * are received.
     */
    public static class sortNumReduce extends Reducer<IntWritable,IntWritable,IntWritable,IntWritable>{
        // Running rank; kept across reduce() calls so numbering continues from key to key.
        private int count = 1;
        private final IntWritable counter = new IntWritable();

        /**
         * Writes one {@code (rank, value)} pair per received value; equal numbers
         * therefore receive distinct, consecutive ranks.
         *
         * @param key     the integer shared by all {@code values}
         * @param values  every occurrence of that integer
         * @param context sink for the reduce output
         */
        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values, Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException {
            for (IntWritable value : values) {
                counter.set(count);
                context.write(counter, value);
                count++;
            }
        }
    }

    /**
     * Configures the job, submits it and waits for it to finish. Terminates the
     * JVM with exit status 1 if the job fails.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        System.setProperty("HADOOP_USER_NAME", "root");

        Configuration con = new Configuration();

        // Run on YARN.
        con.set("mapreduce.framework.name", "yarn");
        // Default file system: HDFS.
        con.set("fs.defaultFS", "hdfs://linux01:8020");
        // Location of the ResourceManager.
        con.set("yarn.resourcemanager.hostname", "linux01");
        // Cross-platform flag for submitting the job from Windows.
        con.set("mapreduce.app-submission.cross-platform","true");

        Job job = Job.getInstance(con,"sortNum");

        // Path of the jar that contains the job classes.
        job.setJar("E:\\IdeaProject\\hadoop\\target\\original-hadoop-1.0-SNAPSHOT.jar");

        // Mapper.
        job.setMapperClass(sortNumMap.class);

        // Reducer; a single reduce task so that one counter ranks every value.
        job.setReducerClass(sortNumReduce.class);
        job.setNumReduceTasks(1);

        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce output types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output paths on HDFS.
        FileInputFormat.setInputPaths(job,new Path("/ShiYan/numberIn"));
        FileOutputFormat.setOutputPath(job,new Path("/ShiYan/numberOut"));

        // Alternative: read from the local file system.
        // The input may be a single file or a directory.
//        FileInputFormat.setInputPaths(job,new Path("D:\\大学课程\\大数据技术与原理\\实验二\\numberIn"));
        // The output must be a directory that does not exist yet; the job fails if it already exists.
//        FileOutputFormat.setOutputPath(job,new Path("D:\\大学课程\\大数据技术与原理\\实验二\\numberOut"));

        // Submit the job, wait for it to finish, and surface a failure to the caller.
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            System.exit(1);
        }

    }
}

（三）对给定的表格进行信息挖掘
下面给出一个 child-parent 的表格，要求挖掘其中的父子辈关系，给出祖孙辈关系的
表格。
输入文件内容如下：
child parent
Steven Lucy
Steven Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Frank
Jack Alice
Jack Jesse
David Alice
David Jesse
Philip David
Philip Alma
Mark David
Mark Alma
输出文件内容如下：
grandchild grandparent
Steven Alice
Steven Jesse
Jone Alice
Jone Jesse
Steven Mary
Steven Frank
Jone Mary
Jone Frank
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse

package mapreduce.childAndParent;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;


public class childAndParent_3 {

    public static class cpMap extends Mapper<LongWritable, Text, Text, Text> {
        private static int num = 0;

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (num == 0) {
                ++num;
            } else {
                String[] split = line.split("\\s+");
                String child = split[0];
                String parent = split[1];
                context.write(new Text(parent), new Text(
                        "-" + child));
                context.write(new Text(child), new Text
                        ("+" + parent));
            }

        }
    }

    public static class cpReduce extends Reducer<Text, Text, Text, Text> {
        private static int num = 0;

        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

            if (num == 0) {
                context.write(new Text("grandchild"), new Text("grandparent"));
                ++num;
            }
            ArrayList<Text> grandChild = new ArrayList<>();
            ArrayList<Text> grandParent = new ArrayList<>();

            for (Text val : values) {
                String s = val.toString();

                if (s.startsWith("-")) {
                    grandChild.add(new Text(s.substring(1)));
                } else {
                    grandParent.add(new Text(s.substring(1)));
                }
            }
            for (Text child : grandChild) {
                for (Text parent : grandParent) {
                    context.write(child, parent);
                }
            }


        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        System.setProperty("HADOOP_USER_NAME", "root");

        Configuration con = new Configuration();

        //配置到yarn上执行
        con.set("mapreduce.framework.name", "yarn");
        //配置操作HDFS数据
        con.set("fs.defaultFS", "hdfs://linux01:8020");
        //配置resourceManager位置
        con.set("yarn.resourcemanager.hostname", "linux01");
        //配置mr程序运行在windows上的跨平台参数
        con.set("mapreduce.app-submission.cross-platform", "true");

        Job job = Job.getInstance(con, "childAndParent");

        //设置jar包的路径
        job.setJar("E:\\IdeaProject\\hadoop\\target\\original-hadoop-1.0-SNAPSHOT.jar");


        //设置Mapper
        job.setMapperClass(cpMap.class);


        //设置Reducer
        job.setReducerClass(cpReduce.class);
        job.setNumReduceTasks(1);

        //设置map的输出类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        //设置reducer输出类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //设置读取HDFS上的文件 的路径
        FileInputFormat.setInputPaths(job, new Path("/ShiYan/child_parent_In"));
        FileOutputFormat.setOutputPath(job, new Path("/ShiYan/child_parent_Out"));

        //提交任务 并等待任务结束
        job.waitForCompletion(true);
        job.close();

    }
}