1.在hdfs目录/tmp/input/wordcount目录中有一系列文件,内容为","号分隔,分隔后的元素均为数值类型、字母、中文,求数值类型、字母类型、中文类型各自的次数
/**
 * Problem 1: counts how many comma-separated tokens in the input are
 * numeric ("数字"), Chinese ("中文字符") or alphabetic ("字母类型").
 * Token classification is delegated to the project helper {@code TypeUtil};
 * input is specified to contain only those three kinds of token.
 */
public class Fileone {

    /** Emits (type-label, 1) for every comma-separated token of each input line. */
    static class FileoneMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final Text word = new Text();
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString(), ",");
            while (tokens.hasMoreTokens()) {
                String item = tokens.nextToken();
                if (TypeUtil.isNumeric(item)) {
                    word.set("数字");
                } else if (TypeUtil.isChineseStr(item)) {
                    word.set("中文字符");
                } else {
                    // Input is specified to hold only digits/letters/Chinese,
                    // so everything else is counted as letters.
                    word.set("字母类型");
                }
                context.write(word, one);
            }
        }
    }

    /** Sums the per-type counts; also reused as the combiner. */
    static class FileoneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable total = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: Fileone <input dir> <output dir>");
            System.exit(2);
        }
        String output = args[1];
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "zrf");
        job.setJarByClass(Fileone.class);
        job.setMapperClass(FileoneMapper.class);
        // Summation is associative and commutative, so the reducer doubles as
        // a combiner to cut shuffle traffic.
        job.setCombinerClass(FileoneReducer.class);
        job.setReducerClass(FileoneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // FilePathUtil.readFile presumably collects the input file paths under
        // args[0] — project helper, not visible here; TODO confirm its contract.
        List<Path> inputs = FilePathUtil.readFile(args[0]);
        for (Path path : inputs) {
            FileInputFormat.addInputPath(job, path);
        }
        FileOutputFormat.setOutputPath(job, new Path(output));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
用到的判断类型的工具类
package com.zrf.MapDriver;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * String-classification helpers for the word-type counting job.
 * Patterns are compiled once and reused ({@link Pattern} is thread-safe).
 */
public class TypeUtil {
    // "+" (not "*") so the empty string is never reported as a match —
    // the old "[0-9]*" / "^[a-zA-Z]*" both matched "".
    private static final Pattern NUMERIC = Pattern.compile("[0-9]+");
    private static final Pattern ENGLISH = Pattern.compile("[a-zA-Z]+");
    private static final Pattern CHINESE = Pattern.compile("[\u4e00-\u9fa5]+");

    /** Returns true when {@code str} is one or more ASCII digits. */
    public static boolean isNumeric(String str) {
        return NUMERIC.matcher(str).matches();
    }

    /** Returns true when the whole string is one or more ASCII letters. */
    public static boolean isEnglishStr(String charaString) {
        return ENGLISH.matcher(charaString).matches();
    }

    /**
     * Returns true when the whole string consists of CJK characters.
     * Bug fix: the old version used find(), so mixed strings such as
     * "abc中" were wrongly classified as Chinese.
     */
    public static boolean isChineseStr(String str) {
        return CHINESE.matcher(str).matches();
    }
}
工具方法 readFile2:读取单个文本文件的内容,逐行读取并用","拼接成一个字符串返回(读取文件夹及其子文件夹内文件路径的逻辑在 FilePathUtil.readFile 中,未在此处展示)
/**
 * Reads a UTF-8 text file and joins its lines with "," into one string.
 *
 * @param path local filesystem path of the file to read
 * @return all lines joined by ","; the empty string for an empty file
 * @throws Exception if the file cannot be opened or read
 */
public static String readFile2(String path) throws Exception {
    // try-with-resources: the old version never closed either stream,
    // leaking a file handle on every call (and on every exception).
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
        StringBuilder result = new StringBuilder();
        String line;
        boolean first = true;
        while ((line = br.readLine()) != null) {
            if (first) {
                first = false;
            } else {
                result.append(",");
            }
            result.append(line);
        }
        return result.toString();
    }
}
2.在hdfs目录/tmp/tl/input/wordcount目录中有一系列文件,内容为","号分隔,同时在hdfs路径/tmp/tl/black.txt黑名单文件,一行一个单词用于存放不记入统计的单词列表。求按","号分隔的各个元素去除掉黑名单后的出现频率,输出到目录/tmp/tl/output/个人用户名的hdfs目录中。
/**
 * Problem 2: word frequency over comma-separated input, excluding every word
 * listed in a blacklist file whose name starts with "black" (one word per
 * line). Blacklisted words are marked in the map phase with a sentinel count
 * of 0; the reducer drops any key that saw the sentinel.
 *
 * Structural fix: in the original, a misplaced closing brace nested
 * FileTwoReducer and main() INSIDE FileTwoMapper; they are now direct
 * members of FileTwo.
 */
public class FileTwo {

    /** Emits (word, 1) for data-file tokens and (word, 0) for blacklist lines. */
    static class FileTwoMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        // Sentinel meaning "this word is blacklisted". Using two constants
        // fixes the original bug where one reused IntWritable, once set to 0,
        // stayed 0 for every later record of the task.
        private static final IntWritable ZERO = new IntWritable(0);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String filename = ((FileSplit) context.getInputSplit()).getPath().getName();
            if (filename.startsWith("black")) {
                // Blacklist file: one word per line.
                word.set(value.toString());
                context.write(word, ZERO);
            } else {
                StringTokenizer tokens = new StringTokenizer(value.toString(), ",");
                while (tokens.hasMoreTokens()) {
                    word.set(tokens.nextToken());
                    context.write(word, ONE);
                }
            }
        }
    }

    /** Sums counts per word; suppresses any word flagged by the 0 sentinel. */
    static class FileTwoReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable total = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                if (val.get() == 0) {
                    // Blacklisted word: emit nothing for this key.
                    return;
                }
                sum += val.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration configuration = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(configuration, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        if (remainingArgs.length < 2) {
            System.err.println("Usage: FileTwo <input dir (incl. blacklist)> <output dir>");
            System.exit(2);
        }
        Job job = Job.getInstance(configuration, "zrf");
        job.setJarByClass(FileTwo.class);
        job.setMapperClass(FileTwoMapper.class);
        // Deliberately no combiner: the 0 sentinel must reach the reducer intact.
        job.setReducerClass(FileTwoReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // FilePathUtil.readFile presumably collects the input file paths under
        // remainingArgs[0] — project helper, not visible here; TODO confirm.
        List<Path> inputs = FilePathUtil.readFile(remainingArgs[0]);
        for (Path path : inputs) {
            FileInputFormat.addInputPath(job, path);
        }
        FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.在hdfs目录/tmp/table/student中存在student.txt文件,按tab分隔,字段名为(学号,姓名,课程号,班级名称),hdfs目录/tmp/table/student_location中存在student_location.txt文件,按tab分隔,字段名为(学号,省份,城市,区名),在Map任务中用student_location.txt文件中的学号过滤student.txt中的学号字段,输出student.txt中的存在交集的记录,输出结果结构按tab分隔后的四个字段为(学号,姓名,课程号,班级名称,省份,城市)。
/**
 * Problem 3: reduce-side join of student.txt (学号,姓名,课程号,班级名称) with
 * student_location.txt (学号,省份,城市,区名), both tab-separated. For every
 * 学号 present in both files, outputs: 学号 TAB 姓名 TAB 课程号 TAB 班级名称
 * TAB 省份 TAB 城市.
 */
public class Test {

    /** Tags each record with its source table, keyed by 学号 (field 0). */
    private static class TokenCountermapper33 extends Mapper<Object, Text, Text, Text> {
        private final Text word = new Text();
        private final Text word1 = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
            // Spec says the files are tab-separated; the old code split on ",".
            String[] str = value.toString().split("\t");
            // Bug fix: "student_location.txt" ALSO starts with "student", so
            // the location file must be recognised first.
            if (fileName.startsWith("student_location")) {
                word.set(str[0]);
                word1.set("locat\t" + str[1] + "\t" + str[2] + "\t" + str[3]);
            } else {
                word.set(str[0]);
                word1.set("student\t" + str[1] + "\t" + str[2] + "\t" + str[3]);
            }
            context.write(word, word1);
        }
    }

    /** Joins the tagged records; emits only 学号 that appear in BOTH files. */
    private static class IntSumReducer33 extends Reducer<Text, Text, Text, Text> {
        private final Text text1 = new Text();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String name = null, course = null, clazz = null;
            String province = null, city = null;
            for (Text val : values) {
                String[] a = val.toString().split("\t");
                if ("student".equals(a[0])) {
                    name = a[1];
                    course = a[2];
                    clazz = a[3]; // bug fix: old code stored a[2] for 班级名称 too
                } else {
                    province = a[1];
                    city = a[2];
                    // a[3] (区名) is not part of the required output
                }
            }
            // Require one record from EACH side; the old "count > 1" test also
            // fired for duplicate records within a single file.
            if (name != null && province != null) {
                text1.set(name + "\t" + course + "\t" + clazz + "\t" + province + "\t" + city);
                context.write(key, text1);
            }
        }
    }

    /** Routes even 学号 to reducer 0 and odd 学号 to reducer 1. */
    private static class MyPartitions extends Partitioner<Text, Text> {
        @Override
        public int getPartition(Text key, Text value, int reducesNum) {
            if (reducesNum < 2) {
                return 0; // single reducer: everything goes to partition 0
            }
            return Integer.parseInt(key.toString()) % 2 == 0 ? 0 : 1;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "Work");
        job.setJarByClass(Test.class);
        job.setMapperClass(TokenCountermapper33.class);
        job.setReducerClass(IntSumReducer33.class);
        job.setPartitionerClass(MyPartitions.class);
        // Replaces the deprecated "mapred.reduce.tasks" property.
        job.setNumReduceTasks(2);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileInputFormat.addInputPath(job, new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4.(已排序好文本文件的分组-流式分组)给定一个本地文本文件finance_record_sorted.txt,共2个字段(工号,报销费用),其中按工号升序排列,并用tab分隔。求对该数据进行按工号字段的分组,
/**
 * Problem 4: streaming group-by over a text file already sorted by its first
 * column. Each line is "&lt;key&gt;&lt;whitespace&gt;&lt;value&gt;"; consecutive lines with the
 * same key are collected and printed as "&lt;key&gt; [v1, v2, ...]".
 *
 * Generalized: the input path may be passed as args[0]; the original
 * hard-coded path remains the fallback for backward compatibility.
 */
public class Test2 {
    public static void main(String[] args) throws IOException {
        String path = args.length > 0 ? args[0]
                : "E:\\ideaxiangmu\\MapReduce\\data\\text1.txt";
        // try-with-resources + explicit UTF-8: the old version leaked both
        // streams on an I/O error and used the platform default charset.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            String currentKey = null;
            List<String> group = new ArrayList<>();
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\\s+");
                if (currentKey == null || currentKey.equals(fields[0])) {
                    // First line, or still inside the current group.
                    currentKey = fields[0];
                    group.add(fields[1]);
                } else {
                    // Key changed: flush the finished group, start the next.
                    System.out.println(currentKey + " " + group);
                    group.clear();
                    currentKey = fields[0];
                    group.add(fields[1]);
                }
            }
            // Flush the last group. Guard fixes the old behaviour of printing
            // "null []" for an empty input file.
            if (currentKey != null) {
                System.out.println(currentKey + " " + group);
            }
        }
    }
}