MapReduce Basics Test (2)

Given the source data file below, write a MapReduce program for each of the following requirements.

hello#world#hadoop
hive#sqoop#xxx#flume#hello
hdfs#hive#yyy#world
hadoop#yyy#spark#flink
flink#hello#xxx#sqoop#tom
hdfs#tom#hive#hadoop

Requirement 1: Remove the words xxx and yyy from the source file

Code implementation:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class TTest extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new TTest(), args);
        System.exit(status);
    }

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "test26");
        job.setJarByClass(TTest.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, new Path("C:\\Users\\User\\Desktop\\test26\\test26.txt"));

        job.setMapperClass(MapWordCount.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

//        job.setPartitionerClass(HashPartitioner.class);
//        job.setSortComparatorClass(null);
//        job.setGroupingComparatorClass(null);
//        job.setCombinerClass(null);

//        job.setReducerClass(ReduceWordCount.class);
//        job.setOutputKeyClass(Text.class);
//        job.setOutputValueClass(IntWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("C:\\Users\\User\\Desktop\\test26\\test01");
        FileSystem fs = FileSystem.get(this.getConf());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        TextOutputFormat.setOutputPath(job, path);

        //job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : -1;
    }

    public static class MapWordCount extends Mapper<LongWritable, Text, Text, NullWritable> {
        Text outputKey = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Remove the words xxx and yyy, then collapse the doubled "#" left behind by the removal
            String cleaned = value.toString().replace("xxx", "").replace("yyy", "").replace("##", "#");
            outputKey.set(cleaned);
            context.write(outputKey, NullWritable.get());
        }
    }

//    public static class ReduceWordCount extends Reducer<Text, IntWritable, Text, IntWritable> {
//        Text outputKey = new Text();
//        IntWritable outputValue = new IntWritable();
//        @Override
//        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
//            int sum=0;
//            for (IntWritable value : values) {
//                sum+=value.get();
//            }
//            outputKey.set(key);
//            outputValue.set(sum);
//            context.write(outputKey,outputValue);
//        }
//    }

}
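
Because no reducer class is set and the number of reduce tasks defaults to 1, the job above still shuffles through Hadoop's identity reducer, so the filtered lines come out sorted by key rather than in their original order. A minimal sketch of a map-only variant (same mapper and paths assumed) that skips the shuffle entirely:

        // Inside run(), after configuring the mapper: declare the job map-only.
        // With zero reduce tasks there is no shuffle/sort, and the mapper's
        // output is written directly to part-m-* files in original line order.
        job.setMapperClass(MapWordCount.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0);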

Requirement 2: Write words with length >= 5 to one output file and all remaining words to another

Code implementation:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class TTest02 extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new TTest02(), args);
        System.exit(status);
    }

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "test26");
        job.setJarByClass(TTest02.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, new Path("C:\\Users\\User\\Desktop\\test26\\test26.txt"));

        job.setMapperClass(MapWordCount.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setPartitionerClass(MRPartition.class);
//        job.setSortComparatorClass(null);
//        job.setGroupingComparatorClass(null);
//        job.setCombinerClass(null);

        job.setReducerClass(ReduceWordCount.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("C:\\Users\\User\\Desktop\\test26\\test02");
        FileSystem fs = FileSystem.get(this.getConf());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        TextOutputFormat.setOutputPath(job, path);
        job.setNumReduceTasks(2);
        return job.waitForCompletion(true) ? 0 : -1;
    }

    public static class MapWordCount extends Mapper<LongWritable, Text, Text, NullWritable> {
        Text outputKey=new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line on "#" and emit each word as a map output key
            String[] words = value.toString().split("#");
            for (String word : words) {
                outputKey.set(word);
                context.write(outputKey, NullWritable.get());
            }
        }
    }
    public static class MRPartition extends Partitioner<Text,NullWritable>{
        @Override
        public int getPartition(Text k2, NullWritable v2, int numPartitions) {
            // Words with length >= 5 go to partition 0, shorter words to partition 1
            String word = k2.toString();
            if (word.length() >= 5) {
                return 0;
            } else {
                return 1;
            }
        }
    }
    public static class ReduceWordCount extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            // Each distinct word is written once to the output file of its partition
            context.write(key, NullWritable.get());
        }
    }

}
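
The partitioning rule can be sanity-checked locally without running the whole job. Below is a quick check using a hypothetical helper class (assuming TTest02 is on the classpath); the expected partition numbers are shown in the comments:

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        TTest02.MRPartition partitioner = new TTest02.MRPartition();
        // Words with length >= 5 go to partition 0 (reducer 0's output file)
        System.out.println(partitioner.getPartition(new Text("hadoop"), NullWritable.get(), 2)); // 0
        System.out.println(partitioner.getPartition(new Text("sqoop"), NullWritable.get(), 2));  // 0
        // Shorter words go to partition 1
        System.out.println(partitioner.getPartition(new Text("hive"), NullWritable.get(), 2));   // 1
        System.out.println(partitioner.getPartition(new Text("tom"), NullWritable.get(), 2));    // 1
    }
}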

Requirement 3: Count how many times each word appears in the file

Code implementation:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;


public class TTest03 extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int status = ToolRunner.run(conf, new TTest03(), args);
        System.exit(status);
    }

    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "test26");
        job.setJarByClass(TTest03.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, new Path("C:\\Users\\User\\Desktop\\test26\\test26.txt"));

        job.setMapperClass(MapWordCount.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

//        job.setPartitionerClass(HashPartitioner.class);
//        job.setSortComparatorClass(null);
//        job.setGroupingComparatorClass(null);
//        job.setCombinerClass(null);

        job.setReducerClass(ReduceWordCount.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        Path path = new Path("C:\\Users\\User\\Desktop\\test26\\test03");
        FileSystem fs = FileSystem.get(this.getConf());
        if (fs.exists(path)) {
            fs.delete(path, true);
        }
        TextOutputFormat.setOutputPath(job, path);

        //job.setNumReduceTasks(1);
        return job.waitForCompletion(true) ? 0 : -1;
    }

    public static class MapWordCount extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text outputKey = new Text();
        IntWritable outputValue = new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Emit (word, 1) for every word on the line
            String[] words = value.toString().split("#");
            for (String word : words) {
                outputKey.set(word);
                context.write(outputKey, outputValue);
            }
        }
    }

    public static class ReduceWordCount extends Reducer<Text, IntWritable, Text, IntWritable> {
        Text outputKey = new Text();
        IntWritable outputValue = new IntWritable();
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the 1s emitted by the mapper to get the total count for this word
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputKey.set(key);
            outputValue.set(sum);
            context.write(outputKey, outputValue);
        }
    }

}
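
Because ReduceWordCount's input and output types are both (Text, IntWritable) and summing partial counts is safe, the same class can also be registered as a combiner to pre-aggregate on the map side. A small sketch of the extra configuration, assuming it is added inside run():

        // In run(), next to job.setReducerClass(ReduceWordCount.class):
        // the reducer doubles as a combiner, so each map task emits partial
        // sums instead of one (word, 1) pair per occurrence, reducing shuffle volume.
        job.setCombinerClass(ReduceWordCount.class);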
