MapReduce Basics Test (2)
Given the following source data file (one '#'-separated list of words per line), write a MapReduce program for each requirement.
hello#world#hadoop
hive#sqoop#xxx#flume#hello
hdfs#hive#yyy#world
hadoop#yyy#spark#flink
flink#hello#xxx#sqoop#tom
hdfs#tom#hive#hadoop
Requirement 1: remove the words xxx and yyy from the source file
Code implementation:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
public class TTest extends Configured implements Tool {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int status = ToolRunner.run(conf, new TTest(), args);
System.exit(status);
}
public int run(String[] args) throws Exception {
Job job = Job.getInstance(this.getConf(), "test26");
job.setJarByClass(TTest.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, new Path("C:\\Users\\User\\Desktop\\test26\\test26.txt"));
job.setMapperClass(MapWordCount.class);
// Map-only job: the mapper's output types are also the job's final output types.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path path = new Path("C:\\Users\\User\\Desktop\\test26\\test01");
FileSystem fs = FileSystem.get(this.getConf());
if (fs.exists(path)) {
fs.delete(path, true);
}
TextOutputFormat.setOutputPath(job, path);
// A pure filter needs no reducer; zero reduce tasks skips the shuffle and keeps the original line order.
job.setNumReduceTasks(0);
return job.waitForCompletion(true) ? 0 : -1;
}
public static class MapWordCount extends Mapper<LongWritable, Text, Text, NullWritable> {
Text outputKey = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Split on '#' and drop the target words; a plain String.replace would
// leave empty "##" gaps behind (e.g. "hive#sqoop##flume#hello") and would
// also mangle any word that merely contains "xxx" or "yyy".
StringBuilder line = new StringBuilder();
for (String word : value.toString().split("#")) {
if (word.equals("xxx") || word.equals("yyy")) continue;
if (line.length() > 0) line.append('#');
line.append(word);
}
outputKey.set(line.toString());
context.write(outputKey, NullWritable.get());
}
}
}
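Expected result: with the sample input and the map-only setup above, the output file (part-m-00000) should contain the input lines with xxx and yyy dropped, in their original order:
hello#world#hadoop
hive#sqoop#flume#hello
hdfs#hive#world
hadoop#spark#flink
flink#hello#sqoop#tom
hdfs#tom#hive#hadoop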
Requirement 2: write the words with length >= 5 to one output file and the remaining words to another
Code implementation:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
public class TTest02 extends Configured implements Tool {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int status = ToolRunner.run(conf, new TTest02(), args);
System.exit(status);
}
public int run(String[] args) throws Exception {
Job job = Job.getInstance(this.getConf(), "test26");
job.setJarByClass(TTest02.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, new Path("C:\\Users\\User\\Desktop\\test26\\test26.txt"));
job.setMapperClass(MapWordCount.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setPartitionerClass(MRPartition.class);
job.setReducerClass(ReduceWordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path path = new Path("C:\\Users\\User\\Desktop\\test26\\test02");
FileSystem fs = FileSystem.get(this.getConf());
if (fs.exists(path)) {
fs.delete(path, true);
}
TextOutputFormat.setOutputPath(job, path);
job.setNumReduceTasks(2);
return job.waitForCompletion(true) ? 0 : -1;
}
public static class MapWordCount extends Mapper<LongWritable, Text, Text, NullWritable> {
Text outputKey=new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Emit each '#'-separated word; the custom partitioner routes it by length.
String[] split = value.toString().split("#");
for (String s : split) {
outputKey.set(s);
context.write(outputKey, NullWritable.get());
}
}
}
// Words with length >= 5 go to reducer 0 (part-r-00000), the rest to
// reducer 1 (part-r-00001); this requires numReduceTasks to be exactly 2.
public static class MRPartition extends Partitioner<Text,NullWritable>{
@Override
public int getPartition(Text k2, NullWritable v2, int numReduceTasks) {
String s = k2.toString();
if (s.length()>=5){
return 0;
}else {
return 1;
}
}
}
public static class ReduceWordCount extends Reducer<Text, NullWritable, Text, NullWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
// Note: each distinct word is written once per group, so duplicate
// occurrences across input lines collapse into a single output line.
context.write(key, NullWritable.get());
}
}
}
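Expected result: because the reducer writes each key once per group, every distinct word appears on a single line, sorted within its partition:
part-r-00000 (length >= 5): flink, flume, hadoop, hello, spark, sqoop, world
part-r-00001 (length < 5): hdfs, hive, tom, xxx, yyy
As an aside, the same routing can be done without tying file placement to reducer numbers. The sketch below is an alternative, not the code above: the names RouteReducer, "long", and "short" are made up for illustration, and it uses Hadoop's MultipleOutputs (which writes files named like long-r-00000) from a single reducer, reusing the imports above plus org.apache.hadoop.mapreduce.lib.output.MultipleOutputs:
public static class RouteReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
private MultipleOutputs<Text, NullWritable> out;
@Override
protected void setup(Context context) {
out = new MultipleOutputs<>(context);
}
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
// Route by word length to a named output instead of a partition number.
out.write(key.toString().length() >= 5 ? "long" : "short", key, NullWritable.get());
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
out.close();
}
}
// In run(), register both named outputs (numReduceTasks can then stay at 1):
// MultipleOutputs.addNamedOutput(job, "long", TextOutputFormat.class, Text.class, NullWritable.class);
// MultipleOutputs.addNamedOutput(job, "short", TextOutputFormat.class, Text.class, NullWritable.class);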
Requirement 3: count how many times each word appears in the file
Code implementation:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
public class TTest03 extends Configured implements Tool {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int status = ToolRunner.run(conf, new TTest03(), args);
System.exit(status);
}
public int run(String[] args) throws Exception {
Job job = Job.getInstance(this.getConf(), "test26");
job.setJarByClass(TTest03.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, new Path("C:\\Users\\User\\Desktop\\test26\\test26.txt"));
job.setMapperClass(MapWordCount.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// The reduce logic is a pure sum (associative and commutative), so the same
// class could also be registered as a combiner to shrink the shuffle:
// job.setCombinerClass(ReduceWordCount.class);
job.setReducerClass(ReduceWordCount.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
Path path = new Path("C:\\Users\\User\\Desktop\\test26\\test03");
FileSystem fs = FileSystem.get(this.getConf());
if (fs.exists(path)) {
fs.delete(path, true);
}
TextOutputFormat.setOutputPath(job, path);
//job.setNumReduceTasks(1);
return job.waitForCompletion(true) ? 0 : -1;
}
public static class MapWordCount extends Mapper<LongWritable, Text, Text, IntWritable> {
Text outputKey = new Text();
IntWritable outputValue = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Emit (word, 1) for every '#'-separated token on the line.
String[] split = value.toString().split("#");
for (String s : split) {
outputKey.set(s);
context.write(outputKey,outputValue);
}
}
}
public static class ReduceWordCount extends Reducer<Text, IntWritable, Text, IntWritable> {
Text outputKey = new Text();
IntWritable outputValue = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// Sum the 1s emitted by the mappers for this word.
int sum = 0;
for (IntWritable value : values) {
sum+=value.get();
}
outputKey.set(key);
outputValue.set(sum);
context.write(outputKey,outputValue);
}
}
}
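Expected result with the sample input (TextOutputFormat separates key and value with a tab; keys arrive at the reducer sorted):
flink	2
flume	1
hadoop	3
hdfs	2
hello	3
hive	3
spark	1
sqoop	2
tom	2
world	2
xxx	2
yyy	2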