Input data:
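For example, three small space-delimited text files like these (a hypothetical sample, chosen to be consistent with the expected output below):

a.txt: hadoop spark hadoop flink hadoop
b.txt: hadoop hive hadoop
c.txt: hadoop hbase hadoop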
Expected output format:
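One line per word, listing each file together with that word's count in it, e.g.:

hadoop	c.txt-->2	b.txt-->2	a.txt-->3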
Implementation approach:
Part 1:
- In the Map phase, get the name of the file each record comes from
- In the map method, concatenate each word from the split line with the file name to form the key; the value is always 1; write out in a loop
- The Reduce phase then receives data in the format:
word+"-"+filename 1
word+"-"+filename 1
word+"-"+filename 1
word2+"-"+filename 1
word2+"-"+filename 1
word2+"-"+filename 1
- Sum the values of identical keys to get each word's count within a single file, then write out in the form: word+"-"+filename count
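With the hypothetical sample files above, the first job would therefore produce lines such as:

hadoop-a.txt	3
hadoop-b.txt	2
hadoop-c.txt	2
spark-a.txt	1
...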
Part 2:
- Use the result of the first job as the input data
- Split each line on "-" to separate the word from the file name and its count
- Write the data out, with the word as key and the remaining data as value
- The reducer re-concatenates the values it receives into the output format: hadoop c.txt-->2 b.txt-->2 a.txt-->3 (traced below)
- Write two Driver classes, one per job
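Tracing one word through the second job, assuming the hypothetical job-1 output above: the mapper turns the line "hadoop-a.txt\t3" into key "hadoop" and value "a.txt\t3" (the tab survives inside the value); the reducer then gathers every value for "hadoop", replaces each tab with "-->", and writes:

hadoop	c.txt-->2	b.txt-->2	a.txt-->3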
Code implementation:
Custom Mapper 1
package com.mr.njob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
* @author kate
*/
public class NJobMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    Text k = new Text();
    IntWritable n = new IntWritable(1);
    String name;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Runs once per split: remember the name of the file this split comes from
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        name = inputSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into words and emit "word-filename" with a count of 1
        String[] split = value.toString().split(" ");
        for (String s : split) {
            k.set(s + "-" + name);
            context.write(k, n);
        }
    }
}
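One caveat worth knowing: the (FileSplit) cast in setup assumes the default TextInputFormat, whose splits really are FileSplits; with an input format such as CombineTextInputFormat it would throw a ClassCastException. A defensive sketch of setup (requires import org.apache.hadoop.mapreduce.InputSplit;):

@Override
protected void setup(Context context) {
    InputSplit split = context.getInputSplit();
    if (split instanceof FileSplit) {
        // Plain FileSplit: take the source file name directly
        name = ((FileSplit) split).getPath().getName();
    }
}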
Custom Reducer 1
package com.mr.njob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @author kate
*/
public class NJobReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all the 1s for this "word-filename" key
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
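Since this reducer only sums integers, an associative and commutative operation, it could also be registered as a combiner in the driver below to cut shuffle traffic; a one-line addition (a suggestion, not part of the original code):

job.setCombinerClass(NJobReduce.class);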
Custom Driver 1
package com.mr.njob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @author kate
*/
public class NJobDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(NJobDriver.class);
        job.setMapperClass(NJobMapper.class);
        job.setReducerClass(NJobReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        Path input = new Path("src/main/resources/input/njob/");
        Path output = new Path("src/main/resources/output/njob");
        FileInputFormat.addInputPath(job, input);
        // Reuse the job's Configuration rather than building a second one
        FileSystem system = FileSystem.get(conf);
        if (system.exists(output)) {
            system.delete(output, true);
            System.out.println("Existing output path deleted");
        }
        FileOutputFormat.setOutputPath(job, output);
        boolean b = job.waitForCompletion(true);
        if (!b) {
            // Job 1 failed: stop here with a non-zero exit status
            System.exit(1);
        } else {
            // Job 1 succeeded: launch job 2, which reads job 1's output
            boolean b1 = new NJob2Driver().fun2();
            System.exit(b1 ? 0 : 1);
        }
    }
}
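Chaining by calling NJob2Driver from inside the first driver works fine. For reference, Hadoop also ships a helper for declaring such job dependencies; a minimal sketch using org.apache.hadoop.mapreduce.lib.jobcontrol, assuming job1 and job2 are the two configured Job objects from the drivers in this post:

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

public static void runChained(Job job1, Job job2) throws Exception {
    ControlledJob cj1 = new ControlledJob(job1.getConfiguration());
    cj1.setJob(job1);
    ControlledJob cj2 = new ControlledJob(job2.getConfiguration());
    cj2.setJob(job2);
    // job2 starts only after job1 succeeds
    cj2.addDependingJob(cj1);

    JobControl control = new JobControl("njob-chain");
    control.addJob(cj1);
    control.addJob(cj2);

    // JobControl is a Runnable that schedules the jobs as their dependencies complete
    new Thread(control).start();
    while (!control.allFinished()) {
        Thread.sleep(500);
    }
    control.stop();
}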
Custom Mapper 2
package com.mr.njob;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @author kate
*/
public class NJob2Mapper extends Mapper<LongWritable, Text, Text, Text> {

    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one line, e.g. "hadoop-a.txt\t3"
        String line = value.toString();
        // 2. Split on "-": fields[0] is the word, fields[1] is "filename\tcount"
        String[] fields = line.split("-");
        k.set(fields[0]);
        v.set(fields[1]);
        // 3. Write out: key = word, value = "filename\tcount"
        context.write(k, v);
    }
}
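A quick check of how that split behaves on a job-1 output line (note this parsing assumes neither the words nor the file names contain a "-"):

String line = "hadoop-a.txt\t3";
String[] fields = line.split("-");
// fields[0] = "hadoop"
// fields[1] = "a.txt\t3" -- the tab survives and is rewritten to "-->" in the reducer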
Custom Reducer 2
package com.mr.njob;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class NJob2Reduce extends Reducer<Text, Text, Text, Text> {

    Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Incoming values for key "hadoop" look like: "a.txt\t3", "b.txt\t2", "c.txt\t2"
        // Desired output line: hadoop  c.txt-->2  b.txt-->2  a.txt-->3
        StringBuilder sb = new StringBuilder();
        // 1. Concatenate, turning each "filename\tcount" into "filename-->count"
        for (Text value : values) {
            sb.append(value.toString().replace("\t", "-->")).append("\t");
        }
        v.set(sb.toString());
        // 2. Write out
        context.write(key, v);
    }
}
Custom Driver 2
package com.mr.njob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* @author kate
*/
public class NJob2Driver {
    public boolean fun2() throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf2 = new Configuration();
        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(NJob2Driver.class);
        job2.setMapperClass(NJob2Mapper.class);
        job2.setReducerClass(NJob2Reduce.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        Path input2 = new Path("src/main/resources/output/njob/part-r-00000");
        Path output2 = new Path("src/main/resources/output/njob/2");
        FileInputFormat.addInputPath(job2, input2);
        FileSystem system2 = FileSystem.get(conf2);
        if (system2.exists(output2)) {
            system2.delete(output2, true);
            System.out.println("Existing output path deleted");
        }
        FileOutputFormat.setOutputPath(job2, output2);
        return job2.waitForCompletion(true);
    }
}
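One robustness note: pointing input2 at part-r-00000 assumes the first job ran with a single reduce task. Pointing it at the whole job-1 output directory also works, since FileInputFormat's default filter skips files whose names start with "_" or "." (such as _SUCCESS), as long as output2 is then moved outside that directory, e.g.:

Path input2 = new Path("src/main/resources/output/njob/");
Path output2 = new Path("src/main/resources/output/njob2");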
Results:
Part 1:
Part 2: