Index Case: Chaining Two MapReduce Jobs
There are three files, a.html, b.html, and c.html, each containing words, as listed below. We need to compute how many times each word appears in each file, producing output in this format: hello a.html-4 b.html-8 c.html-10
a.html
hello tom
hello jim
hello kitty
hello rose
b.html
hello jerry
hello jim
hello kitty
hello jack
c.html
hello jerry
hello java
hello c++
hello c++
hello hello
Requirement analysis:
The target output, e.g. hello a.html-4 b.html-8 c.html-10, makes it clear that the word is the final key and the value is a string, concatenated from each file name and that file's aggregated count.
What we can compute easily is the total number of occurrences of each word across all files, e.g. hello 10, java 20, ...
Clearly a single MapReduce job cannot build this output directly, because two separate aggregations are needed: first counting per word-per-file, then regrouping those counts per word. So we chain two MapReduce jobs.
First MapReduce job
1) map: emit word-filename as the key and a count of 1 as the value, sent to the reduce side
2) reduce: aggregate the counts per word-filename key, e.g.:
Hello-a.html 10
Hello-b.html 20
Hello-c.html 30
Second MapReduce job
1) map: read the output files of the first job and split each line; for Hello-a.html 10, emit Hello as the key and the concatenation a.html-10 as the value
2) reduce: receives data shaped like Hello -> (a.html-10, b.html-20, c.html-30); loop over the iterator and concatenate the values into the final result. A worked trace of both jobs follows.
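To make the data flow concrete, here is how the word hello moves through both jobs. The counts are derived from the sample files above: hello appears 4 times in a.html, 4 times in b.html, and 6 times in c.html.

Job 1 map output:    (hello-a.html, 1) x4   (hello-b.html, 1) x4   (hello-c.html, 1) x6
Job 1 reduce output: hello-a.html 4   hello-b.html 4   hello-c.html 6
Job 2 map output:    (hello, a.html-4)   (hello, b.html-4)   (hello, c.html-6)
Job 2 reduce output: hello a.html-4 b.html-4 c.html-6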
First MapReduce program
The key to obtaining the file name: override the setup method of the parent class
String fileName = null;

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    // The input split assigned to this map task knows which file it reads
    FileSplit fs = (FileSplit) context.getInputSplit();
    fileName = fs.getPath().getName();
}
setup runs once per map task, before the loop that feeds records to map, so within a single map task fileName always refers to the same file.
The full MapReduce program:
package cn.doit19.hadoop.review.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author:tom
 * @Date:Created in 19:56 2020/11/17
 */
@SuppressWarnings("all")
public class Index1 {

    // First job of the chained index case
    static class Index1Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        Text k = new Text();
        String fileName = null;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Runs once per map task, before any call to map()
            FileSplit fs = (FileSplit) context.getInputSplit();
            fileName = fs.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // Called once per input line, e.g. "hello tom"
                String line = value.toString();
                String[] words = line.split("\\s+"); // {hello, tom}
                for (String word : words) {
                    // Key is word-filename, e.g. "hello-a.html"
                    k.set(word + "-" + fileName);
                    // Emit a count of 1 for each occurrence
                    context.write(k, new IntWritable(1));
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    static class Index1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            try {
                // Receives e.g. hello-a.html -> (1, 1, 1, 1)
                // Every value is a 1 from the map side, so counting the
                // elements of the iterator gives the total occurrences
                int count = 0;
                for (IntWritable value : values) {
                    count++;
                }
                context.write(key, new IntWritable(count));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // Initialize the configuration object
        Configuration conf = new Configuration();
        // Create the job object
        Job job = Job.getInstance(conf);
        // Set the map task class
        job.setMapperClass(Index1Map.class);
        // Set the reduce task class
        job.setReducerClass(Index1Reducer.class);
        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Set the final (reduce) output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the number of reduce tasks
        // job.setNumReduceTasks(2);
        // Set the input path
        FileInputFormat.setInputPaths(job, new Path("E:\\MR\\index"));
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path("E:\\MR\\In\\index1"));
        // Submit the job and wait for completion
        boolean s = job.waitForCompletion(true);
    }
}
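One detail worth noting: Index1Reducer counts the elements of the iterator rather than summing the values, which works here because every map-side value is a 1, but it means the class cannot be reused as a combiner (a combiner's outputs would be partial sums, not 1s). Below is a minimal sketch of a sum-based variant, an addition not in the original program, that would be safe to register with job.setCombinerClass:

static class Index1SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            // Summing (rather than counting) stays correct whether the values
            // are raw 1s from the map side or partial sums from a combiner
            sum += value.get();
        }
        v.set(sum);
        context.write(key, v);
    }
}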
Output of the first MapReduce program:
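The original screenshot is not reproduced here; derived from the sample inputs above, part-r-00000 of the first job should contain:

c++-c.html	2
hello-a.html	4
hello-b.html	4
hello-c.html	6
jack-b.html	1
java-c.html	1
jerry-b.html	1
jerry-c.html	1
jim-a.html	1
jim-b.html	1
kitty-a.html	1
kitty-b.html	1
rose-a.html	1
tom-a.html	1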
Second MapReduce program
package cn.doit19.hadoop.review.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author:tom
 * @Date:Created in 21:03 2020/11/17
 */
@SuppressWarnings("all")
public class Index2 {

    static class Index2Map extends Mapper<LongWritable, Text, Text, Text> {
        Text k = new Text();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                // Called once per line of the first job's output, e.g. "hello-a.html	4"
                String line = value.toString();
                String[] words = line.split("\\s+"); // {hello-a.html, 4}
                String[] split = words[0].split("-"); // {hello, a.html}
                // Key is the bare word
                k.set(split[0]);
                // Value is filename-count, e.g. "a.html-4"
                v.set(split[1] + "-" + words[1]);
                context.write(k, v);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    static class Index2Reducer extends Reducer<Text, Text, Text, Text> {
        Text v = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Receives e.g. hello -> (a.html-4, b.html-4, c.html-6);
            // concatenate all filename-count pairs into one string
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value).append(" ");
            }
            v.set(sb.toString().trim());
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        // Initialize the configuration object
        Configuration conf = new Configuration();
        // Create the job object
        Job job = Job.getInstance(conf);
        // Set the map task class
        job.setMapperClass(Index2.Index2Map.class);
        // Set the reduce task class
        job.setReducerClass(Index2.Index2Reducer.class);
        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Set the final (reduce) output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Set the number of reduce tasks
        // job.setNumReduceTasks(2);
        // Input is the output directory of the first job
        FileInputFormat.setInputPaths(job, new Path("E:\\MR\\In\\index1"));
        // Set the output path
        FileOutputFormat.setOutputPath(job, new Path("E:\\MR\\out\\index2"));
        // Submit the job and wait for completion
        boolean s = job.waitForCompletion(true);
    }
}
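The two programs above are run one at a time by hand. A minimal sketch of an alternative, assuming the Index1 and Index2 classes above and the same package and imports: a single driver that submits job 1, waits for it to finish, and only submits job 2 if job 1 succeeded, so the chain fails fast.

// Same package and imports as the classes above; IndexDriver is hypothetical
public class IndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Path in = new Path("E:\\MR\\index");
        Path mid = new Path("E:\\MR\\In\\index1");
        Path out = new Path("E:\\MR\\out\\index2");

        // Job 1: count word-filename occurrences
        Job job1 = Job.getInstance(conf, "index-step1");
        job1.setJarByClass(Index1.class);
        job1.setMapperClass(Index1.Index1Map.class);
        job1.setReducerClass(Index1.Index1Reducer.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job1, in);
        FileOutputFormat.setOutputPath(job1, mid);

        // Abort the chain if the first job fails
        if (!job1.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2: regroup by word and concatenate filename-count pairs
        Job job2 = Job.getInstance(conf, "index-step2");
        job2.setJarByClass(Index2.class);
        job2.setMapperClass(Index2.Index2Map.class);
        job2.setReducerClass(Index2.Index2Reducer.class);
        // Map output types default to the job output types (Text, Text)
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job2, mid);
        FileOutputFormat.setOutputPath(job2, out);

        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}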
Final output:
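Again derived from the sample inputs rather than reproduced from the original screenshot, the final result should look like this (the order of the filename-count pairs within a line is not guaranteed by MapReduce):

c++	c.html-2
hello	a.html-4 b.html-4 c.html-6
jack	b.html-1
java	c.html-1
jerry	b.html-1 c.html-1
jim	a.html-1 b.html-1
kitty	a.html-1 b.html-1
rose	a.html-1
tom	a.html-1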