Hadoop: Chaining MapReduce Jobs to Build an Inverted Index
Data:
File a.txt
a a a b
b c d
File b.txt
c c c d
d a b
File c.txt
d a d c
c f b
Requirement: split each line on TAB, count how many times every word appears, and report the per-file count for each word (i.e. build an inverted index).
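Before looking at the MapReduce code, it may help to picture the target data structure. The sketch below is plain Java with no Hadoop involved; InvertedIndexSketch, buildIndex and fileLines are illustrative names only, not part of the jobs that follow.
import java.util.HashMap;
import java.util.Map;

// Illustrative only: the inverted index as an in-memory structure,
// word -> (file name -> number of occurrences of the word in that file).
public class InvertedIndexSketch {
    public static Map<String, Map<String, Integer>> buildIndex(Map<String, String[]> fileLines) {
        Map<String, Map<String, Integer>> index = new HashMap<>();
        for (Map.Entry<String, String[]> file : fileLines.entrySet()) {
            for (String line : file.getValue()) {
                for (String word : line.split("\t")) {
                    index.computeIfAbsent(word, w -> new HashMap<>())
                         .merge(file.getKey(), 1, Integer::sum);
                }
            }
        }
        return index;
    }
}
The two jobs below compute exactly this information, but distributed: job1 produces the per-file counts, job2 regroups them by word.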
Map1->IndexMap1.java
package day4_jobs_input.jobs.index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * IndexMap1
 * @author Fantome
 * @date 2019/05/29
 */
public class IndexMap1 extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text k = new Text();
    IntWritable v = new IntWritable();
    String name;

    /**
     * Get the name of the file this split belongs to.
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    /**
     * Split each line into words and emit k: fileName-word, v: 1.
     * @param key
     * @param value one line of input
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        v.set(1);
        String[] splits = value.toString().split("\t");
        for (String word : splits) {
            // k: name-word
            k.set(name + "-" + word);
            context.write(k, v);
        }
    }
}
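Note that the cast in setup() assumes a file-based input format such as the default TextInputFormat, whose splits are FileSplit instances; with other input formats the split type can differ. A slightly more defensive variant (a sketch only, not part of the original class) could look like this:
    // Defensive variant of setup(): only cast when the split really is a FileSplit.
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        org.apache.hadoop.mapreduce.InputSplit split = context.getInputSplit();
        if (split instanceof FileSplit) {
            name = ((FileSplit) split).getPath().getName();
        } else {
            name = "unknown"; // fallback; a real job may prefer to fail fast here
        }
    }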
Reduce1->IndexReduce1.java
package day4_jobs_input.jobs.index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * IndexReduce1
 * @author Fantome
 * @date 2019/05/29
 */
public class IndexReduce1 extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    /**
     * Sum the counts for each fileName-word key.
     * @param key
     * @param values
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // k: fileName-word  v: 1
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        v.set(count);
        context.write(key, v);
    }
}
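Because IndexReduce1 only sums IntWritable counts and its input and output types are both (Text, IntWritable), it can also be reused as a combiner to shrink the shuffle of job1. This is an optional optimization, not something the original driver below does; the single extra line would go next to the other job1 settings:
// Optional: run the same summing logic map-side before the shuffle.
job1.setCombinerClass(IndexReduce1.class);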
Map2->IndexMap2.java
package day4_jobs_input.jobs.index;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * IndexMap2
 * @author Fantome
 * @date 2019/05/29
 */
public class IndexMap2 extends Mapper<LongWritable, Text, Text, Text> {
    Text k = new Text();
    Text v = new Text();

    /**
     * Split job1's output lines and re-emit as k: word, v: fileName-->count.
     * @param key
     * @param value
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // value: fileName-word \t count
        String[] splits = value.toString().split("\t");
        String[] name_word = splits[0].split("-");
        // k: word  v: fileName-->count
        k.set(name_word[1]);
        v.set(name_word[0] + "-->" + splits[1]);
        context.write(k, v);
    }
}
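Splitting the first field on "-" assumes that neither the file names nor the words contain a hyphen; a word such as "e-mail" would be truncated to "e". One possible hardening (a sketch, assuming the file name itself is hyphen-free) is to split on the first hyphen only:
// Sketch: split only on the first '-', so hyphens inside the word survive.
String[] name_word = splits[0].split("-", 2);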
Reduce2->IndexReduce2.java
package day4_jobs_input.jobs.index;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * IndexReduce2
 * @author Fantome
 * @date 2019/05/29
 */
public class IndexReduce2 extends Reducer<Text, Text, Text, Text> {
    Text v = new Text();

    /**
     * For each word, concatenate all of its fileName-->count entries.
     * @param key
     * @param values
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // k: word  v: fileName-->count entries separated by TAB
        StringBuilder name_num = new StringBuilder();
        for (Text value : values) {
            name_num.append(value.toString()).append("\t");
        }
        v.set(name_num.toString());
        context.write(key, v);
    }
}
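The values for a key arrive in no particular order, which is why the entries in the job2 output at the end of this post are not sorted by file name. If a deterministic order is wanted, the entries can be collected and sorted before concatenation; this is an optional variant of the reduce body above (it additionally needs java.util.ArrayList, java.util.Collections and java.util.List imports):
// Optional variant: collect, sort, then concatenate for deterministic output.
List<String> entries = new ArrayList<>();
for (Text value : values) {
    entries.add(value.toString());
}
Collections.sort(entries);
v.set(String.join("\t", entries));
context.write(key, v);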
Driver->IndexDriveAll.java
package day4_jobs_input.jobs.index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IndexDriveAll {
    public static void main(String[] args) throws Exception {
        // Local test paths (override any command-line arguments).
        args = new String[]{"E:\\桌面\\大数据\\test\\index\\*",
                "E:\\桌面\\大数据\\test\\indexOut1",
                "E:\\桌面\\大数据\\test\\indexOut2"};
        Configuration conf = new Configuration();
        // job1: per-file word counts, key = fileName-word
        Job job1 = Job.getInstance(conf);
        job1.setJarByClass(IndexDriveAll.class);
        job1.setMapperClass(IndexMap1.class);
        job1.setReducerClass(IndexReduce1.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));
        // job2: regroup job1's output by word, value = fileName-->count
        Job job2 = Job.getInstance(conf);
        job2.setJarByClass(IndexDriveAll.class);
        job2.setMapperClass(IndexMap2.class);
        job2.setReducerClass(IndexReduce2.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job2, new Path(args[1]));
        FileOutputFormat.setOutputPath(job2, new Path(args[2]));
        // Chain job1 and job2: job2 only starts after job1 succeeds.
        JobControl control = new JobControl("index");
        ControlledJob ajob = new ControlledJob(job1.getConfiguration());
        ControlledJob bjob = new ControlledJob(job2.getConfiguration());
        bjob.addDependingJob(ajob);
        control.addJob(ajob);
        control.addJob(bjob);
        Thread thread = new Thread(control);
        thread.start();
        // Poll until all jobs have finished, then stop the control thread and exit.
        while (!control.allFinished()) {
            Thread.sleep(1000);
        }
        control.stop();
        System.exit(0);
    }
}
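JobControl is one way to express the dependency; for a strictly linear two-job pipeline like this one, running the jobs back to back with waitForCompletion is an equally valid and somewhat simpler alternative. The sketch below would replace the JobControl block above:
// Alternative chaining: run job1, and only start job2 if job1 succeeded.
if (job1.waitForCompletion(true)) {
    System.exit(job2.waitForCompletion(true) ? 0 : 1);
}
System.exit(1);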
job1 output
a.txt-a 3
a.txt-b 2
a.txt-c 1
a.txt-d 1
b.txt-a 1
b.txt-b 1
b.txt-c 3
b.txt-d 2
c.txt-a 1
c.txt-b 1
c.txt-c 2
c.txt-d 2
c.txt-f 1
job2 output
a a.txt-->3 c.txt-->1 b.txt-->1
b b.txt-->1 a.txt-->2 c.txt-->1
c b.txt-->3 c.txt-->2 a.txt-->1
d a.txt-->1 c.txt-->2 b.txt-->2
f c.txt-->1