1. Blacklist Filtering
The idea: the driver reads the blacklist file from HDFS and stores it in the job Configuration; each mapper rebuilds the set in setup() and drops every record whose key appears in it.
package cn.tl.mr;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
// Blacklist filtering: drop records whose key appears in the blacklist
public class BlackListMapReduce {

    // Mapper: rebuilds the blacklist set once per task, then filters records
    public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
        private Set<String> set = null;
        private Text k = new Text();
        private IntWritable v = new IntWritable();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // The blacklist was stored in the job configuration by the driver
            String content = context.getConfiguration().get("black");
            // Tolerate both \n and \r\n line endings in the blacklist file
            String[] arr = content.split("\\r?\\n");
            set = new HashSet<String>(Arrays.asList(arr));
        }

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Split on runs of whitespace so repeated spaces/tabs do not
            // produce empty tokens
            String[] arr = value.toString().split("\\s+");
            // Blacklist filtering: only emit keys that are not blacklisted
            if (!set.contains(arr[0])) {
                k.set(arr[0]);
                v.set(Integer.parseInt(arr[1]));
                context.write(k, v);
            }
        }
    }
    // Reducer
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            for (IntWritable val : values) {
                context.write(key, val);
            }
        }
    }
    // Read the blacklist file from HDFS into a single string
    public static String getBlackList(Configuration conf, String hdfs) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = fs.open(new Path(hdfs));
        ByteArrayOutputStream bao = new ByteArrayOutputStream();
        int len = 0;
        byte[] b = new byte[65536];
        while ((len = in.read(b)) != -1) {
            bao.write(b, 0, len);
        }
        in.close();
        bao.close();
        return new String(bao.toByteArray(), "UTF-8");
    }
    public static void main(String[] args) throws Exception {
        // Cluster configuration; args[2] is the HDFS path of the blacklist file
        Configuration conf = new Configuration();
        conf.set("black", getBlackList(conf, args[2]));
        // Job instance
        Job job = Job.getInstance(conf, "BlackListFilter");
        // Set the driver class
        job.setJarByClass(BlackListMapReduce.class);
        // Set the mapper and reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Set the output <k,v> types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Exit with the job's completion status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
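The program takes three arguments: the input path (args[0]), the output path (args[1]), and the HDFS path of the blacklist file (args[2]). Input lines are whitespace-separated key/count pairs, and the blacklist lists one key per line. A sketch of an invocation, with the jar name and paths purely illustrative:

hadoop jar blacklist.jar cn.tl.mr.BlackListMapReduce /data/input /data/output /data/black.txt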
2. MapReduce Join
Joins in MapReduce fall into two main types: MapJoin (map-side) and ReduceJoin (reduce-side). A MapJoin suits the case where one of the tables is very small. It works much like the blacklist approach above, except that the small file is shipped to every node through the DistributedCache, while the large file serves as the regular job input.
Our Table 1 (the small one) has the columns id, name, province, city; Table 2 has id, course, score.
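Before the full implementation below, here is a minimal sketch of the map-side mechanics, assuming the driver has shipped Table 1 with job.addCacheFile(new URI("/data/table1.txt")); the class name MapJoinMapper and all paths are illustrative, not part of the original code:

package cn.tl.mr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Illustrative map-side join: the small table lives in memory, the large
// table streams through map()
public class MapJoinMapper extends Mapper<Object, Text, Text, NullWritable> {
    // In-memory copy of Table 1 (id,name,province,city), keyed by id
    private Map<String, String> table1 = new HashMap<String, String>();
    private Text out = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Files added with job.addCacheFile() are localized on each node and
        // (by default on YARN) symlinked into the task's working directory
        URI[] cacheFiles = context.getCacheFiles();
        File local = new File(new Path(cacheFiles[0].getPath()).getName());
        BufferedReader reader = new BufferedReader(new FileReader(local));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] arr = line.split(",");
            table1.put(arr[0], line); // id -> full Table 1 row
        }
        reader.close();
    }

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // The job input is the large table: id,course,score
        String[] arr = value.toString().split(",");
        String left = table1.get(arr[0]);
        if (left != null) { // inner join on id
            out.set(left + "," + arr[1] + "," + arr[2]);
            context.write(out, NullWritable.get());
        }
    }
}

Since the join completes entirely on the map side, such a job can call job.setNumReduceTasks(0) so the joined rows are written straight from the map phase.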
MapJoin:
package cn.tl.mr;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class ResultWritable implements Writable {
    private String id;
    private String name;
    private String province;
    private String city;
    private String course;
    private String score;

    public ResultWritable() {
    }

    public ResultWritable(String id, String name, String province, String city, String course, String score) {
        set(id, name, province, city, course, score);
    }

    public void set(String id, String name, String province, String city, String course, String score) {
        this.id = id;
        this.name = name;
        this.province = province;
        this.city = city;