1.导入maven依赖
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>3.3.6</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>3.3.6</version>
</dependency>
2.编写主程序:test.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
public class test{
    /**
     * Driver for the word-count-style job with a distributed-cache dictionary join.
     * Wires mapper, partitioner, sort/grouping comparators and reducer, then submits
     * the job and reports its outcome through the process exit code.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load configuration (true => also read the *-site.xml resource files).
        Configuration conf = new Configuration(true);
        // Obtain a job handle.
        Job job = Job.getInstance(conf);
        // Class whose enclosing jar is shipped to the cluster.
        job.setJarByClass(test.class);
        // Job name shown in the cluster UI.
        job.setJobName("test");
        // Number of reduce tasks (and therefore output partitions/files).
        job.setNumReduceTasks(2);
        // Ship the dictionary file to every task via the distributed cache
        // (read back in TMapper.setup for the map-side join).
        job.addCacheFile(new Path("/data/test/dict/dict.txt").toUri());
        // Input directory.
        TextInputFormat.addInputPath(job, new Path("/data/test/input"));
        // Output directory (must not already exist).
        TextOutputFormat.setOutputPath(job, new Path("/data/test/output"));
        // Map side: mapper class and its output key/value types.
        job.setMapperClass(TMapper.class);
        job.setMapOutputKeyClass(TKey.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Sort comparator used when ordering map output.
        job.setSortComparatorClass(TSortComparator.class);
        // Partition function routing keys to reducers.
        job.setPartitionerClass(TPartitioner.class);
        // Grouping comparator deciding which keys share one reduce() call.
        job.setGroupingComparatorClass(TGroupingComparator.class);
        // Reduce side: reducer class and final output key/value types.
        job.setReducerClass(TReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Submit and wait (true => print progress). Propagate success/failure via
        // the exit code — the original discarded the boolean result, so a failed
        // job still exited 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.编写自定义的key类:TKey.java
import lombok.Data;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
@Data
public class TKey implements WritableComparable<TKey> {
    // Composite key: two numeric fields plus the dictionary value joined in the mapper.
    private int t1;
    private int t2;
    private String dict;

    /**
     * Default ordering: by t1 only. NOTE(review): this is not consistent with the
     * lombok-generated equals (which also uses t2 and dict), and the job installs
     * explicit sort/grouping comparators anyway — kept as-is to preserve behavior.
     */
    @Override
    public int compareTo(TKey tKey) {
        return Integer.compare(this.t1, tKey.getT1());
    }

    /** Serialize fields in a fixed order; readFields must mirror this exactly. */
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(t1);
        dataOutput.writeInt(t2);
        // writeUTF throws NullPointerException on null; the mapper may set a null
        // dict when an id is missing from the dictionary, so fall back to "".
        dataOutput.writeUTF(dict == null ? "" : dict);
    }

    /** Deserialize fields in the same order write() produced them. */
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.t1 = dataInput.readInt();
        this.t2 = dataInput.readInt();
        this.dict = dataInput.readUTF();
    }
}
4.编写自定义的map类:TMapper.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
public class TMapper extends Mapper<LongWritable, Text, TKey, IntWritable> {
    // Output objects reused across map() calls — the standard Hadoop pattern to
    // avoid one allocation per record.
    TKey mKey = new TKey();
    IntWritable mval = new IntWritable(1);
    // id -> name lookup table loaded from the distributed cache in setup().
    public HashMap<String,String> dict = new HashMap<String,String>();

    /**
     * Load the dictionary file registered with job.addCacheFile into {@link #dict}.
     * The cached file is localized into the task's working directory, so it is
     * opened by its bare file name. Expected line format: "id,name".
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] files = context.getCacheFiles();
        Path path = new Path(files[0].getPath());
        // try-with-resources: the original leaked the reader. Read as UTF-8
        // explicitly instead of relying on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path.getName()), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] split = line.split(",");
                // Skip blank/malformed dictionary lines instead of throwing.
                if (split.length >= 2) {
                    dict.put(split[0], split[1]);
                }
            }
        }
    }

    /**
     * Parse one "t1,t2" input line into the composite key, attach the dictionary
     * value for t1, and emit (key, 1).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] strs = StringUtils.split(value.toString(), ',');
        // Defensive: ignore lines without both fields rather than crashing the task.
        if (strs.length < 2) {
            return;
        }
        mKey.setT1(Integer.parseInt(strs[0]));
        mKey.setT2(Integer.parseInt(strs[1]));
        // Missing dictionary entries map to "" so key serialization never sees null.
        String name = dict.get(strs[0]);
        mKey.setDict(name == null ? "" : name);
        context.write(mKey, mval);
    }
}
5.编写自定义的map排序比较器:TSortComparator.java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class TSortComparator extends WritableComparator {
    /** Register TKey and let WritableComparator instantiate keys for deserialized comparison. */
    public TSortComparator() {
        super(TKey.class, true);
    }

    /** Sort map output ascending by the t1 field only. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TKey left = (TKey) a;
        TKey right = (TKey) b;
        return Integer.compare(left.getT1(), right.getT1());
    }
}
6.编写自定义的分区类:TPartitioner.java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class TPartitioner extends Partitioner<TKey, IntWritable> {
    /**
     * Route records by t1 so every record sharing a t1 value reaches the same
     * reducer. Math.floorMod keeps the result in [0, numPartitions) even when t1
     * is negative — the original's plain {@code %} would return a negative index
     * and make the framework fail with "Illegal partition". For non-negative t1
     * the result is identical.
     */
    @Override
    public int getPartition(TKey tKey, IntWritable intWritable, int numPartitions) {
        return Math.floorMod(tKey.getT1(), numPartitions);
    }
}
7.编写自定义的reduce分组排序比较器:TGroupingComparator.java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class TGroupingComparator extends WritableComparator {
    /** Register TKey and let WritableComparator instantiate keys for deserialized comparison. */
    public TGroupingComparator() {
        super(TKey.class, true);
    }

    /** Group keys for reduce() by t1 only: equal t1 => one reduce() invocation. */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TKey left = (TKey) a;
        TKey right = (TKey) b;
        return Integer.compare(left.getT1(), right.getT1());
    }
}
8.编写自定义的reduce类:TReducer.java
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class TReducer extends Reducer<TKey, IntWritable, Text, IntWritable> {
    // Output objects reused across reduce() calls (standard Hadoop pattern).
    Text rkey = new Text();
    IntWritable rval = new IntWritable();

    /**
     * Sum the counts of one key group and emit "t1-t2-dict" with the total.
     * The grouping comparator groups by t1, so t2/dict come from the group's
     * first-sorted key.
     */
    @Override
    protected void reduce(TKey key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable v : values) {
            total += v.get();
        }
        rkey.set(key.getT1() + "-" + key.getT2() + "-" + key.getDict());
        rval.set(total);
        context.write(rkey, rval);
    }
}
9.测试数据
1)将以下文件内容放到hdfs的/data/test/input/data.txt里
1,2
2,2
2,2
2,3
3,4
1,2
2,2
2,3
3,4
1,2
2)将以下文件内容放到hdfs的/data/test/dict/dict.txt里
1,beijing
2,shanghai
3,guangdong
10.打包编译运行(注意:以上源码没有 package 声明,主类直接用类名 test 运行;若加了包声明,则需改为完整类名)
hadoop jar xxx.jar test