Test data
file1:
2 3 4 12 1212 121 23 232
45 545 7667 323 5454 7676 2323
655 12 1212 121
23 232 45 545 7667 323 5454
7676 2323 655
43333 334 34 22222 2222 33333
121 232 4 545 65 87 454 234
121 232 4 545 65 87 454 234
121 232 4 545 65 87 454 234
file2:
2 3 4 12 1212 121 23 232
45 545 7667 323 5454 7676 2323
655 12 1212 121
23 232 45 545 7667 323 5454
7676 2323 655
43333 334 34 22222 2222 33333
121 232 4 545 65 87 454 234
121 232 4 545 65 87 454 234
121 232 4 545 65 87 454 234
Custom Mapper
package com.test.sort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SortMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();    // read one line of input
        String[] fields = line.split(" "); // split it into individual numbers
        for (String field : fields) {
            // emit each number as the key with a count of 1
            context.write(new Text(field), new IntWritable(1));
        }
    }
}
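With the test data above, the first line 2 3 4 12 1212 121 23 232 is split on spaces and the mapper emits one (number, 1) pair per token: (2,1), (3,1), (4,1), (12,1), (1212,1), (121,1), (23,1), (232,1). The shuffle then groups these pairs by key before they reach the reducer.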
Custom Partitioner
package com.test.sort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class SortPartition extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable value, int numPartitions) {
        int length = text.toString().length(); // length of the key, i.e. its digit count
        int partition = 4;                     // default partition for numbers of 5+ digits
        switch (length) {                      // choose a partition by digit count
            case 1:
                partition = 0;                 // partition numbers must start at 0
                break;
            case 2:
                partition = 1;
                break;
            case 3:
                partition = 2;
                break;
            case 4:
                partition = 3;
                break;
            default:
                break;
        }
        return partition;
    }
}
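getPartition only ever returns values 0 through 4, so the job needs at least five reduce tasks (the driver below calls setNumReduceTasks(5)); running with fewer reducers than that would fail at runtime with an illegal-partition error. As a quick sanity check outside the job, here is a minimal sketch; the PartitionCheck class is hypothetical and assumes only the Hadoop client jars and the SortPartition class above on the classpath:

package com.test.sort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

// Hypothetical helper, not part of the job: prints the partition each sample key lands in.
public class PartitionCheck {
    public static void main(String[] args) {
        SortPartition p = new SortPartition();
        for (String key : new String[]{"2", "23", "232", "1212", "43333"}) {
            // expected: 2 -> 0, 23 -> 1, 232 -> 2, 1212 -> 3, 43333 -> 4
            System.out.println(key + " -> " + p.getPartition(new Text(key), new IntWritable(1), 5));
        }
    }
}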
Custom Reducer
package com.test.sort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SortReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        // add up the 1s emitted by the mapper to get the total count for this number
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
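The shuffle hands the reducer each distinct number together with all of its 1s. With the test data above, for example, 121 appears five times in each of file1 and file2, so the reducer receives ten 1s for that key and writes 121 10.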
Custom Driver
package com.test.sort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SortDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // hard-coded paths for running locally from the IDE
        args = new String[2];
        args[0] = "src/main/resources/input";
        args[1] = "src/main/resources/output";

        Configuration cfg = new Configuration();
        cfg.set("mapreduce.framework.name", "local"); // run in local mode
        cfg.set("fs.defaultFS", "file:///");          // use the local file system

        // delete the output directory if it already exists, otherwise the job fails
        final FileSystem filesystem = FileSystem.get(cfg);
        if (filesystem.exists(new Path(args[1]))) {
            filesystem.delete(new Path(args[1]), true);
        }

        Job job = Job.getInstance(cfg);
        job.setJarByClass(SortDriver.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // five reduce tasks, one per partition returned by SortPartition (0 through 4)
        job.setPartitionerClass(SortPartition.class);
        job.setNumReduceTasks(5);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path

        // submit the job and wait for it to finish
        int ec = job.waitForCompletion(true) ? 0 : 1;
        System.exit(ec);
    }
}
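After a successful run, the output directory holds one file per reduce task, part-r-00000 through part-r-00004: one-digit numbers in the first, two-digit in the second, and so on, with anything of five or more digits falling into part-r-00004. Within each file the keys arrive in sorted order (lexicographic for Text, which matches numeric order when all keys have the same digit count). With the test data above, part-r-00000 should contain the tab-separated pairs:

2	2
3	2
4	8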