Uses of a Partitioner: 1. Produce multiple output files, split according to business needs. 2. Run multiple reduce tasks in parallel, improving the overall efficiency of the job.
A partitioner example must be packaged as a jar and run on the cluster, because the custom partitioner class has to be shipped to the task nodes.
The following Partitioner example processes a log file and counts words, sending Chinese words to one partition and non-Chinese words to another.
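For contrast: when no partitioner is set, Hadoop's old API falls back to HashPartitioner, which spreads keys across reducers by hash code. A minimal sketch of that default behavior (it matches the logic of org.apache.hadoop.mapred.lib.HashPartitioner; the class name HashLikePartitioner below is ours, used only for illustration):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Default-style partitioning: mask off the sign bit, then take the hash
// modulo the number of reducers, so every key lands in a stable partition.
class HashLikePartitioner<K, V> implements Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numPartitions) {
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
    @Override
    public void configure(JobConf job) {
        // nothing to configure
    }
}

A custom partitioner such as the one below replaces this hash rule with a business rule.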
package oldapi;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
public class OldApi {
    public static void main(String[] args) throws IOException, URISyntaxException {
        String INPUT_PATH = "hdfs://test:9000/log";
        String OUT_PATH = "hdfs://test:9000/result";
        Configuration conf = new Configuration();
        Path path = new Path(OUT_PATH);
        FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
        // delete any previous output recursively so the job can write to a clean directory
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        JobConf jobConf = new JobConf(conf, OldApi.class);
        jobConf.setJarByClass(OldApi.class);
        FileInputFormat.setInputPaths(jobConf, INPUT_PATH);
        jobConf.setInputFormat(TextInputFormat.class);
        jobConf.setMapperClass(MyMapper.class);
        jobConf.setMapOutputKeyClass(Text.class);
        jobConf.setMapOutputValueClass(LongWritable.class);
        // specify the custom partitioner class
        jobConf.setPartitionerClass(MyPartitioner.class);
        // specify the number of reduce tasks: one per partition, each writes its own output file
        jobConf.setNumReduceTasks(2);
        jobConf.setCombinerClass(MyReducer.class);
        jobConf.setReducerClass(MyReducer.class);
        FileOutputFormat.setOutputPath(jobConf, path);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(LongWritable.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
        JobClient.runJob(jobConf);
    }
}
}
class MyPartitioner implements Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        // A Chinese character encodes to more than one byte, so a key containing
        // Chinese has a byte length greater than its character length: such keys
        // go to partition 0, pure-ASCII keys go to partition 1.
        return key.toString().getBytes().length > key.toString().length() ? 0 : 1;
    }
    @Override
    public void configure(JobConf job) {
        // nothing to configure
    }
}
class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
        // each log line holds tab-separated words
        String[] vs = value.toString().split("\t");
        // custom counter tracking how many Chinese words were seen
        Counter counter = reporter.getCounter("sensitive word", "中文");
        for (String text : vs) {
            if (text.getBytes().length > text.length()) {
                counter.increment(1);
            }
            output.collect(new Text(text), new LongWritable(1));
        }
    }
}
class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text key, Iterator<LongWritable> values,
            OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
        // sum the 1s emitted for each word (this class also serves as the combiner)
        long count = 0L;
        while (values.hasNext()) {
            count += values.next().get();
        }
        output.collect(key, new LongWritable(count));
    }
}
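The byte-length comparison that both MyPartitioner and MyMapper rely on can be checked in isolation. A small standalone sketch (the class name ByteLengthDemo is ours; it forces UTF-8, where a Chinese character occupies 3 bytes):

public class ByteLengthDemo {
    public static void main(String[] args) throws Exception {
        String ascii = "hello";  // 5 characters, 5 bytes
        String chinese = "中文"; // 2 characters, 6 bytes in UTF-8
        // false: byte length equals char length, so "hello" goes to partition 1
        System.out.println(ascii.getBytes("UTF-8").length > ascii.length());
        // true: byte length exceeds char length, so "中文" goes to partition 0
        System.out.println(chinese.getBytes("UTF-8").length > chinese.length());
    }
}

Note that the job code calls getBytes() without an explicit charset, so it depends on the platform default encoding being multi-byte for Chinese (as UTF-8 is).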
Export the code above, package it as a jar, and upload it to /usr/downloads/ on the Linux machine.
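If you are not exporting from an IDE, a plain jar command works too; for example, assuming the compiled classes sit under bin/ (an illustrative path):

jar cvf jar.jar -C bin .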
Run hadoop jar jar.jar (note: the command is hadoop jar, not hadoop -jar). Two files, part-00000 and part-00001, are generated under the result directory, holding the counts of Chinese words and non-Chinese words respectively.
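To make the result concrete, suppose the log contained the single tab-separated line hello	中文	hello (a made-up input used only for illustration). The mapper emits (hello,1), (中文,1), (hello,1) and bumps the "sensitive word"/"中文" counter once; the combiner and reducer sum the 1s; and the partitioner routes the keys so that part-00000 contains 中文 1 while part-00001 contains hello 2.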