0.MaxValue:要求输出cite75_99.txt中最大的CITED值:
要点:
1.Mapper只输出它所处理的数据中的最大值。(重写cleanup()函数)
2.设置Reducer数目为1(-D mapred.reduce.tasks=1),同时Reducer也只输出它所处理的最大值。(重写cleanup()函数)
3.cleanup()函数:在任务结束时执行一次。详见API。
代码如下:
- /*
- * MaxValues
- * 函数作用:输出Patent中最大数值
- * Author: jokes000
- * Date: 2011-12-15
- */
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.conf.Configured;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
- import org.apache.hadoop.util.Tool;
- import org.apache.hadoop.util.ToolRunner;
- public class MaxValue extends Configured implements Tool {
- public static class MapClass extends Mapper<LongWritable,Text,Text,Text> {
- int max = 0;
- // Map Method
- public void map(LongWritable key, Text value, Context context){
- String[] citation = value.toString().split(",", 2);
- try {
- int tmp = Integer.parseInt(citation[0]);
- if( tmp > max ) max = tmp;
- } catch(NumberFormatException e){
- // do nothing.
- }
- //context.write(new Text(citation[0]), new Text(citation[0]));
- }
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- context.write(new Text(max+""), new Text(max+""));
- }
- }
- public static class Reduce extends Reducer<Text,Text,Text,IntWritable> {
- int max = 0;
- // Reduce Method
- public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
- //IntWritable[] top = new IntWritable[10];
- for(Text value : values) {
- try {
- int tmp = Integer.parseInt(value.toString());
- if( tmp > max ) max = tmp;
- } catch(NumberFormatException e) {
- // do nothing.
- }
- }
- //context.write(new Text("Max"), new IntWritable(max));
- }
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- context.write(new Text("Max"), new IntWritable(max));
- }
- }
- @Override
- public int run(String[] arg0) throws Exception {
- Job job = new Job();
- job.setJarByClass(MaxValue.class);
- FileInputFormat.addInputPath(job, new Path(arg0[0]));
- FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
- job.setMapperClass(MapClass.class);
- job.setReducerClass(Reduce.class);
- job.setInputFormatClass(TextInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- job.waitForCompletion(true);
- return 0;
- }
- public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(new Configuration(), new MaxValue(), args);
- System.exit(res);
- }
- }
/*
* MaxValues
* 函数作用:输出Patent中最大数值
* Author: jokes000
* Date: 2011-12-15
*/
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MaxValue extends Configured implements Tool {
public static class MapClass extends Mapper<LongWritable,Text,Text,Text> {
int max = 0;
// Map Method
public void map(LongWritable key, Text value, Context context){
String[] citation = value.toString().split(",", 2);
try {
int tmp = Integer.parseInt(citation[0]);
if( tmp > max ) max = tmp;
} catch(NumberFormatException e){
// do nothing.
}
//context.write(new Text(citation[0]), new Text(citation[0]));
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new Text(max+""), new Text(max+""));
}
}
public static class Reduce extends Reducer<Text,Text,Text,IntWritable> {
int max = 0;
// Reduce Method
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//IntWritable[] top = new IntWritable[10];
for(Text value : values) {
try {
int tmp = Integer.parseInt(value.toString());
if( tmp > max ) max = tmp;
} catch(NumberFormatException e) {
// do nothing.
}
}
//context.write(new Text("Max"), new IntWritable(max));
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new Text("Max"), new IntWritable(max));
}
}
@Override
public int run(String[] arg0) throws Exception {
Job job = new Job();
job.setJarByClass(MaxValue.class);
FileInputFormat.addInputPath(job, new Path(arg0[0]));
FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
job.setMapperClass(MapClass.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.waitForCompletion(true);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new MaxValue(), args);
System.exit(res);
}
}
1.Top K Values: 要求输出apat63_99.txt中的第9列CLAIMS值的最大的K个值:
要点:
1.Mapper只输出它所处理的数据中的最大的K个值。(重写 cleanup()函数)
2.设置Reducer数目为1 -D mapred.reduce.tasks=1,同时对Mapper中输出进行排序,输出最大的K个值(重写 cleanup()函数)
代码如下:
- /*
- * TopKValues
- * 函数作用:输出CLAIMS中最大的几个数值
- * Author: jokes000
- * Date: 2011-12-15
- */
- import java.io.IOException;
- import java.util.Arrays;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.conf.Configured;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
- import org.apache.hadoop.util.Tool;
- import org.apache.hadoop.util.ToolRunner;
- public class TopKValues extends Configured implements Tool {
- public static class MapClass extends Mapper<LongWritable,Text,Text,IntWritable> {
- // 全局变量
- int len; // K值
- int[] top; // 用于保存的数组
- // Map Method
- public void map(LongWritable key, Text value, Context context) {
- String[] fields = value.toString().split(",",-20);
- try {
- int claims = Integer.parseInt(fields[8]);
- add(claims);
- } catch(NumberFormatException e) {
- // do nothing..
- }
- }
- private void add(int value) {
- top[0] = value;
- Arrays.sort(top);
- }
- @Override
- protected void setup(Context context) {
- // 获取设置的"K"值,若没有K值,则设置该值为10
- len = context.getConfiguration().getInt("K", 10);
- top = new int[len+1];
- }
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- for( int i = 1; i <= len; ++ i ) {
- context.write(new Text(top[i]+""), new IntWritable(top[i]));
- }
- }
- }
- public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> {
- int[] top;
- int len;
- @Override
- protected void setup(Context context) {
- len = context.getConfiguration().getInt("K", 10);
- top = new int[len+1];
- }
- private void add(int value) {
- top[0] = value;
- Arrays.sort(top);
- }
- // Reduce Method
- public void reduce(Text key, Iterable<IntWritable> values, Context context) {
- for(IntWritable value : values) {
- add(value.get());
- }
- }
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- for( int i = len; i > 0; -- i ) {
- context.write(new Text("No."+(len-i+1)), new IntWritable(top[i]));
- }
- }
- }
- @Override
- public int run(String[] arg0) throws Exception {
- Job job = new Job();
- job.setJarByClass(TopKValues.class);
- FileInputFormat.addInputPath(job, new Path(arg0[0]));
- FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
- try{
- int K = Integer.parseInt(arg0[2]);
- getConf().setInt("K", K);
- } catch(NumberFormatException e) {
- // do nothing..
- getConf().setInt("K", 20);
- }
- job.setMapperClass(MapClass.class);
- job.setReducerClass(Reduce.class);
- job.setInputFormatClass(TextInputFormat.class);
- job.setOutputFormatClass(TextOutputFormat.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(IntWritable.class);
- job.waitForCompletion(true);
- return 0;
- }
- public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(new Configuration(), new TopKValues(), args);
- System.exit(res);
- }
- }