import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class wordcount1 extends Configured implements Tool {

    public static class mapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter report)
                throws IOException {
            // Count tokens within this single input record before emitting.
            Map<String, Integer> map = new HashMap<String, Integer>();
            String[] ss = value.toString().split(":");
            FileSplit fs = (FileSplit) report.getInputSplit();
            System.out.println(fs.getPath().toUri().toString());
            for (int i = 0; i < ss.length; i++) {
                if (!map.containsKey(ss[i])) {
                    map.put(ss[i], 1);
                } else {
                    int tmp = map.get(ss[i]) + 1;
                    map.put(ss[i], tmp);
                }
            }
            for (Map.Entry<String, Integer> m : map.entrySet()) {
                System.out.println(m.getKey() + "\t" + m.getValue());
                output.collect(new Text(m.getKey()), new IntWritable(m.getValue()));
            }
        }
    }
    public static class reducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<IntWritable> value,
                OutputCollector<Text, IntWritable> output, Reporter report)
                throws IOException {
            int sum = 0;
            while (value.hasNext()) {
                sum += value.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }
    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = new Configuration();
        JobConf job = new JobConf(conf, wordcount1.class);
        FileInputFormat.addInputPath(job, new Path(arg0[0]));
        FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
        job.setJobName("test citation");
        job.setMapperClass(mapper.class);
        job.setReducerClass(reducer.class);
        /*12/04/08 13:56:09 INFO mapred.JobClient: Reduce input groups=4
        12/04/08 13:56:09 INFO mapred.JobClient: Combine output records=4
        12/04/08 13:56:09 INFO mapred.JobClient: Map input records=4
        12/04/08 13:56:09 INFO mapred.JobClient: Reduce shuffle bytes=0
        12/04/08 13:56:09 INFO mapred.JobClient: Reduce output records=4
        12/04/08 13:56:09 INFO mapred.JobClient: Spilled Records=8
        12/04/08 13:56:09 INFO mapred.JobClient: Map output bytes=42
        12/04/08 13:56:09 INFO mapred.JobClient: Map input bytes=33
        12/04/08 13:56:09 INFO mapred.JobClient: Combine input records=5
        12/04/08 13:56:09 INFO mapred.JobClient: Map output records=5
        12/04/08 13:56:09 INFO mapred.JobClient: Reduce input records=4
        */
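        // The counters above are from a run with the combiner below enabled:
        // 5 map output records were combined into 4 reduce input records.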
        job.setCombinerClass(reducer.class);
        //job.setNumReduceTasks(2);
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        JobClient.runJob(job);
        return 0;
    }
    public static void main(String[] args) {
        try {
            System.exit(ToolRunner.run(new Configuration(), new wordcount1(), args));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
This version can only aggregate within a single map input key/value pair. For example, if one input value contains the tokens huhu, xie, xie, the map emits huhu 1 and xie 2; without the in-record aggregation it would emit huhu 1, xie 1, xie 1.
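To make the difference concrete, here is a minimal self-contained sketch (plain Java, no Hadoop required; the sample line, the ':' delimiter, and the counting loop mirror the mapper above, while the class name AggregationDemo is just for illustration) that prints both the unaggregated and the per-record aggregated output:

import java.util.HashMap;
import java.util.Map;

public class AggregationDemo {
    public static void main(String[] args) {
        String value = "huhu:xie:xie"; // one map input value, split on ':'

        // Without aggregation: emit (token, 1) for every token.
        for (String token : value.split(":")) {
            System.out.println(token + "\t1"); // huhu 1, xie 1, xie 1
        }

        // With per-record aggregation: count within the record, then emit.
        Map<String, Integer> counts = new HashMap<String, Integer>();
        for (String token : value.split(":")) {
            Integer c = counts.get(token);
            counts.put(token, c == null ? 1 : c + 1);
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            // huhu 1 and xie 2 (HashMap iteration order is not guaranteed)
            System.out.println(e.getKey() + "\t" + e.getValue());
        }
    }
}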
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;

public class wordcount2 {

    public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Map<String, Integer> map;

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            map = new HashMap<String, Integer>();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] ss = value.toString().split(":");
            // Does the combiner's work inside the mapper: accumulate counts
            // across all records seen by this map task.
            for (int i = 0; i < ss.length; i++) {
                if (!map.containsKey(ss[i])) {
                    map.put(ss[i], 1);
                } else {
                    int tmp = map.get(ss[i]) + 1;
                    map.put(ss[i], tmp);
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            // Emit the accumulated counts once all input has been processed.
            for (Map.Entry<String, Integer> m : map.entrySet()) {
                context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
            }
        }
    }
    public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> value,
                Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            // Iterate the values once. The original called value.iterator()
            // on every loop pass, which happens to work with Hadoop's Iterable
            // implementation but would loop forever on a general Iterable.
            for (IntWritable v : value) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    public static void main(String[] args) {
        try {
            Job job = new Job();
            job.setJarByClass(wordcount2.class);
            job.setJobName("wordcount2");
            FileInputFormat.addInputPath(job, new Path("input"));
            FileOutputFormat.setOutputPath(job, new Path("output"));
            job.setMapperClass(mapper.class);
            job.setReducerClass(reducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
This version can aggregate across multiple key/value pairs, even across records from different files handled by the same map task. It plays the same role as a Combiner, but the Combiner is only a Hadoop optimization: the framework does not guarantee whether or how many times it runs, whereas in-mapper combining gives you explicit control over the execution.
One problem remains: memory. Because this approach emits map output only after all input has been processed, the in-memory map can keep growing until the task runs out of memory. An effective fix is to set a threshold N and flush the map every N records instead of waiting for the end, which is what wordcount3 below does.
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.*;

public class wordcount3 {

    public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Map<String, Integer> map;
        private int N;

        @Override
        protected void setup(Context context) throws IOException,
                InterruptedException {
            map = new HashMap<String, Integer>();
            N = 0;
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] ss = value.toString().split(":");
            N++;
            // Does the combiner's work inside the mapper.
            for (int i = 0; i < ss.length; i++) {
                if (!map.containsKey(ss[i])) {
                    map.put(ss[i], 1);
                } else {
                    int tmp = map.get(ss[i]) + 1;
                    map.put(ss[i], tmp);
                }
            }
            // Flush after every 2 input records (a deliberately tiny
            // threshold N, chosen here only for demonstration).
            if (N == 2) {
                for (Map.Entry<String, Integer> m : map.entrySet()) {
                    context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
                }
                N = 0;
                map.clear();
                System.out.println("flushed after 2 input records");
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            // Write out the final batch (from fewer than N remaining records).
            if (map.size() > 0) {
                for (Map.Entry<String, Integer> m : map.entrySet()) {
                    context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
                }
                System.out.println("wrote last " + map.size() + " key/value pairs");
            }
        }
    }
    public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> value,
                Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            // Same fix as in wordcount2: iterate the Iterable exactly once.
            for (IntWritable v : value) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    public static void main(String[] args) {
        try {
            Job job = new Job();
            job.setJarByClass(wordcount3.class);
            job.setJobName("wordcount3");
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            job.setMapperClass(mapper.class);
            job.setReducerClass(reducer.class);
            //job.setCombinerClass(reducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
If N is too large, the in-memory map can still overflow memory; if N is too small, little aggregation happens and the performance benefit is lost. Choosing N well is therefore important.
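One way to make the memory bound explicit, instead of tuning N by input-record count, is to flush when the map holds too many distinct keys: it is the map's size, not the number of records consumed, that occupies memory. Below is a minimal sketch of such a map() method as a drop-in replacement for the one in wordcount3, reusing its `map` field; the MAX_KEYS threshold is an illustrative assumption of mine, not part of the original code. cleanup() would flush the remainder exactly as wordcount3 already does.

    // Sketch: flush on distinct-key count rather than input-record count.
    // MAX_KEYS is a hypothetical threshold; tune it to the task's heap size.
    private static final int MAX_KEYS = 10000;

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split(":")) {
            Integer c = map.get(token);
            map.put(token, c == null ? 1 : c + 1);
        }
        if (map.size() >= MAX_KEYS) {
            // The map is as large as we allow it to get: emit and reset.
            for (Map.Entry<String, Integer> m : map.entrySet()) {
                context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
            }
            map.clear();
        }
    }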