Configuration conf = new Configuration();
Initializes a configuration object. The constructor can take a loadDefaults parameter that controls whether the default configuration files are loaded; by default this parameter is true.
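A minimal sketch of the difference (both constructors exist on org.apache.hadoop.conf.Configuration):
Configuration withDefaults = new Configuration();      // loads core-default.xml, core-site.xml, ...
Configuration noDefaults = new Configuration(false);   // skips the default resources entirely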
Job job = Job.getInstance(conf);
Initializes a Job object. MRJobConfig holds the default parameter values: a map task requests 1024 MB of memory and 1 vCPU by default, and TextInputFormat is the default input format class.
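If these defaults need to be changed, a minimal sketch (the values below are only illustrations, set them before submitting the job):
job.getConfiguration().setInt("mapreduce.map.memory.mb", 2048); // memory requested per map task
job.getConfiguration().setInt("mapreduce.map.cpu.vcores", 2);   // vcores requested per map task
job.setInputFormatClass(TextInputFormat.class);                 // already the default (org.apache.hadoop.mapreduce.lib.input.TextInputFormat); shown only to make it explicit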
job.setJarByClass(hadoop_learning.class);
Sets the job's main class; Hadoop uses it to locate the jar that contains this class.
job.setJobName("test");
Sets the job name.
job.setMapperClass(myMapper.class);
Sets the Mapper class.
Take a look at the Mapper source code.
The run method first calls setup, then feeds each key/value pair from the context to the map method in turn, and finally calls cleanup.
The setup method runs once before any records are processed (only once per task).
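Roughly, Mapper.run looks like this (slightly simplified from the Hadoop source):
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) { // one call to map per input key/value pair
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}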
The run method on the reduce side:
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKey()) { // is there another key group? reduce is called once per group
            reduce(context.getCurrentKey(), context.getValues(), context); // pass this group's key and its values to reduce
            // If a back up store is used, reset it
            Iterator<VALUEIN> iter = context.getValues().iterator();
            if (iter instanceof ReduceContext.ValueIterator) {
                ((ReduceContext.ValueIterator<VALUEIN>) iter).resetBackupStore();
            }
        }
    } finally {
        cleanup(context);
    }
}
package com.cnnc.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.StringTokenizer;
// LongWritable and Text are Hadoop's serializable (Writable) types.
// The generic parameters are keyin, valuein, keyout, valueout. keyin holds the byte offset of the line
// (the block's offset plus the line's offset within the block gives the offset within the whole file),
// and valuein holds the line's text. For example, for a file starting with "hello world\nfoo bar",
// the mapper receives (0, "hello world") and then (12, "foo bar").
public class myMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // The map side sorts its output by key; you can supply your own comparator
    // (dictionary or numeric order); a sketch follows after this class.
    private Text word = new Text();
    private final IntWritable one = new IntWritable(1);
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            // Only object references are passed here, so you might expect later changes to word
            // to change earlier output as well. In fact, as soon as context receives the key and
            // value it serializes them into bytes and appends them to the output buffer, so the
            // final result is not affected.
            context.write(word, one);
            // Why write it this way? Reusing the same Text/IntWritable objects avoids creating a
            // new object per token and reduces memory overhead.
        }
    }
}
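As noted in the mapper, the map-side sort order can be customized. A minimal sketch of a custom sort comparator (the class name DescendingTextComparator is hypothetical; it simply inverts the dictionary order of the Text keys):
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class DescendingTextComparator extends WritableComparator {
    public DescendingTextComparator() {
        super(Text.class, true); // create key instances so the object-level compare below is used
    }
    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return -((Text) a).compareTo((Text) b); // invert the natural (dictionary) order
    }
}
It would be registered in the driver with job.setSortComparatorClass(DescendingTextComparator.class).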
The Reducer:
package com.cnnc.hadoop;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
// keyin and valuein must match the mapper's output key/value types
public class myReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Records with the same key form one group, and reduce is called once per group;
    // e.g. the shuffled groups (a, [1, 1]) and (b, [1]) trigger two reduce calls that write (a, 2) and (b, 1).
    private IntWritable result = new IntWritable();
    @Override
    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
package com.cnnc.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import java.io.IOException;
public class hadoop_learning {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(hadoop_learning.class);
        job.setJobName("test");
        job.setMapperClass(myMapper.class);            // set the Mapper class
        job.setReducerClass(myReduce.class);           // set the Reducer class
        job.setMapOutputKeyClass(Text.class);          // set the map output key type
        job.setMapOutputValueClass(IntWritable.class); // set the map output value type
        Path inputPath = new Path("/test/test.txt");   // input path
        FileInputFormat.addInputPath(job, inputPath);
        Path outputPath = new Path("/test/result");    // output path
        if (outputPath.getFileSystem(conf).exists(outputPath)) { // one way to obtain the FileSystem instance for HDFS
            outputPath.getFileSystem(conf).delete(outputPath, true); // remove any old output directory, recursively
        }
        FileOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }
}
After uploading the jar, run: hadoop jar hadoop_learning.jar com.cnnc.hadoop.hadoop_learning
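Once the job finishes, the output can be inspected (the path comes from the driver above; part-r-00000 is the standard name of the first reducer's output file):
hdfs dfs -cat /test/result/part-r-00000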