一,MapReduce的流程架构图:
二,MapReduce简单的word count的流程
三,MapReduce简单的word count的代码
1,主类
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCJob {
    /**
     * Word-count driver: configures and submits the MapReduce job.
     *
     * <p>Optionally accepts the input and output paths as the first two
     * command-line arguments; falls back to the original hard-coded HDFS
     * paths when they are absent (backward compatible with the old usage).
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCJob.class);

        job.setMapperClass(WCMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(WCReducer.class);
        // Word-count reduction is commutative and associative, so the
        // reducer can safely double as a combiner to cut shuffle traffic.
        job.setCombinerClass(WCReducer.class);
        // Final (reducer) output types. Without these the framework
        // defaults would not match what WCReducer actually emits.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        Path inPath = new Path(args.length > 0 ? args[0] : "/user/lyl/wc");
        Path outPath = new Path(args.length > 1 ? args[1] : "/user/lyl/wcout");
        FileInputFormat.addInputPath(job, inPath);

        // Hadoop refuses to write into an existing output directory, so
        // remove a stale one left over from a previous run.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        boolean success = job.waitForCompletion(true);
        if (success) {
            System.out.println("job success!");
        }
        // Propagate the job result as the process exit code so that
        // shell scripts / schedulers can detect failure.
        System.exit(success ? 0 : 1);
    }
}
2,map类
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

    // Reused output writables: Hadoop serializes the key/value at
    // context.write() time, so allocating a fresh Text/IntWritable per
    // token is pure GC pressure on large inputs. Reuse is the standard
    // MapReduce idiom.
    private final Text word = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    /**
     * Tokenizes one input line on single spaces and emits (token, 1)
     * for every non-empty token.
     *
     * @param key     byte offset of the line within the input split (unused)
     * @param value   one line of input text
     * @param context sink for the (word, 1) pairs
     */
    @Override
    protected void map(LongWritable key, Text value,
            Context context)
            throws IOException, InterruptedException {
        String[] tokens = StringUtils.split(value.toString(), ' ');
        for (String token : tokens) {
            // Consecutive spaces can yield empty tokens; counting them
            // would pollute the output with a "" key.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}
3,reduce类
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

    // Reused output value: one allocation total instead of one per
    // distinct word (same writable-reuse idiom as the mapper side).
    private final IntWritable result = new IntWritable();

    /**
     * Sums all partial counts for one word and emits (word, total).
     * Also used as the combiner, which is valid because integer
     * addition is commutative and associative.
     *
     * @param text     the word
     * @param iterable the partial counts produced by mappers/combiners
     * @param context  sink for the (word, total) pair
     */
    @Override
    protected void reduce(Text text, Iterable<IntWritable> iterable,
            Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable partial : iterable) {
            sum += partial.get();
        }
        result.set(sum);
        context.write(text, result);
    }
}
四,MapReduce的架构
一主多从架构
主 JobTracker: 负责调度分配每一个子任务task运行于TaskTracker上,如果发现有失败的task就重新将其任务分配到其他节点。每一个Hadoop集群中只有一个JobTracker,一般它运行在Master节点上。
从TaskTracker:TaskTracker主动与JobTracker通信,接收作业,并负责直接执行每一个任务。为了减少网络带宽占用,TaskTracker最好运行在HDFS的DataNode上。