Hadoop_Day05
WordCount code
package org.hadoop.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCDriver.class);
        job.setJobName("xljWC");
        job.setNumReduceTasks(1);
        // Mapper and its intermediate output key/value types
        job.setMapperClass(WCMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Reducer and the job's final output key/value types
        job.setReducerClass(WCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        // Delete the output directory if it already exists; otherwise the job would fail
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(path2)) {
            fileSystem.delete(path2, true);
        }
        FileInputFormat.addInputPath(job, path1);
        FileOutputFormat.setOutputPath(job, path2);
        job.waitForCompletion(true);
    }
}
package org.hadoop.wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WCMap extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key is the byte offset of the line within the split; value is the line itself.
        // Split the comma-separated line into words and emit (word, 1) for each word.
        String s = value.toString();
        String[] sArr = s.split(",");
        for (String s1 : sArr) {
            context.write(new Text(s1), new LongWritable(1L));
        }
    }
}
package org.hadoop.wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // Sum all the 1s emitted for this word and write the total count
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        context.write(key, new LongWritable(sum));
    }
}
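As a worked example (the input contents are made up for illustration): if the input file holds the single line "hadoop,spark,hadoop", the mapper emits (hadoop,1), (spark,1), (hadoop,1); after the shuffle the reducer receives hadoop -> [1,1] and spark -> [1], so the final output is:
hadoop	2
spark	1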
Using a simplified WordCount to view the byte offsets
package org.hadoop.wordcount2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCDriver1 {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCDriver1.class);
        job.setJobName("xljWC1");
        job.setNumReduceTasks(1);
        // Mapper and its intermediate output key/value types
        job.setMapperClass(WCMap1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Reducer and the job's final output key/value types
        job.setReducerClass(WCReduce1.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        Path path1 = new Path(args[0]);
        Path path2 = new Path(args[1]);
        // Delete the output directory if it already exists; otherwise the job would fail
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(path2)) {
            fileSystem.delete(path2, true);
        }
        FileInputFormat.addInputPath(job, path1);
        FileOutputFormat.setOutputPath(job, path2);
        job.waitForCompletion(true);
    }
}
package org.hadoop.wordcount2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WCMap1 extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // key is the byte offset of the current line within the split;
        // echo it together with the line content so the offsets can be inspected in the output
        context.write(new Text("offset " + key + "\tcontent " + value), NullWritable.get());
    }
}
package org.hadoop.wordcount2;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCReduce1 extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Nothing to aggregate: pass each "offset + content" line through unchanged
        context.write(key, NullWritable.get());
    }
}
The byte offsets differ between CRLF (the Windows default line ending, \r\n) and LF (the Linux default, \n), because each CRLF line terminator takes two bytes while LF takes only one.
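A minimal standalone sketch (plain Java, not MapReduce; the file contents are made up) that mimics how the line-start byte offsets handed to the mapper are computed, showing the difference between the two line endings:
// Print the byte offset at which each line starts, for the same two lines
// terminated with CRLF vs. LF.
public class OffsetDemo {
    public static void main(String[] args) {
        printLineStartOffsets("hello\r\nworld\r\n", "CRLF"); // lines start at 0 and 7
        printLineStartOffsets("hello\nworld\n", "LF");       // lines start at 0 and 6
    }

    static void printLineStartOffsets(String data, String label) {
        byte[] bytes = data.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        System.out.println(label + ": line starts at offset 0");
        for (int i = 0; i < bytes.length - 1; i++) {
            if (bytes[i] == '\n') { // a new line begins right after every '\n'
                System.out.println(label + ": line starts at offset " + (i + 1));
            }
        }
    }
}
With CRLF the second line starts at offset 7 ("hello" is 5 bytes plus \r\n), with LF at offset 6, which is exactly the difference WCMap1's output makes visible.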
How YARN works
- The MapReduce program is submitted from the node where the client runs. YarnRunner requests an Application from the ResourceManager, which returns the application's resource staging path and an application_id to YarnRunner. The client uploads the resources the job needs to HDFS and, once the upload is complete, requests that the MRAppMaster (MapReduce ApplicationMaster) be run (a driver-side submission sketch follows this list).
- The ResourceManager turns the user's request into a task. One NodeManager picks up the task, creates a Container, and starts the MRAppMaster in it; the Container copies the job resources from HDFS to the local node. The MRAppMaster then asks the ResourceManager for resources to run the map tasks, and the RM assigns them to two other NodeManagers, each of which picks up its task and creates a Container. The MRAppMaster sends the program start-up script to these two NodeManagers, which launch their map tasks; each map task partitions and sorts its output.
- Once all map tasks have finished, the MRAppMaster requests Containers from the ResourceManager to run the reduce tasks; each reduce task fetches its partition of the data from the map tasks. When the job completes, the MRAppMaster asks the ResourceManager to deregister it.
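The drivers above use whichever framework the local configuration selects. As a rough illustration of the client side of the submission flow described above, here is a minimal sketch; the class name WCYarnDriver and the ResourceManager hostname are made up, and in practice mapreduce.framework.name and yarn.resourcemanager.hostname are set in mapred-site.xml and yarn-site.xml rather than in code:
package org.hadoop.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class WCYarnDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Route the submission through YarnRunner instead of the local job runner
        conf.set("mapreduce.framework.name", "yarn");
        // Hypothetical ResourceManager host; normally configured in yarn-site.xml, not in code
        conf.set("yarn.resourcemanager.hostname", "hadoop01");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCYarnDriver.class);
        // ... same mapper/reducer/input/output setup as in WCDriver ...
        // waitForCompletion triggers the flow above: YarnRunner obtains an application_id,
        // the job resources are uploaded to HDFS, and the MRAppMaster then drives the
        // map and reduce containers until the job finishes.
        job.waitForCompletion(true);
    }
}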