MR Template Optimization
Key changes: the driver class now extends Configured and implements Tool, and the job is launched through ToolRunner:
public class WordCountUpMR extends Configured implements Tool
int status = ToolRunner.run(configuration, new WordCountUpMR(), args);

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.util.List;
// Extend Configured and implement Tool so that ToolRunner can inject the Configuration
// and parse generic options such as -D property=value
public class WordCountUpMR extends Configured implements Tool{
/**
* Mapper: splits each line into words and emits <word, 1>
*/
public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private Text mapOutputKey = new Text();
private IntWritable mapOutputValue = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
System.out.println("keyIn:"+key +" ValueIn:"+value);
String lineValue = value.toString();
String[] strs = lineValue.split(" ");
for(String str : strs){
mapOutputKey.set(str);
context.write(mapOutputKey,mapOutputValue);
}
}
}
/**
* Reducer: sums the 1s for each word and emits <word, count>
*/
public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable outputValue = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
List<IntWritable> list = Lists.newArrayList(values);
System.out.println("keyIn:"+key +" ValueIn:"+list);
int sum = 0;
for(IntWritable value : list){
sum +=value.get();
}
outputValue.set(sum);
context.write(key,outputValue);
}
}
/**
* run
* @param args
* @return
* @throws Exception
*/
public int run(String[] args) throws Exception {
//driver
//1) get conf
Configuration configuration = this.getConf();
//2) create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
job.setJarByClass(this.getClass());
//3.1) input
Path path = new Path(args[0]);
FileInputFormat.addInputPath(job, path);
//3.2) map
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//3.3) reduce
job.setReducerClass(WordCountReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//3.4) output
Path output = new Path(args[1]);
FileOutputFormat.setOutputPath(job, output);
//4) commit
boolean isSuc = job.waitForCompletion(true);
return (isSuc) ? 0 : 1;
}
public static void main(String[] args) {
// NOTE: these hardcoded paths override whatever is passed on the command line
args = new String[]{
"hdfs://bigdata-pro-m01.kfk.com:9000/user/kfk/datas/wordcount.txt",
"hdfs://bigdata-pro-m01.kfk.com:9000/user/kfk/mr/output"
};
Configuration configuration = new Configuration();
try {
// check first: if the output directory already exists, delete it (MR refuses to overwrite existing output)
Path fileOutPath = new Path(args[1]);
// resolve the FileSystem from the path's URI so the hdfs:// scheme is honored
FileSystem fileSystem = fileOutPath.getFileSystem(configuration);
if(fileSystem.exists(fileOutPath)){
fileSystem.delete(fileOutPath,true);
}
//int status = wordCountMR.run(args);
int status = ToolRunner.run(configuration,new WordCountUpMR(),args);
System.exit(status);
} catch (Exception e) {
e.printStackTrace();
}
}
}
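Because the job is submitted through ToolRunner, Hadoop's GenericOptionsParser handles standard options such as -D property=value automatically. Once the hardcoded args in main() are removed, properties could be overridden at launch time without code changes, e.g. (jar name and paths hypothetical): bin/hadoop jar wordcountup.jar WordCountUpMR -D mapreduce.job.reduces=2 /user/kfk/datas/wordcount.txt /user/kfk/mr/output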
Section 13: The MR execution flow, explained through the WordCount program
input -> map -> reduce -> output
map input -> <0,hadoop spark>
map output -> <hadoop,1><spark,1>
reduce input -> <hadoop,List(1,1)>
reduce output -> <hadoop,2>
A few key points to know:
Point 1:
By default, the map input <key,value> pair is:
key: the byte offset of the line within the file
value: the text of one line in the file
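For example, for a two-line input file containing "hadoop spark" and "hadoop java", map is called twice: with <0, hadoop spark> and then with <13, hadoop java>, since the second line starts at byte offset 13 (the 12 bytes of "hadoop spark" plus the newline).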
Point 2:
map -> partition -> sort -> group -> reduce
*Partition:
decides which reduce task each map output <key,value> pair is sent to (see the Partitioner sketch after this list)
*Sort:
map output <key,value> pairs are sorted by key before they reach the reducer
*Group:
values with the same key are combined together into a List
eg: (keyIn:spark ValueIn:[1, 1, 1, 1])
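As a sketch of what partitioning does: the default HashPartitioner routes each key by its hash, so identical keys always reach the same reduce task. A custom Partitioner with the same logic would look like this (the class name WordPartitioner is hypothetical, not part of the tutorial code):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Same logic as Hadoop's default HashPartitioner: mask off the sign bit,
// then take the hash modulo the number of reduce tasks.
public class WordPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

It would be enabled in the driver with job.setPartitionerClass(WordPartitioner.class).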
Point 3:
By default, reduce writes each key and value as one line of output,
with a tab character (\t) separating the key from the value; the sketch below shows how to change the separator.
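If a separator other than tab is wanted, TextOutputFormat reads its separator from a configuration property, so a single line in the driver changes it (a sketch, assuming the Hadoop 2.x property name):

// In run(), before creating the Job: output lines become "key,value" instead of "key\tvalue"
configuration.set("mapreduce.output.textoutputformat.separator", ",");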
**********
Section 14: Data types
1. Every data type in MR must implement the Writable interface, so that data of these types can be serialized for network transfer and file storage.
2. Basic MR data types:
BooleanWritable: boolean
ByteWritable
DoubleWritable
FloatWritable
The most commonly used types:
IntWritable
LongWritable
Text: stores text in UTF-8 format
NullWritable: used when the key or the value in a <key,value> pair is empty (see the snippet below)
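For example, inside a Mapper's map() method, value-only output can be produced like this (hypothetical fragment, not from the tutorial code):

// NullWritable.get() returns the singleton instance; it serializes to zero bytes,
// so each output line contains only the value.
context.write(NullWritable.get(), value);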
3. Writable: the data type used for the value in a <key,value> pair must implement the Writable interface
write() serializes the object's fields to the output stream
readFields() deserializes the object's fields from the input stream bytes
4. WritableComparable: keys are sorted, so the data type used for the key must implement the WritableComparable interface
5. Override toString(), equals(), and hashCode()
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class UserWritable implements Writable {
private int id;
private String name;
// a no-arg constructor is required: Hadoop instantiates Writables via reflection during deserialization
public UserWritable() {
}
public UserWritable(int id, String name) {
this.set(id,name);
}
public void set(int id, String name) {
this.id = id;
this.name = name;
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(id);
dataOutput.writeUTF(name);
}
public void readFields(DataInput dataInput) throws IOException {
this.id = dataInput.readInt();
this.name = dataInput.readUTF();
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
@Override
public String toString() {
return "UserWritable{" +
"id=" + id +
", name='" + name + '\'' +
'}';
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
UserWritable that = (UserWritable) o;
if (id != that.id) return false;
return name != null ? name.equals(that.name) : that.name == null;
}
@Override
public int hashCode() {
int result = id;
result = 31 * result + (name != null ? name.hashCode() : 0);
return result;
}
}
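A quick way to check that write() and readFields() agree is a round trip through in-memory streams (an illustrative test, not part of the tutorial; it relies on the no-arg constructor above):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class UserWritableRoundTrip {
    public static void main(String[] args) throws IOException {
        UserWritable original = new UserWritable(1, "kfk");

        // serialize with write()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize with readFields() into a fresh instance
        UserWritable copy = new UserWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);                  // UserWritable{id=1, name='kfk'}
        System.out.println(original.equals(copy)); // true
    }
}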
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderWritable implements WritableComparable<OrderWritable> {
private String orderId;
private float price;
// a no-arg constructor is required: Hadoop instantiates Writables via reflection during deserialization
public OrderWritable() {
}
public OrderWritable(String orderId, float price) {
this.set(orderId,price);
}
public void set(String orderId, float price) {
this.orderId = orderId;
this.price = price;
}
public int compareTo(OrderWritable o) {
int compare = this.getOrderId().compareTo(o.getOrderId());
if(0 == compare){
compare = Float.compare(price, o.getPrice());
}
return compare;
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(orderId);
dataOutput.writeFloat(price);
}
public void readFields(DataInput dataInput) throws IOException {
// read fields in the same order and with the same types they were written
this.orderId = dataInput.readUTF();
this.price = dataInput.readFloat();
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
}
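To see the two-level ordering that compareTo() defines (first by orderId, then by price as the tie-breaker), a small hypothetical check:

public class OrderWritableDemo {
    public static void main(String[] args) {
        OrderWritable a = new OrderWritable("1001", 99.0f);
        OrderWritable b = new OrderWritable("1001", 150.0f);
        OrderWritable c = new OrderWritable("1002", 10.0f);

        System.out.println(a.compareTo(b) < 0); // true: same orderId, lower price sorts first
        System.out.println(b.compareTo(c) < 0); // true: "1001" sorts before "1002", price ignored
    }
}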
Section 15: How MR runs on YARN
1. To run an MR program on YARN, it must be packaged as a jar.
Reason: