MR Template Optimization
Key changes: the driver class now extends Configured and implements Tool, and the job is launched through ToolRunner:
public class WordCountUpMR extends Configured implements Tool
int status = ToolRunner.run(configuration, new WordCountUpMR(), args);

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.util.List;
// Extend Configured and implement Tool so that ToolRunner can inject the Configuration
// and parse generic options such as -D property=value
public class WordCountUpMR extends Configured implements Tool{
/**
* Mapper: splits each line into words and emits <word, 1>
*/
public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private Text mapOutputKey = new Text();
private IntWritable mapOutputValue = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
System.out.println("keyIn:"+key +" ValueIn:"+value);
String lineValue = value.toString();
String[] strs = lineValue.split(" ");
for(String str : strs){
mapOutputKey.set(str);
context.write(mapOutputKey,mapOutputValue);
}
}
}
/**
* Reducer: sums the 1s for each word and emits <word, count>
*/
public static class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable outputValue = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
List<IntWritable> list = Lists.newArrayList(values);
System.out.println("keyIn:"+key +" ValueIn:"+list);
int sum = 0;
for(IntWritable value : list){
sum +=value.get();
}
outputValue.set(sum);
context.write(key,outputValue);
}
}
/**
* run
* @param args
* @return
* @throws Exception
*/
public int run(String[] args) throws Exception {
//driver
//1) get conf
Configuration configuration = this.getConf();
//2) create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
job.setJarByClass(this.getClass());
//3.1) input
Path path = new Path(args[0]);
FileInputFormat.addInputPath(job, path);
//3.2) map
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
//3.3) reduce
job.setReducerClass(WordCountReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//3.4) output
Path output = new Path(args[1]);
FileOutputFormat.setOutputPath(job, output);
//4) commit
boolean isSuc = job.waitForCompletion(true);
return (isSuc) ? 0 : 1;
}
public static void main(String[] args) {
// NOTE: these hardcoded paths override whatever is passed on the command line
args = new String[]{
"hdfs://bigdata-pro-m01.kfk.com:9000/user/kfk/datas/wordcount.txt",
"hdfs://bigdata-pro-m01.kfk.com:9000/user/kfk/mr/output"
};
Configuration configuration = new Configuration();
try {
// check first: if the output directory already exists, delete it (MR refuses to overwrite existing output)
Path fileOutPath = new Path(args[1]);
// resolve the FileSystem from the path's URI so the hdfs:// scheme is honored
FileSystem fileSystem = fileOutPath.getFileSystem(configuration);
if(fileSystem.exists(fileOutPath)){
fileSystem.delete(fileOutPath,true);
}
//int status = wordCountMR.run(args);
int status = ToolRunner.run(configuration,new WordCountUpMR(),args);
System.exit(status);
} catch (Exception e) {
e.printStackTrace();
}
}
}
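Because the job is submitted through ToolRunner, Hadoop's GenericOptionsParser handles standard options such as -D property=value automatically. Once the hardcoded args in main() are removed, properties could be overridden at launch time without code changes, e.g. (jar name and paths hypothetical): bin/hadoop jar wordcountup.jar WordCountUpMR -D mapreduce.job.reduces=2 /user/kfk/datas/wordcount.txt /user/kfk/mr/output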
Section 13: The MR execution flow, explained through the WordCount program
input -> map -> reduce -> output
map input -> <0,hadoop spark>
map output -> <hadoop,1><spark,1>
reduce input -> <hadoop,List(1,1)>
reduce output -> <hadoop,2>
A few key points to know:
Point 1:
By default, the map input <key,value> pair is:
key: the byte offset of the line within the file
value: the text of one line in the file
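For example, for a two-line input file containing "hadoop spark" and "hadoop java", map is called twice: with <0, hadoop spark> and then with <13, hadoop java>, since the second line starts at byte offset 13 (the 12 bytes of "hadoop spark" plus the newline).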
Point 2:
map -> partition -> sort -> group -> reduce
*Partition:
decides which reduce task each map output <key,value> pair is sent to (see the Partitioner sketch after this list)
*Sort:
map output <key,value> pairs are sorted by key before they reach the reducer
*Group:
values with the same key are combined together into a List
eg: (keyIn:spark ValueIn:[1, 1, 1, 1])
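As a sketch of what partitioning does: the default HashPartitioner routes each key by its hash, so identical keys always reach the same reduce task. A custom Partitioner with the same logic would look like this (the class name WordPartitioner is hypothetical, not part of the tutorial code):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Same logic as Hadoop's default HashPartitioner: mask off the sign bit,
// then take the hash modulo the number of reduce tasks.
public class WordPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

It would be enabled in the driver with job.setPartitionerClass(WordPartitioner.class).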
Point 3:
By default, reduce writes each key and value as one line of output,
with a tab character (\t) separating the key from the value; the sketch below shows how to change the separator.
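If a separator other than tab is wanted, TextOutputFormat reads its separator from a configuration property, so a single line in the driver changes it (a sketch, assuming the Hadoop 2.x property name):

// In run(), before creating the Job: output lines become "key,value" instead of "key\tvalue"
configuration.set("mapreduce.output.textoutputformat.separator", ",");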
**********
Section 14: Data types
1. Every data type in MR must implement the Writable interface, so that data of these types can be serialized for network transfer and file storage.
2. Basic MR data types:
BooleanWritable: boolean
ByteWritable
DoubleWritable
FloatWritable
The most commonly used types:
IntWritable
LongWritable
Text: stores text in UTF-8 format
NullWritable: used when the key or the value in a <key,value> pair is empty (see the snippet below)
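For example, inside a Mapper's map() method, value-only output can be produced like this (hypothetical fragment, not from the tutorial code):

// NullWritable.get() returns the singleton instance; it serializes to zero bytes,
// so each output line contains only the value.
context.write(NullWritable.get(), value);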
3. Writable: the data type used for the value in a <key,value> pair must implement the Writable interface
write() serializes the object's fields to the output stream
readFields() deserializes the object's fields from the input stream bytes
4. WritableComparable: keys are sorted, so the data type used for the key must implement the WritableComparable interface
5. Override toString(), equals(), and hashCode()
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class UserWritable implements Writable {
private int id;
private String name;
// a no-arg constructor is required: Hadoop instantiates Writables via reflection during deserialization
public UserWritable() {
}
public UserWritable(int id, String name) {
this.set(id,name);
}
public void set(int id, String name) {
this.id = id;
this.name = name;
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(id);
dataOutput.writeUTF(name);
}
public void readFields(DataInput dataInput) throws IOException {
this.id = dataInput.readInt();
this.name = dataInput.readUTF();
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
@Override
public String toString() {
return "UserWritable{" +
"id=" + id +
", name='" + name + '\'' +
'}';
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
UserWritable that = (UserWritable) o;
if (id != that.id) return false;
return name != null ? name.equals(that.name) : that.name == null;
}
@Override
public int hashCode() {
int result = id;
result = 31 * result + (name != null ? name.hashCode() : 0);
return result;
}
}
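A quick way to check that write() and readFields() agree is a round trip through in-memory streams (an illustrative test, not part of the tutorial; it relies on the no-arg constructor above):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class UserWritableRoundTrip {
    public static void main(String[] args) throws IOException {
        UserWritable original = new UserWritable(1, "kfk");

        // serialize with write()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize with readFields() into a fresh instance
        UserWritable copy = new UserWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);                  // UserWritable{id=1, name='kfk'}
        System.out.println(original.equals(copy)); // true
    }
}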
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class OrderWritable implements WritableComparable<OrderWritable> {
private String orderId;
private float price;
// a no-arg constructor is required: Hadoop instantiates Writables via reflection during deserialization
public OrderWritable() {
}
public OrderWritable(String orderId, float price) {
this.set(orderId,price);
}
public void set(String orderId, float price) {
this.orderId = orderId;
this.price = price;
}
public int compareTo(OrderWritable o) {
int compare = this.getOrderId().compareTo(o.getOrderId());
if(0 == compare){
compare = Float.compare(price, o.getPrice());
}
return compare;
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(orderId);
dataOutput.writeFloat(price);
}
public void readFields(DataInput dataInput) throws IOException {
// read fields in the same order and with the same types they were written
this.orderId = dataInput.readUTF();
this.price = dataInput.readFloat();
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
}
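To see the two-level ordering that compareTo() defines (first by orderId, then by price as the tie-breaker), a small hypothetical check:

public class OrderWritableDemo {
    public static void main(String[] args) {
        OrderWritable a = new OrderWritable("1001", 99.0f);
        OrderWritable b = new OrderWritable("1001", 150.0f);
        OrderWritable c = new OrderWritable("1002", 10.0f);

        System.out.println(a.compareTo(b) < 0); // true: same orderId, lower price sorts first
        System.out.println(b.compareTo(c) < 0); // true: "1001" sorts before "1002", price ignored
    }
}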
Section 15: How MR runs on YARN
1. To run an MR program on YARN, it must be packaged as a jar.
Reason: