MapReduce I

MapReduce

MR: a programming model. Input records are transformed by a map function into key/value pairs, which are shuffled by key and aggregated by a reduce function.

WordCountMR

1. Write the Mapper
package com.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * WordCountMapper: emits (word, 1) for every word in the input line.
 * The input key is the byte offset of the line; the input value is the line text.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(" ");

        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable(1);
        for (String word : arr) {
            keyOut.set(word);
            context.write(keyOut, valueOut);
        }
    }
}
2. Write the Reducer
package com.hadoop.mr;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * WordCountReducer: sums the 1s emitted for each word and writes (word, count).
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}
3. Write the driver (App)
package com.hadoop.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * App: configures and submits the WordCount job.
 */
public class App {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJobName("WordCount");
        job.setJarByClass(App.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // add the input path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // set the output path (must not exist yet)
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // declare the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // declare the final (reduce) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // two reduce tasks, so two output files
        job.setNumReduceTasks(2);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4. Run

5. Export the jar and run it on the Hadoop cluster
    5.1) Export the jar
    5.2) Deploy it to CentOS
    5.3) Start the YARN cluster
        start-yarn.sh

    5.4) Check the YARN web UI
        http://s100:8088/

    5.5) Prepare the data
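        For example, upload the input file to HDFS (assuming the 1.txt referenced in step 5.6):
        hadoop fs -put 1.txt /user/centos/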

    5.6) Run the job
        hadoop jar my-hadoop-day04.jar com.hadoop.mr.App /user/centos/1.txt /user/centos/out

Combiner

A map-side "reduce" that pre-aggregates map output before the shuffle, reducing network traffic. It aggregates each partition of every map task's output.
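
For WordCount the reduce logic is associative and commutative, so the existing reducer can be reused as the combiner; a minimal sketch (one extra line in the App driver above):

    // map-side pre-aggregation: run WordCountReducer on each map's output
    job.setCombinerClass(WordCountReducer.class);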

Mapper

The Mapper run() template method (essentially as implemented in Hadoop):

public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        cleanup(context);
    }
}

Reducer

The Reducer run() template method (simplified from the Hadoop implementation):

public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKey()) {
            reduce(context.getCurrentKey(), context.getValues(), context);
        }
    } finally {
        cleanup(context);
    }
}

Job staging directory (local runner), e.g.:
file:/tmp/hadoop-Administrator/mapred/staging/Administrator897294152/.staging

splitSize

Derived from minSplitSize, maxSplitSize, and blockSize:

    splitSize = max(minSplitSize, min(maxSplitSize, blockSize))

min = 7
max = 7
block = 32M
splitSize = max(7, min(7, 32M)) = 7

24 / 7 = 3.43, so a 24-byte file yields 4 splits (7 + 7 + 7 + 3 bytes).

Sample input file (24 bytes):
hello world
h#ello world

This exercises the split computation rule.
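
FileInputFormat derives the split size with a protected helper; a paraphrase of its logic:

    // splitSize = max(minSize, min(maxSize, blockSize))
    protected long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }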

1. Change the split min/max
    job.getConfiguration().set("mapreduce.input.fileinputformat.split.minsize", "14");
    job.getConfiguration().set("mapreduce.input.fileinputformat.split.maxsize", "14");

    // equivalent helper methods:
    // FileInputFormat.setMinInputSplitSize(job, 7);
    // FileInputFormat.setMaxInputSplitSize(job, 7);

MultipleInputs

Multiple inputs: one job can read several paths, each with its own InputFormat and Mapper.
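
A minimal sketch, assuming a second hypothetical OtherMapper and two hypothetical input paths (both mappers must emit the same key/value types):

    // each path gets its own InputFormat and Mapper
    MultipleInputs.addInputPath(job, new Path("/data/a.txt"),
            TextInputFormat.class, WordCountMapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/b.seq"),
            SequenceFileInputFormat.class, OtherMapper.class);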

DBInputFormat

count           // total row count from the count query, e.g. 100
int chunks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);        // number of map tasks, e.g. 3
count / chunks  // rows per split: 100 / 3 = 33; the last split is extended to the full count

1. Implement a DBWritable class
package com.hadoop.mr.input.db;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * MyDBWritable: maps one row (id, orderno, price, cid) of a database table.
 * Implements DBWritable for JDBC I/O and Writable for Hadoop serialization.
 */
public class MyDBWritable implements DBWritable, Writable {
    public int id;
    public String orderno;
    public float price;
    public int cid;

    // write back to the database; empty because this job only reads
    public void write(PreparedStatement statement) throws SQLException {
    }

    // read one row from the query's ResultSet
    public void readFields(ResultSet rs) throws SQLException {
        this.id = rs.getInt("id");
        this.orderno = rs.getString("orderno");
        this.price = rs.getFloat("price");
        this.cid = rs.getInt("cid");
    }

    // serialize
    public void write(DataOutput out) throws IOException {
        out.writeInt(id);
        out.writeUTF(orderno);
        out.writeFloat(price);
        out.writeInt(cid);
    }

    // deserialize
    public void readFields(DataInput in) throws IOException {
        this.id = in.readInt();
        this.orderno = in.readUTF();
        this.price = in.readFloat();
        this.cid = in.readInt();
    }
}
2. Subclass DBInputFormat to customize the splits
package com.hadoop.mr.input.db;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;

import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

/**
 * MyDBInputFormat: overrides getSplits() to control how the table is chunked.
 * The body follows DBInputFormat.getSplits(), with the chunk count hard-coded to 3.
 */
public class MyDBInputFormat extends DBInputFormat<MyDBWritable> {
    public void setConf(Configuration conf) {
        super.setConf(conf);
    }

    public List<InputSplit> getSplits(JobContext job) throws IOException {

        ResultSet results = null;
        Statement statement = null;
        try {
            // connection is private in DBInputFormat, so go through getConnection()
            statement = getConnection().createStatement();

            // run the COUNT query to learn the total number of rows
            results = statement.executeQuery(getCountQuery());
            results.next();

            long count = results.getLong(1);
            int chunks = 3; // instead of job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1)
            long chunkSize = (count / chunks);

            results.close();
            statement.close();

            List<InputSplit> splits = new ArrayList<InputSplit>();

            // Split the rows into n-number of chunks and adjust the last chunk
            // accordingly
            for (int i = 0; i < chunks; i++) {
                DBInputSplit split;

                if ((i + 1) == chunks)
                    split = new DBInputSplit(i * chunkSize, count);
                else
                    split = new DBInputSplit(i * chunkSize, (i * chunkSize) + chunkSize);

                splits.add(split);
            }

            getConnection().commit();
            return splits;
        } catch (SQLException e) {
            throw new IOException("Got SQLException", e);
        } finally {
            try {
                if (results != null) {
                    results.close();
                }
            } catch (SQLException e1) {
            }
            try {
                if (statement != null) {
                    statement.close();
                }
            } catch (SQLException e1) {
            }

            closeConnection();
        }
    }
}
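
The driver also has to describe the connection and the query. A minimal sketch, assuming a hypothetical local MySQL database mydb with an orders(id, orderno, price, cid) table; the driver class, URL, and credentials are placeholders:

    // register JDBC driver, URL, and credentials (placeholders)
    DBConfiguration.configureDB(job.getConfiguration(), "com.mysql.jdbc.Driver",
            "jdbc:mysql://localhost:3306/mydb", "user", "password");
    // describe table, WHERE conditions, ORDER BY, and columns
    DBInputFormat.setInput(job, MyDBWritable.class,
            "orders", null, "id",
            "id", "orderno", "price", "cid");
    // setInput() installs plain DBInputFormat, so override it afterwards
    job.setInputFormatClass(MyDBInputFormat.class);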

Output formats

1.TextOutputFormat
    the default format
2.SequenceFileOutputFormat
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
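
    Optionally, sequence-file output can be compressed; a sketch using the standard helper (BLOCK compresses runs of records together):
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);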

MR features

1. Counters
    A debugging tool; acts as a lightweight remote-debugging aid for tasks running on the cluster.

2. Counter limit
    [mapred-site.xml]
    mapreduce.job.counters.limit=120

3. Counter name length limit (64 characters, hard-coded)
    e.g. 192.168.11.113:13932:pool-3-thread-1:DBReduce@546740333:reduce()
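
A minimal sketch of incrementing a custom counter from inside a task (the group and counter names are arbitrary strings, subject to the limits above):

    // in map() or reduce(): count how often this code path runs;
    // totals appear in the job's counter report and the web UI
    context.getCounter("debug", "reduce.calls").increment(1);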