Requirement
Read the rows of the stu table, count how many students there are of each age (the equivalent of select age, count(*) from stu group by age), and write the result into the stu_res table.
CREATE TABLE `stu` (
`id` int(11) NOT NULL DEFAULT '0',
`student_id` int(11) DEFAULT NULL,
`name` varchar(45) DEFAULT NULL,
`age` int(1) DEFAULT NULL,
`sex` varchar(1) DEFAULT NULL,
`birthday` varchar(45) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Data:
INSERT INTO `stu` VALUES ('1', '1', 'zs', '12', '1', '2019-07-17');
INSERT INTO `stu` VALUES ('2', '2', 'goudan', '13', '1', '2019-07-17');
INSERT INTO `stu` VALUES ('3', '3', 'gg', '12', '2', '2019-07-17');
INSERT INTO `stu` VALUES ('4', '4', 'hh', '12', '2', '2019-07-17');
INSERT INTO `stu` VALUES ('5', '5', 'mm', '13', '1', '2019-07-17');
INSERT INTO `stu` VALUES ('6', '6', 'tt', '12', '1', '2019-07-17');
Result table:
CREATE TABLE `stu_res` (
`age` int(11) NOT NULL DEFAULT '0',
`cnt` int(11) DEFAULT 0
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Result data (age cnt):
12 4
13 2
The Student class
Since the result only needs the age, the other columns are left out of the bean. Note that BeanListHandler (used below) instantiates the class through its no-argument constructor and fills in whatever properties match the column names, so the remaining columns are simply ignored.
public class Student {
private int id;
private int age;
public Student(int id, int age) {
this.id = id;
this.age = age;
}
public Student() {
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
@Override
public String toString() {
return this.id+" "+this.age;
}
}
Utils (connects to the database through c3p0)
import com.mchange.v2.c3p0.ComboPooledDataSource;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import java.sql.SQLException;
import java.util.List;
public class Utils {
private static ComboPooledDataSource dataSource;
static{
dataSource = new ComboPooledDataSource();
}
public static QueryRunner getQueryRunner(){
return new QueryRunner(dataSource);
}
public static void main(String[] args) throws SQLException {
QueryRunner queryRunner = getQueryRunner();
String sql = "select * from stu";
List<Student> query = queryRunner.query(sql, new BeanListHandler<Student>(Student.class));
System.out.println(query.size());
System.out.println(query); // smoke test: print the rows themselves, not the QueryRunner object
}
}
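The no-argument ComboPooledDataSource reads its JDBC settings from a c3p0-config.xml or c3p0.properties file on the classpath. If you would rather configure it in code, the static block above could look like the following sketch; the driver class, URL, user and password here are placeholders, not values taken from the original setup.
static {
    dataSource = new ComboPooledDataSource();
    try {
        // placeholder connection settings: point these at your own MySQL instance
        dataSource.setDriverClass("com.mysql.jdbc.Driver");
        dataSource.setJdbcUrl("jdbc:mysql://localhost:3306/test");
        dataSource.setUser("root");
        dataSource.setPassword("root");
    } catch (java.beans.PropertyVetoException e) {
        throw new ExceptionInInitializerError(e);
    }
}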
The driver class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
import java.io.IOException;
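// note: the class is still named WordCount, but the job counts students per age, not words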
public class WordCount {
public static class MyMapper extends Mapper<LongWritable, Text,Text,IntWritable>{
IntWritable iw = new IntWritable(1);
Text text = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
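// the record reader emits "id age" (see Student.toString()), so after the split datas[1] is the age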
String[] datas = line.split(" ");
text.set(datas[1]);
context.write(text,iw);
}
}
//the default input format would be TextInputFormat; main() below swaps in MySqlInputFormat/MySqlOutputFormat
public static class MyReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
IntWritable iw = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int num = 0;
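// sum the 1s the mappers emitted for this age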
for(IntWritable value:values){
num+=value.get();
}
iw.set(num);
context.write(key,iw);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
BasicConfigurator.configure();
Job job = Job.getInstance(conf);
job.setJarByClass(WordCount.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(MySqlInputFormat.class);
job.setOutputFormatClass(MySqlOutputFormat.class);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
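Once the job has run, stu_res can be checked with the same DbUtils helper. A minimal sketch; the CheckResult class name is made up for illustration (MapListHandler ships with commons-dbutils):
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.MapListHandler;
import java.sql.SQLException;
import java.util.Map;
public class CheckResult {
    public static void main(String[] args) throws SQLException {
        QueryRunner runner = Utils.getQueryRunner();
        // print every (age, cnt) row the job wrote
        for (Map<String, Object> row : runner.query(
                "select age, cnt from stu_res order by age", new MapListHandler())) {
            System.out.println(row.get("age") + "\t" + row.get("cnt"));
        }
    }
}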
The custom MySqlInputFormat
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.BeanListHandler;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.*;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
public class MySqlInputFormat extends InputFormat <LongWritable, Text>{
public MySqlInputFormat(){
}
public static class MySqlInputSplit extends InputSplit implements Writable{
private long start;
private long end;
public MySqlInputSplit(){
}
public MySqlInputSplit(long start, long end) {
this.start = start;
this.end = end;
}
public long getStart() {
return start;
}
public void setStart(long start) {
this.start = start;
}
public long getEnd() {
return end;
}
public void setEnd(long end) {
this.end = end;
}
public long getLength() throws IOException, InterruptedException {
return this.end-this.start;
}
public String[] getLocations() throws IOException, InterruptedException {
return new String[0];
}
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeLong(start);
dataOutput.writeLong(end);
}
public void readFields(DataInput dataInput) throws IOException {
this.start=dataInput.readLong();
this.end=dataInput.readLong();
}
}
public List<InputSplit> getSplits(JobContext context) {
List<InputSplit> list = new ArrayList<InputSplit>();
long count = 0;
QueryRunner queryRunner = Utils.getQueryRunner();
try {
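// counting by materializing every row is fine for this demo; on a large table,
// "select count(*)" with a ScalarHandler would avoid loading all the rows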
count = queryRunner.query("select * from stu",new BeanListHandler<Student>(Student.class)).size();
} catch (SQLException e) {
e.printStackTrace();
}
long chunk = 2; // rows per split
// ceiling division: a trailing partial chunk (or a table smaller than one chunk) still gets its own split
long chunksize = (count + chunk - 1) / chunk;
// Slice the data: one split per map task, each covering chunk rows (the last split may hold fewer).
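// With the sample data: count = 6, chunk = 2 -> chunksize = 3, producing the splits
// [0,2), [2,4) and [4,6); each map task then reads exactly two rows via LIMIT.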
for(int i = 0;i<chunksize;i++){
MySqlInputSplit mi = null;
if(i+1==chunksize){
mi = new MySqlInputSplit(i*chunk,count);
list.add(mi);
}else{
mi = new MySqlInputSplit(i*chunk,i*chunk+chunk);
list.add(mi);
}
}
//return the list of splits, one per map task.
return list;
}
public static class MySqlRecordReader extends RecordReader<LongWritable, Text>{
private MySqlInputSplit split;
//iterator over the rows fetched from MySQL for this split
private Iterator<Student> dbcursor;
//row index within the split, starting at 0, so the reader only walks its own slice
private int index;
private LongWritable k; //the key handed to the mapper: the row's global offset (split start + index); matches the mapper's first type parameter.
private Text v; //the value handed to the mapper: one row rendered via Student.toString(); matches the mapper's second type parameter.
public MySqlRecordReader(InputSplit split,TaskAttemptContext context) throws IOException, InterruptedException{
super();
initialize(split,context);
}
public MySqlRecordReader(){
}
//initialize: new up the key/value holders and downcast the single split assigned to this reader.
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
this.split = (MySqlInputSplit)split;
this.k = new LongWritable();
v = new Text();
}
//fetch the rows lazily on the first call, then load the next row into this reader's k and v.
public boolean nextKeyValue() throws IOException, InterruptedException {
if(this.dbcursor == null) {
QueryRunner runner = Utils.getQueryRunner();
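// Note: LIMIT offset,count without an ORDER BY does not guarantee a stable row order
// across queries; adding "order by id" would make the splits deterministic.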
String sql = "select * from stu limit " + this.split.getStart() + "," + this.split.getLength();
try {
dbcursor = runner.query(sql, new BeanListHandler<Student>(Student.class)).iterator();
} catch (SQLException e) {
e.printStackTrace();
}
}
boolean hasNext = this.dbcursor.hasNext();
if(hasNext){
//advance the cursor to the next row
Student stu = this.dbcursor.next();
this.k.set(this.split.getStart()+index);
index ++;
this.v.set(stu.toString());
}
return hasNext;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return this.k;
}
public Text getCurrentValue() throws IOException, InterruptedException {
return this.v;
}
public float getProgress() throws IOException, InterruptedException {
// fraction of this split's rows already read
return this.split.getLength() == 0 ? 0f : index / (float) this.split.getLength();
}
public void close() throws IOException {
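// nothing to release here: DbUtils closes the pooled connection after each query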
}
}
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
return new MySqlRecordReader(split,context);
}
}
The custom MySqlOutputFormat
import org.apache.commons.dbutils.QueryRunner;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import java.io.IOException;
import java.sql.SQLException;
public class MySqlOutputFormat <K,V> extends OutputFormat<K,V> {
public static class MySqlRecordWriter<K,V> extends RecordWriter<K, V>{
public QueryRunner runner = null;
public MySqlRecordWriter(){
}
public MySqlRecordWriter(TaskAttemptContext context){
//obtain the pooled MySQL connection helper
runner = Utils.getQueryRunner();
}
public void write(K key, V value) throws IOException, InterruptedException {
try {
runner.update("insert into stu_res(age,cnt) values (?,?)",Integer.parseInt(key.toString()),Integer.parseInt(value.toString()));
} catch (SQLException e) {
e.printStackTrace();
}
}
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
}
}
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
return new MySqlRecordWriter<K, V>(context);
}
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
}
public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
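// there is no filesystem output to commit, and FileOutputCommitter tolerates a null
// output path, so this effectively acts as a no-op committer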
return new FileOutputCommitter(null, context);
}
}
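checkOutputSpecs is left empty above, so every run appends to stu_res and a re-run produces duplicate rows. One possible guard (an addition, not part of the original code) is to clear the table in that hook:
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    try {
        // wipe earlier results so a re-run does not accumulate duplicates
        Utils.getQueryRunner().update("delete from stu_res");
    } catch (SQLException e) {
        throw new IOException("failed to clear stu_res", e);
    }
}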