Big Data Learning: Batch Import with HBase BulkLoad

Advantages:

  1. Loading a huge volume of data into HBase in one go is not only slow but also heavily taxes Region resources. A more efficient and convenient approach is "Bulk Loading", i.e. the HFileOutputFormat class provided by HBase.

  2. It exploits the fact that HBase stores its data on HDFS in a specific file format: the job generates files directly in that on-HDFS format and then loads them into the right place, which is how massive data sets can be imported quickly. Done with MapReduce, it is efficient and convenient, and it neither occupies Region resources nor adds load to them.

Limitations:

  1. It is only suitable for an initial data import, i.e. the table is empty, or the table contains no data before each import.

  2. The HBase cluster and the Hadoop cluster must be the same cluster, i.e. the HDFS that HBase runs on is the same HDFS used by the MapReduce job that generates the HFiles.

Writing the code:

Create the table in HBase ahead of time; a minimal Java sketch for doing this follows.
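
If the table does not already exist, it can be created in the HBase shell with create 'dianxin_bulk', 'info' (as noted in the run steps later) or programmatically. The sketch below is only an illustration, assuming the HBase 1.x client API used in this post and the same table name, column family, and ZooKeeper quorum as the main example:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateBulkTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Assumed ZooKeeper quorum; adjust to your own cluster
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin()) {
            TableName name = TableName.valueOf("dianxin_bulk");
            if (!admin.tableExists(name)) {
                // One column family "info", matching the KeyValues built in the Mapper below
                HTableDescriptor desc = new HTableDescriptor(name);
                desc.addFamily(new HColumnDescriptor("info"));
                admin.createTable(desc);
            }
        }
    }
}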

Basic flow for generating the HFiles:

1. Set the Mapper's output key/value types

K: ImmutableBytesWritable, representing the row key

V: KeyValue, representing a cell value

2. Write the Mapper

Read the source data and process it according to your requirements;

emit the rowkey as K and the corresponding KeyValue (or Put) objects as V.

3. Configure the job parameters

the ZooKeeper connection address;

set the OutputFormat to HFileOutputFormat2 and configure its parameters.

4. Submit the job

Flow for loading the HFiles into the RegionServers:

Build a Table object

Build a RegionLocator

Then call doBulkLoad on a LoadIncrementalHFiles instance

package bolkloading;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Set the Mapper's output key/value types
 **/

class BulkLoadingMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
    //We extend Mapper, so override map() with our own logic
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Split the record; the delimiter here is a tab (\t)
        String[] split = value.toString().split("\t");
        //Only process records that contain all eight fields
        if (split.length > 7) {
            String phoneName = split[0];
            String wgId = split[1];
            String city = split[2];
            String qxId = split[3];
            String waitTime = split[4];
            String startTime = split[5];
            String endTime = split[6];
            String dataTime = split[7];

            //Design the row key: join the phone id and grid id with an underscore so it can be split later
            String rowkey = phoneName + "_" + wgId;

            //Wrap each cell as a KeyValue (row key, column family, qualifier, value)
            KeyValue wgId2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "wgId".getBytes(), wgId.getBytes());
            KeyValue city2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "city".getBytes(), city.getBytes());
            KeyValue quId2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "quId".getBytes(), qxId.getBytes());
            KeyValue waitTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "waitTime".getBytes(), waitTime.getBytes());
            KeyValue startTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "startTime".getBytes(), startTime.getBytes());
            KeyValue endTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "endTime".getBytes(), endTime.getBytes());
            KeyValue dataTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "dataTime".getBytes(), dataTime.getBytes());

            //Both the rowkey and the values are ready; context.write() requires the rowkey as an ImmutableBytesWritable, so wrap it
            ImmutableBytesWritable rowkey2 = new ImmutableBytesWritable(rowkey.getBytes());

            context.write(rowkey2, wgId2);
            context.write(rowkey2, city2);
            context.write(rowkey2, quId2);
            context.write(rowkey2, waitTime2);
            context.write(rowkey2, startTime2);
            context.write(rowkey2, endTime2);
            context.write(rowkey2, dataTime2);
        }
    }
}

/**
 * Configure the job parameters.
 * rowkey2 is the row key and wgId2 etc. are the cells; the HBase table's column family is "info",
 * so the KeyValues must use that same family, otherwise they cannot be loaded.
 **/

public class BolkLoad {
    public static void main(String[] args) throws Exception {
        //1. Create the configuration. Since the HBase and Hadoop clusters are the same, either an HBase or a Hadoop
        //   configuration would work; we use the HBase one, which is convenient for generating HFiles.
        Configuration conf = HBaseConfiguration.create();
        //2. Set the ZooKeeper quorum
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
        //3. Create the job (pass conf so the ZooKeeper setting actually takes effect)
        Job job = Job.getInstance(conf);
        //Set the job name
        job.setJobName("BolkLoad dianxin_data");
        //Set the job's main class
        job.setJarByClass(BolkLoad.class);

        //Note: the number of reduce tasks can be set here, but it will not take effect;
        //the number of reducers is determined by the number of regions.
        job.setNumReduceTasks(4);

        //4. Configure the map side
        //Set the Mapper class
        job.setMapperClass(BulkLoadingMapper.class);
        //Set the map output key/value types
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        //5. The map output is not globally ordered, so use a total-order partitioner
        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);

        //6. Configure the reducer so that every partition is sorted
        job.setReducerClass(KeyValueSortReducer.class);
        //7. Input path: where the dianxin_data file sits on HDFS
        FileInputFormat.addInputPath(job, new Path("/data/DIANXIN/input/dianxin_data"));
        //7.1 Output path for the generated HFiles
        FileOutputFormat.setOutputPath(job, new Path("/data/DIANXIN/output/dianxin_data"));

        //Get the table: create a Connection, call getAdmin() on it for master operations, then getTable() with a
        //TableName.valueOf(...) table name; getRegionLocator() gives the locations of the table's regions.
        Connection conn = ConnectionFactory.createConnection(conf);
        Admin admin = conn.getAdmin();
        Table dianxin_bulk = conn.getTable(TableName.valueOf("dianxin_bulk"));
        RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf("dianxin_bulk"));

        //Configure HFileOutputFormat2 to generate the HFiles; the arguments are (the job, the target Table, the RegionLocator)
        HFileOutputFormat2.configureIncrementalLoad(job, dianxin_bulk, regionLocator);
        System.out.println("=========== HFile output configured for HBase table: dianxin_bulk ===========");

        //8. Run the MapReduce job
        boolean b = job.waitForCompletion(true);
        //The return value tells whether the job succeeded, so check it
        if (b) {
            System.out.println("HFiles generated successfully; loading them into HBase");
            //8.1 Load the HFiles into HBase
            LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
            //8.2 doBulkLoad takes four arguments: (output path, Admin, Table, RegionLocator)
            load.doBulkLoad(new Path("/data/DIANXIN/output/dianxin_data"), admin, dianxin_bulk, regionLocator);
            System.out.println("HFiles loaded successfully");
        } else {
            System.out.println("HFile generation failed");
        }

        //========= The code above is the whole import; the steps below prepare and run it on the cluster =========
        /*
         * 1. Upload the input file to HDFS: /data/DIANXIN/input/dianxin_data
         * 2. Create the table in HBase:  create 'table', 'family'   ->   create 'dianxin_bulk', 'info'
         * 3. Package the project and upload the jar
         * 4. Run:  hadoop jar <jar-name>.jar <package-path>.<class-name>
         *          hadoop jar  ....jar   .....BolkLoad
         */
    }
}

Packaging and upload steps:

        1. Select the project and run package to build it;

        2. If the directories used in the code do not exist yet, create them;

        in the Hadoop shell run: hadoop fs -mkdir -p /data/DIANXIN/input

        then check on the Hadoop web UI at master:50070;

        3. The packaged jar is under the target/ directory; run: hadoop jar <jar-name>.jar <package-path>.<class-name>

        4. Use scan in the HBase shell to check whether the data was loaded successfully
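
As an alternative to scanning in the HBase shell, the data can also be spot-checked from Java. The sketch below is only an illustration and assumes the same table name, column family, and ZooKeeper quorum as the main example:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

public class VerifyBulkLoad {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("dianxin_bulk"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            int count = 0;
            for (Result result : scanner) {
                String row = Bytes.toString(result.getRow());
                String city = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("city")));
                System.out.println(row + " -> city=" + city);
                if (++count >= 5) {
                    break; // only print the first few rows as a sanity check
                }
            }
        }
    }
}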

Notes:

1. The final output key and value types, whether from the map or the reduce side, must be <ImmutableBytesWritable, KeyValue> or <ImmutableBytesWritable, Put>.
2. When the final output value type is KeyValue the corresponding sorter is KeyValueSortReducer; when it is Put, the sorter is PutSortReducer (see the sketch after this list).
3. In the MR example, HFileOutputFormat2.configureIncrementalLoad(job, dianxin_bulk, regionLocator); configures the job automatically. SimpleTotalOrderPartitioner first sorts the keys globally and then assigns them to the reducers, so the min/max key ranges of different reducers never overlap. This matters because, when the data is loaded into HBase, the keys within a Region must be strictly ordered.

4. The HFiles produced by the MR example are stored on HDFS, with one subdirectory per column family under the output path. Loading the HFiles into HBase is essentially a move of the HFiles into HBase's Regions, after which the column-family subdirectories under the output path are emptied. You cannot simply move the files with mv yourself, though, because that would not update HBase's metadata.

5. The HFiles are loaded into HBase through the doBulkLoad method of HBase's LoadIncrementalHFiles, which imports the generated HFile files.
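
To complement item 2: if the Mapper emits Put values instead of KeyValue, the map output value class becomes Put and the sorter becomes PutSortReducer. The sketch below is only an illustration under the same assumed field layout and table schema as the main example, not part of the original job:

import java.io.IOException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

//Emits <ImmutableBytesWritable, Put> instead of <ImmutableBytesWritable, KeyValue>
class BulkLoadingPutMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split("\t");
        if (split.length > 7) {
            String rowkey = split[0] + "_" + split[1];
            //A single Put can carry all the cells of one row
            Put put = new Put(rowkey.getBytes());
            put.addColumn("info".getBytes(), "wgId".getBytes(), split[1].getBytes());
            put.addColumn("info".getBytes(), "city".getBytes(), split[2].getBytes());
            context.write(new ImmutableBytesWritable(rowkey.getBytes()), put);
        }
    }
}
//In the driver: job.setMapOutputValueClass(Put.class);
//               job.setReducerClass(PutSortReducer.class);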

Required dependencies (parent project):

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.shujia</groupId>
    <artifactId>hadoop_bigdata</artifactId>
    <packaging>pom</packaging>
    <version>1.0-SNAPSHOT</version>

    <modules>
        <module>hadoop-hive</module>
        <module>hadoop-hbase</module>
    </modules>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <hadoop-version>2.7.6</hadoop-version>
        <junit-version>4.13.1</junit-version>
    </properties>

    <dependencyManagement>
        <dependencies>
            <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hadoop-version}</version>
            </dependency>

            <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>${hadoop-version}</version>
            </dependency>

            <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hadoop-version}</version>
            </dependency>

            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>${junit-version}</version>
            </dependency>

            <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
            <dependency>
                <groupId>com.janeluo</groupId>
                <artifactId>ikanalyzer</artifactId>
                <version>2012_u6</version>
            </dependency>

            <!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-jdbc</artifactId>
                <version>1.2.1</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-exec</artifactId>
                <version>1.2.1</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>1.4.6</version>
            </dependency>

            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-server</artifactId>
                <version>1.4.6</version>
            </dependency>

            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <version>5.1.49</version>
            </dependency>

        </dependencies>
    </dependencyManagement>

</project>

The hadoop-hbase child module:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!--    <parent>
        <artifactId>hadoop_bigdata</artifactId>
        <groupId>com.shujia</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>-->
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.shujia</groupId>
    <artifactId>hadoop-hbase</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <hadoop-version>2.7.6</hadoop-version>
        <junit-version>4.13.1</junit-version>
    </properties>


    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop-version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop-version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop-version}</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>1.2.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.4.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.4.6</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.49</version>
        </dependency>


    </dependencies>

    <build>
        <plugins>
            <!-- compiler plugin: set the JDK version -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>1.8</source>
                    <target>1.8</target>
                    <showWarnings>true</showWarnings>
                </configuration>
            </plugin>


            <!-- plugin for building a jar with dependencies -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>

    </build>

</project>
