Importing Data from HDFS into HBase with MapReduce

Preparation (a minimal sketch of the corresponding shell commands follows this list):

  1. Create a test directory in HDFS
  2. Upload the file info.txt into it
  3. The file contains the following tab-separated records:
10001	张三	90
10002	李四	60
10003	王二	78
10005	李欣	89
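
The commands below are a minimal sketch of these preparation steps. The HDFS directory /test and the table name stu with column family info match what the Mapper and Driver below expect; the local file location is an assumption.

# create the target directory in HDFS and upload the data file
hdfs dfs -mkdir -p /test
hdfs dfs -put info.txt /test/

# the table written by the job must exist before it runs
echo "create 'stu', 'info'" | hbase shell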

1 Add the required Maven dependencies (inside the <dependencies> element of pom.xml):

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>


    <!--hbase-->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>2.2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-protocol</artifactId>
      <version>2.2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-common</artifactId>
      <version>2.2.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-mapreduce -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-mapreduce</artifactId>
      <version>2.2.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-zookeeper -->
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-zookeeper</artifactId>
      <version>2.2.4</version>
    </dependency>


    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.9.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-common</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-client</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
      <version>2.9.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.9.2</version>
    </dependency>

    <dependency>
      <groupId>org.anarres.lzo</groupId>
      <artifactId>lzo-hadoop</artifactId>
      <version>1.0.6</version>
    </dependency>

2 Custom Mapper class:

package com.yuan.mapper;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class StuMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // one line of data read from HDFS
        String lineValue = value.toString();
        // split the line on the tab character into a String array
        String[] values = lineValue.split("\t");

        // pick out the fields by position
        String rowKey = values[0];
        String name = values[1];
        String grade = values[2];

        // build the row key; ImmutableBytesWritable is the usual key type for HBase MapReduce output
        ImmutableBytesWritable rowKeyWritable = new ImmutableBytesWritable(Bytes.toBytes(rowKey));

        // build the Put for this row
        Put put = new Put(Bytes.toBytes(rowKey));

        // arguments: column family, column qualifier, value
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(name));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("grade"), Bytes.toBytes(grade));

        // emit (row key, Put)
        context.write(rowKeyWritable, put);
    }

}

3 Custom Reducer class:

package com.yuan.reduce;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;

import java.io.IOException;

public class StuReduce extends TableReducer<ImmutableBytesWritable, Put, NullWritable> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
        // write every Put received from the mapper into the stu table
        for (Put put : values) {
            context.write(NullWritable.get(), put);
        }
    }
}

4 Custom Driver class:

package com.yuan.drive;


import com.yuan.mapper.StuMapper;
import com.yuan.reduce.StuReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * @author 19417
 */
public class StuRunner {
    public static void main(String[] arg) throws Exception {

        Configuration conf = HBaseConfiguration.create();
        // ZooKeeper quorum used by the HBase cluster; adjust to your environment
        conf.set("hbase.zookeeper.quorum", "192.168.220.128:2181");

        // hard-coded HDFS input path (overrides any command-line arguments); adjust to your cluster
        String[] args = {"hdfs://xmaster:9000/test/info.txt"};

        // create the job
        Job job = Job.getInstance(conf, StuRunner.class.getSimpleName());
        // set the jar by the driver class
        job.setJarByClass(StuRunner.class);

        // input: the text file in HDFS
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // configure the Mapper
        job.setMapperClass(StuMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);

        // configure the Reducer to write into the stu table
        TableMapReduceUtil.initTableReducerJob("stu", StuReduce.class, job);

        // use at least one reduce task
        job.setNumReduceTasks(1);

        boolean isSuccess = job.waitForCompletion(true);
        if (!isSuccess) {
            throw new IOException("Job running with error");
        }

        System.exit(0);
    }


}

5 Build the jar and upload it to the cluster:
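
A minimal sketch of this step, assuming a Maven build and a jar named hbase-mr-demo-1.0.jar (your artifact name, host, and paths will differ). Putting the output of hbase classpath on HADOOP_CLASSPATH lets hadoop jar find the HBase client classes:

# build the jar locally and copy it to a cluster node
mvn clean package
scp target/hbase-mr-demo-1.0.jar root@xmaster:/opt/jobs/

# on the cluster node: add the HBase jars to the classpath and run the driver class
export HADOOP_CLASSPATH=$(hbase classpath)
hadoop jar /opt/jobs/hbase-mr-demo-1.0.jar com.yuan.drive.StuRunner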

View the data:
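
For example, you can confirm the import from the HBase shell (the table name stu comes from the Driver above):

# scan the stu table to verify that the rows were written
echo "scan 'stu'" | hbase shell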

To summarize, there are two common ways to import HDFS data into HBase:

1. Using HBase's built-in tool: HBase ships with the org.apache.hadoop.hbase.mapreduce.ImportTsv utility, which can load TSV, CSV, and other delimited text data into HBase. The steps are:

(1) Convert the data to a delimited text file, for example a comma-separated (CSV) file:

id,name,age
1,张三,18
2,李四,20
3,王五,22

(2) Run the import:

$ hadoop jar /path/to/hbase.jar \
    org.apache.hadoop.hbase.mapreduce.ImportTsv \
    -Dimporttsv.separator=',' \
    -Dimporttsv.columns=HBASE_ROW_KEY,cf:name,cf:age \
    test_table \
    /path/to/data.csv

Here -Dimporttsv.separator=',' sets the field separator to a comma, and -Dimporttsv.columns=HBASE_ROW_KEY,cf:name,cf:age maps the first CSV column to the row key and the second and third columns to the name and age columns of the cf column family.

2. Using a custom MapReduce program: when the data needs custom transformation or multi-step processing, use a custom MapReduce job (as in this post) to move the data from HDFS into HBase:

(1) Write a Mapper that converts the HDFS records into HBase Puts.
(2) Write a Reducer that writes the Mapper's output key-value pairs into HBase.
(3) Configure the MapReduce job (target table name, column family, row key) and submit it to the Hadoop cluster.
(4) Wait for the job to finish and verify that the data was imported into HBase correctly.
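
One note on the ImportTsv approach: it is safest to create the destination table and column family before running the import (whether ImportTsv creates a missing table for you depends on the HBase version and settings). A minimal sketch, reusing the test_table / cf names from the example above:

# create the destination table with column family cf before running ImportTsv
echo "create 'test_table', 'cf'" | hbase shell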