大数据之hbase_将静态文件导入到hbase表中

最新推荐文章于 2022-06-13 15:57:35 发布

普罗米修斯之火

最新推荐文章于 2022-06-13 15:57:35 发布

阅读量1.2k

点赞数

分类专栏： Hbase 文章标签： hbase

本文链接：https://blog.csdn.net/WuBoooo/article/details/108310730

版权

Hbase 专栏收录该内容

8 篇文章 1 订阅

订阅专栏

shell客户端将csv静态文件导入到hbase中

使用importTsv工具
简介: ImportTsv是hbase自带的一个将csv文件转成HFile文件的工具，生成的HFile文件随后可以加载到regionserver中
它的本质，是内置的一个将csv文件转成hfile文件的mr程序！

例如将以下数据导入到hbase表中:
1,zss,M,34
2,lss,M,33
3,mby,M,29
4,zhoushen,M,24
5,dengzq,F,28

首先将该数据保存为user.csv文件,并上传到hdfs的/csv/input/目录中
执行如下语句,该语句依次指定了字段分隔符、各字段与行键/列(列族:列名)的对应关系、HFile的输出目录、目标表名和输入目录

# Run ImportTsv: split each input line on ",", map the fields to the row key and to
# cf:name, cf:gender, cf:age, write the generated HFiles to /csv/output for table
# tb_imp_user, and read the input files from /csv/input.
hbase org.apache.hadoop.hbase.mapreduce.ImportTsv  -Dimporttsv.separator=, -Dimporttsv.columns='HBASE_ROW_KEY,cf:name,cf:gender,cf:age'  -Dimporttsv.bulk.output=/csv/output tb_imp_user /csv/input

执行完后,会在/csv/output中生成一个hfile文件,再执行如下语句:

# Load the HFiles found under /csv/output into the table tb_imp_user.
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /csv/output tb_imp_user

执行完后就会将/csv/output中的hfile文件存到tb_imp_user表中

ImportTsv命令的参数说明如下：
-Dimporttsv.skip.bad.lines=false - 若遇到无效行则失败
-Dimporttsv.separator=, - 使用特定分隔符,默认是tab也就是\t
-Dimporttsv.timestamp=currentTimeAsLong - 对本次导入的数据使用指定的时间戳(不指定时默认为当前时间)
-Dimporttsv.mapper.class=my.Mapper - 使用用户自定义Mapper类替换TsvImporterMapper
-Dmapreduce.job.name=jobName - 对导入使用特定mapreduce作业名
-Dcreate.table=no - 避免创建表，注：如设为no，目标表必须已存在于HBase中
-Dno.strict=true - 忽略HBase表列族检查。默认为false
-Dimporttsv.bulk.output=/user/yarn/output - 作业的输出目录

java客户端使用MR程序将静态文件导入到hbase中

import com.google.gson.Gson;
import com.google.gson.JsonSyntaxException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;

/**
 * MapReduce job that loads JSON movie-rating records from text files into an existing HBase table.
 *
 * <p>Every input line is parsed into a {@link MovieBean}. The row key is the movie id left-padded
 * with zeros to five characters, followed by {@code _} and the record's timestamp. The four bean
 * properties are written to the {@code info} column family.
 *
 * <p>Usage: {@code LoadFileData2HbaseTable [inputPath [tableName]]}. Both arguments are optional
 * and default to {@code D:\txt\mrdata\movie\input} and {@code tb_mr_movie}.
 */
public class LoadFileData2HbaseTable {
    private static final String DEFAULT_INPUT_PATH = "D:\\txt\\mrdata\\movie\\input";
    private static final String DEFAULT_TABLE_NAME = "tb_mr_movie";

    /** Counter group under which skipped input lines are reported. */
    private static final String COUNTER_GROUP = "LoadFileData2HbaseTable";

    // Encoded once with Bytes.toBytes (UTF-8) instead of String.getBytes(), which depends on the
    // platform default charset and was re-evaluated for every record.
    private static final byte[] FAMILY = Bytes.toBytes("info");
    private static final byte[] QUALIFIER_MOVIE = Bytes.toBytes("movie");
    private static final byte[] QUALIFIER_RATE = Bytes.toBytes("rate");
    private static final byte[] QUALIFIER_TIMESTAMP = Bytes.toBytes("timeStamp");
    private static final byte[] QUALIFIER_UID = Bytes.toBytes("uid");

    /**
     * Parses each input line and emits (row key, {@link MovieBean}).
     *
     * <p>Lines that are not valid JSON, or whose bean lacks movie, timeStamp or uid, are skipped
     * and counted instead of failing the task. I/O errors while emitting are no longer swallowed.
     */
    static class LoadFileData2HbaseTableMapper extends Mapper<LongWritable, Text, Text, MovieBean> {
        private final Gson gs = new Gson();
        private final Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            MovieBean mb;
            try {
                mb = gs.fromJson(value.toString(), MovieBean.class);
            } catch (JsonSyntaxException e) {
                // Best effort: a malformed line must not abort the whole import.
                context.getCounter(COUNTER_GROUP, "MALFORMED_JSON").increment(1);
                return;
            }
            // Gson returns null for an empty line; string fields are null when absent from the JSON.
            if (mb == null || mb.getMovie() == null || mb.getTimeStamp() == null || mb.getUid() == null) {
                context.getCounter(COUNTER_GROUP, "INCOMPLETE_RECORD").increment(1);
                return;
            }
            // Row key design: zero-padded movie id + "_" + timestamp.
            String rowkey = StringUtils.leftPad(mb.getMovie(), 5, '0') + "_" + mb.getTimeStamp();
            k.set(rowkey);
            context.write(k, mb);
        }
    }

    /**
     * Turns every {@link MovieBean} received for a row key into a {@link Put} on the
     * {@code info} column family.
     */
    static class LoadFileData2HbaseTableReducer extends TableReducer<Text, MovieBean, ImmutableBytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<MovieBean> values, Context context) throws IOException, InterruptedException {
            byte[] row = Bytes.toBytes(key.toString());
            for (MovieBean mb : values) {
                Put put = new Put(row);
                put.addColumn(FAMILY, QUALIFIER_MOVIE, Bytes.toBytes(mb.getMovie()));
                // rate is stored in Bytes' binary double encoding, not as text.
                put.addColumn(FAMILY, QUALIFIER_RATE, Bytes.toBytes(mb.getRate()));
                put.addColumn(FAMILY, QUALIFIER_TIMESTAMP, Bytes.toBytes(mb.getTimeStamp()));
                put.addColumn(FAMILY, QUALIFIER_UID, Bytes.toBytes(mb.getUid()));
                // The output key is passed as null, exactly as before; only the Put is emitted.
                context.write(null, put);
            }
        }
    }

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath [tableName]]}; the target table must already exist
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String tableName = args.length > 1 ? args[1] : DEFAULT_TABLE_NAME;

        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "linux01:2181,linux02:2181,linux03:2181");

        Job job = Job.getInstance(conf);
        // Lets Hadoop locate the jar containing the mapper/reducer when run on a cluster.
        job.setJarByClass(LoadFileData2HbaseTable.class);
        job.setMapperClass(LoadFileData2HbaseTableMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MovieBean.class);
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        TableMapReduceUtil.initTableReducerJob(tableName, LoadFileData2HbaseTableReducer.class, job);

        // Propagate the job outcome to the caller instead of ignoring it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

读取hbase一个表中的数据插入到另一张表中

/**
 * MapReduce job that copies the {@code name} and {@code gender} columns of column family
 * {@code f} from the HBase table {@code user} into the existing table {@code user2}.
 *
 * <p>Row keys travel between map and reduce as UTF-8 text.
 * NOTE(review): this assumes the row keys are valid UTF-8 strings; confirm for binary keys.
 */
public class MR {
	// Encoded once with Bytes.toBytes (UTF-8) rather than the platform-dependent String.getBytes().
	private static final byte[] FAMILY = Bytes.toBytes("f");
	private static final byte[] QUALIFIER_NAME = Bytes.toBytes("name");
	private static final byte[] QUALIFIER_GENDER = Bytes.toBytes("gender");

	/** Separator between name and gender in the intermediate map output value. */
	private static final char SEPARATOR = ':';

	/** Returns {@code s}, or the empty string when {@code s} is null. */
	private static String orEmpty(String s) {
		return s == null ? "" : s;
	}

	/**
	 * Map side: reads one row of the source table and emits
	 * (row key, {@code name:gender}).
	 */
	static class MRMapper extends TableMapper<Text, Text> {
		/**
		 * @param key     row key of the scanned row
		 * @param value   cells of the scanned row
		 * @param context receives the row key as output key and {@code name:gender} as output value
		 */
		@Override
		protected void map(ImmutableBytesWritable key, Result value,
				Mapper<ImmutableBytesWritable, Result, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// Decode the row key as UTF-8 instead of the platform default charset.
			String k = Bytes.toString(key.copyBytes());
			// Bytes.toString returns null when the cell does not exist in this row.
			String name = Bytes.toString(value.getValue(FAMILY, QUALIFIER_NAME));
			String gender = Bytes.toString(value.getValue(FAMILY, QUALIFIER_GENDER));
			System.out.println(k + "  " + name);
			// A missing cell is sent as an empty field rather than the literal text "null".
			context.write(new Text(k), new Text(orEmpty(name) + SEPARATOR + orEmpty(gender)));
		}
	}

	/**
	 * Reduce side: called once per row key; rebuilds the columns from the first
	 * {@code name:gender} value and writes them as a {@link Put}.
	 */
	static class MRReducer extends TableReducer<Text, Text, ImmutableBytesWritable> {
		@Override
		protected void reduce(Text key, Iterable<Text> iters,
				Reducer<Text, Text, ImmutableBytesWritable, Mutation>.Context context)
				throws IOException, InterruptedException {
			String v = iters.iterator().next().toString();
			// Split at the LAST separator so a ':' inside the name is preserved; unlike
			// String.split(":"), this also keeps an empty trailing gender field.
			// NOTE(review): assumes gender itself never contains ':'.
			int sep = v.lastIndexOf(SEPARATOR);
			String name = sep < 0 ? v : v.substring(0, sep);
			String gender = sep < 0 ? "" : v.substring(sep + 1);
			if (name.isEmpty() && gender.isEmpty()) {
				// Nothing to copy for this row; a Put without columns must not be written.
				return;
			}
			// Text.getBytes() exposes the backing array, which may be longer than the key;
			// build the row key from the string value instead.
			Put put = new Put(Bytes.toBytes(key.toString()));
			// Empty fields stand for cells that were absent in the source row and are not written.
			if (!name.isEmpty()) {
				put.addColumn(FAMILY, QUALIFIER_NAME, Bytes.toBytes(name));
			}
			if (!gender.isEmpty()) {
				put.addColumn(FAMILY, QUALIFIER_GENDER, Bytes.toBytes(gender));
			}
			context.write(null, put);
		}
	}

	/**
	 * Configures and runs the copy job, then exits with status 0 on success and 1 on failure.
	 *
	 * @param args unused
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		// ZooKeeper quorum used to locate the HBase cluster.
		conf.set("hbase.zookeeper.quorum", "linux01,linux02,linux03");
		Job job = Job.getInstance(conf);
		// Full scan of the source table.
		Scan scan = new Scan();
		// Scanner caching of 200 and no block caching, as set by the original job.
		scan.setCaching(200);
		scan.setCacheBlocks(false);
		job.setJarByClass(MR.class);
		// Source table.
		TableMapReduceUtil.initTableMapperJob("user", scan, MRMapper.class, Text.class, Text.class, job);
		// Target table; it must already exist.
		TableMapReduceUtil.initTableReducerJob("user2", MRReducer.class, job);
		boolean b = job.waitForCompletion(true);
		// Exit status 0 means success; the original exited with 1 on success.
		System.exit(b ? 0 : 1);
	}
}

普罗米修斯之火

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
大数据之hbase_将静态文件导入到hbase表中

shell客户端将csv静态文件导入到hbase中使用importTsv工具简介 Importtsv是hbase自带的一个 csv文件–>HFile文件的工具，它能将csv文件转成HFile文件，并发送给regionserver它的本质，是内置的一个将csv文件转成hfile文件的mr程序！例如将以下数据导入到hbase表中:1,zss,M,342,lss,M,333,mby,M,294,zhoushen,M,245,dengzq,F,28首先将该数据以user.csv格式存入
复制链接

扫一扫