HBase读写tif文件/其他格式文件

最新推荐文章于 2021-08-28 12:05:08 发布

独孤尚亮dugushangliang

最新推荐文章于 2021-08-28 12:05:08 发布

阅读量509

点赞数 3

分类专栏： Spark 文章标签： HBase Java Spark

本文链接：https://blog.csdn.net/weixin_40450867/article/details/102712565

版权

Spark 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

写入的基本思路是将文件读取为字节型数组，再插入hbase。

使用java实现将本地图片写入到hbase中，从hbase中读取并写入到本地。以tif格式为例。

写到hbase中再写到本地得到的图和原始图是一致的，也含有地理信息。

此方法不仅可以用于图片文件，对于word pdf 等其他文件也同样可以存入hbase并写出到本地。

一个是java实现，一个是用spark/scala实现

由于本人java水平有限，采用java写for循环逐个插入的效率较低，用spark插入的速度较快。

1 导入包

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.File;

2 写入到hbase中的函数

/**
 * Stores the complete contents of a local file in a single HBase cell.
 *
 * @param imgPath   path of the local file to upload
 * @param tableName name of an existing HBase table
 * @param rowKey    row key under which the file is stored
 * @param cf        column family; it must already exist in the table
 * @param column    column qualifier that receives the file bytes
 * @throws IOException if the file cannot be read or the HBase write fails
 */
public static void WriteTif2Hbase(String imgPath,String tableName,String rowKey,String cf,String column) throws IOException{
        // Read the whole file before touching HBase. Sizing a buffer with
        // available() and calling read() once is not guaranteed to deliver every
        // byte, which could silently truncate the stored image.
        byte[] fileBytes = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(imgPath));

        Configuration configuration = HBaseConfiguration.create();

        // try-with-resources closes the table and the connection even when put() throws.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            Put put = new Put(rowKey.getBytes());
            // family, qualifier, value (the raw bytes of the file)
            put.addColumn(cf.getBytes(), column.getBytes(), fileBytes);
            table.put(put);
        }
        System.out.println("Write Success");
    }

3 从hbase到本地

    /**
     * Reads one row from HBase and writes the value of its first cell to a local file.
     *
     * @param outPath   path of the local file to create or overwrite
     * @param tableName name of the HBase table to read from
     * @param rowKey    row key of the stored file
     * @throws IOException if the row holds no data, the HBase read fails, or the
     *                     local file cannot be written
     */
    public static void ReadHbase2Local(String outPath,String tableName,String rowKey) throws IOException{
        Configuration configuration = HBaseConfiguration.create();

        byte[] bs;
        // try-with-resources closes the table and the connection even when get() throws.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            Result rs = table.get(new Get(rowKey.getBytes()));
            // value() yields the first cell of the row, or null when nothing was found.
            bs = rs.value();
        }

        // Fail with a clear message before creating the output file; otherwise a
        // missing row leaves an empty file behind and ends in a NullPointerException.
        if (bs == null) {
            throw new IOException("No data found for row '" + rowKey + "' in table '" + tableName + "'");
        }

        try (FileOutputStream fos = new FileOutputStream(outPath)) {
            fos.write(bs);
        }
        System.out.println("Write success");
    }

4 main函数

    /**
     * Demo entry point: uploads one sample tif to HBase and then downloads it
     * again into a second local file.
     * The target table must exist beforehand, e.g. created in the HBase shell
     * with: create 'test','cf1'
     */
    public static void main(String[] args) throws IOException{
        final String table = "test";
        final String key = "002";
        final String family = "cf1";
        final String qualifier = "img";

        // Upload the source image.
        final String source = "E:\\data\\ser\\20190928T022549_20190928T022549_T51TYJ_S_ABAU.tif";
        WriteTif2Hbase(source, table, key, family, qualifier);

        // Download it again under a different file name.
        final String target = "E:\\data\\ser\\test1.tif";
        ReadHbase2Local(target, table, key);
    }

5 完整代码

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.File;
/**
 * Round-trip demo: stores a local file (a tif image here) in one HBase cell and
 * restores it to a local file again.
 */
public class ReadHbase {
    public static void main(String[] args) throws IOException{
        // The table must exist beforehand, e.g. in the HBase shell: create 'test','cf1'
        String imgPath="E:\\data\\ser\\20190928T022549_20190928T022549_T51TYJ_S_ABAU.tif";
        String tableName="test";
        String rowKey="002";
        String cf="cf1";
        String column="img";
        WriteTif2Hbase(imgPath,tableName,rowKey,cf,column);

        String outPath="E:\\data\\ser\\test1.tif";
        // Read back exactly the cell written above instead of "whatever cell comes first".
        ReadHbase2Local(outPath,tableName,rowKey,cf,column);
    }

    /**
     * Stores the complete contents of a local file in a single HBase cell.
     *
     * @param imgPath   path of the local file to upload
     * @param tableName name of an existing HBase table
     * @param rowKey    row key under which the file is stored
     * @param cf        column family; it must already exist in the table
     * @param column    column qualifier that receives the file bytes
     * @throws IOException if the file cannot be read or the HBase write fails
     */
    public static void WriteTif2Hbase(String imgPath,String tableName,String rowKey,String cf,String column) throws IOException{
        // Read the whole file before touching HBase. Sizing a buffer with
        // available() and calling read() once is not guaranteed to deliver every
        // byte, which could silently truncate the stored image.
        byte[] fileBytes = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(imgPath));

        Configuration configuration = HBaseConfiguration.create();

        // try-with-resources closes the table and the connection even when put() throws.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            Put put = new Put(rowKey.getBytes());
            // family, qualifier, value (the raw bytes of the file)
            put.addColumn(cf.getBytes(), column.getBytes(), fileBytes);
            table.put(put);
        }
        System.out.println("Write Success");
    }

    /**
     * Reads one row from HBase and writes the value of its first cell to a local file.
     * Prefer the overload that names the column when the row may hold several cells.
     *
     * @param outPath   path of the local file to create or overwrite
     * @param tableName name of the HBase table to read from
     * @param rowKey    row key of the stored file
     * @throws IOException if the row holds no data, the HBase read fails, or the
     *                     local file cannot be written
     */
    public static void ReadHbase2Local(String outPath,String tableName,String rowKey) throws IOException{
        saveFirstCell(new Get(rowKey.getBytes()), outPath, tableName, rowKey);
    }

    /**
     * Reads one specific cell from HBase and writes its value to a local file.
     *
     * @param outPath   path of the local file to create or overwrite
     * @param tableName name of the HBase table to read from
     * @param rowKey    row key of the stored file
     * @param cf        column family of the cell
     * @param column    column qualifier of the cell
     * @throws IOException if the cell does not exist, the HBase read fails, or the
     *                     local file cannot be written
     */
    public static void ReadHbase2Local(String outPath,String tableName,String rowKey,String cf,String column) throws IOException{
        Get get = new Get(rowKey.getBytes());
        // Restrict the lookup so the only cell that can come back is cf:column.
        get.addColumn(cf.getBytes(), column.getBytes());
        saveFirstCell(get, outPath, tableName, rowKey);
    }

    /** Runs the given Get and writes the first returned cell value to outPath. */
    private static void saveFirstCell(Get get, String outPath, String tableName, String rowKey) throws IOException{
        Configuration configuration = HBaseConfiguration.create();

        byte[] bs;
        // try-with-resources closes the table and the connection even when get() throws.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            Result rs = table.get(get);
            // value() yields the first returned cell, or null when nothing matched.
            bs = rs.value();
        }

        // Fail with a clear message before creating the output file; otherwise a
        // missing row leaves an empty file behind and ends in a NullPointerException.
        if (bs == null) {
            throw new IOException("No data found for row '" + rowKey + "' in table '" + tableName + "'");
        }

        try (FileOutputStream fos = new FileOutputStream(outPath)) {
            fos.write(bs);
        }
        System.out.println("Write success");
    }
}

6 java批量写入

对一个文件夹下的所有tif写入到hbase，列族需存在

只用了简单的for循环逐个插入，效率比较低。

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.File;
import java.text.SimpleDateFormat;

/**
 * Batch demo: uploads every .tif file of one local directory into HBase, using
 * the file name as row key. Row keys, families and qualifiers are encoded with
 * String.getBytes(), i.e. the platform default charset.
 */
public class ReadHbase5 {
    public static void main(String[] args) throws IOException{
        // Useful HBase shell commands:
        //   create the table:    create 'test','cf'
        //   list all row keys:   count 'test', INTERVAL=>1
        //   drop the table:      disable 'test' ; drop 'test'
        //   add a column family: alter 'test', {NAME=>'cf2'}
        String tableName="test";
        String cf="cf2";
        String column="img";
        File file=new File("E:\\data\\ser\\sample");
        File[] tempList = file.listFiles();
        // listFiles() returns null when the path is not a directory or cannot be read.
        if (tempList == null) {
            throw new IOException("Not a readable directory: " + file);
        }

        // Created once; the formatter does not change between iterations.
        SimpleDateFormat sf=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        Configuration configuration = HBaseConfiguration.create();
        // One connection and table handle are shared by all files. Opening a new
        // connection for every file is what makes a naive loop slow.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            for (File candidate : tempList) {
                if (candidate.isFile() && candidate.toString().endsWith(".tif")) {
                    // The file name serves as row key.
                    String rowKey=candidate.getName();
                    putFile(table, candidate.toString(), rowKey, cf, column);
                    String strsystime = sf.format(System.currentTimeMillis());
                    System.out.println("Success: "+strsystime+": "+rowKey);
                }
            }
        }
        System.out.println("Success All ");
    }

    /**
     * Stores the complete contents of a local file in a single HBase cell,
     * opening and closing its own connection.
     *
     * @param imgPath   path of the local file to upload
     * @param tableName name of an existing HBase table
     * @param rowKey    row key under which the file is stored
     * @param cf        column family; it must already exist in the table
     * @param column    column qualifier that receives the file bytes
     * @throws IOException if the file cannot be read or the HBase write fails
     */
    public static void WriteTif2Hbase(String imgPath,String tableName,String rowKey,String cf,String column) throws IOException{
        Configuration configuration = HBaseConfiguration.create();

        // try-with-resources closes the table and the connection even when the write throws.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            putFile(table, imgPath, rowKey, cf, column);
        }
    }

    /** Reads imgPath completely and puts its bytes into cf:column of rowKey on an open table. */
    private static void putFile(Table table, String imgPath, String rowKey, String cf, String column) throws IOException{
        // Sizing a buffer with available() and calling read() once is not guaranteed
        // to deliver every byte; readAllBytes always returns the full file.
        byte[] fileBytes = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get(imgPath));

        Put put = new Put(rowKey.getBytes());
        // family, qualifier, value (the raw bytes of the file)
        put.addColumn(cf.getBytes(), column.getBytes(), fileBytes);
        table.put(put);
        System.out.println("Write Success");
    }

    /**
     * Reads one row from HBase and writes the value of its first cell to a local file.
     * Prefer the overload that names the column when the row may hold several cells.
     *
     * @param outPath   path of the local file to create or overwrite
     * @param tableName name of the HBase table to read from
     * @param rowKey    row key of the stored file
     * @throws IOException if the row holds no data, the HBase read fails, or the
     *                     local file cannot be written
     */
    public static void ReadHbase2Local(String outPath, String tableName,  String rowKey) throws IOException{
        saveFirstCell(new Get(rowKey.getBytes()), outPath, tableName, rowKey);
    }

    /**
     * Reads one specific cell from HBase and writes its value to a local file.
     *
     * @param outPath   path of the local file to create or overwrite
     * @param tableName name of the HBase table to read from
     * @param rowKey    row key of the stored file
     * @param cf        column family of the cell
     * @param column    column qualifier of the cell
     * @throws IOException if the cell does not exist, the HBase read fails, or the
     *                     local file cannot be written
     */
    public static void ReadHbase2Local(String outPath, String tableName, String rowKey, String cf, String column) throws IOException{
        Get get = new Get(rowKey.getBytes());
        // Restrict the lookup so the only cell that can come back is cf:column.
        get.addColumn(cf.getBytes(), column.getBytes());
        saveFirstCell(get, outPath, tableName, rowKey);
    }

    /** Runs the given Get and writes the first returned cell value to outPath. */
    private static void saveFirstCell(Get get, String outPath, String tableName, String rowKey) throws IOException{
        Configuration configuration = HBaseConfiguration.create();

        byte[] bs;
        // try-with-resources closes the table and the connection even when get() throws.
        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Table table = connection.getTable(TableName.valueOf(tableName))) {
            Result rs = table.get(get);
            // value() yields the first returned cell, or null when nothing matched.
            bs = rs.value();
        }

        // Fail with a clear message before creating the output file; otherwise a
        // missing row leaves an empty file behind and ends in a NullPointerException.
        if (bs == null) {
            throw new IOException("No data found for row '" + rowKey + "' in table '" + tableName + "'");
        }

        try (FileOutputStream fos = new FileOutputStream(outPath)) {
            fos.write(bs);
        }
        System.out.println("Write success");
    }
}

对于2M大的图大约2秒插入一个

220个文件前后用时四分钟。

Write Success
Success: 2019-10-26 21:19:58: 20190928T022549_20190928T022549_T51TYJ_S_AAAA.tif
Write Success
Success: 2019-10-26 21:20:00: 20190928T022549_20190928T022549_T51TYJ_S_AAAB.tif
Write Success
Success: 2019-10-26 21:20:03: 20190928T022549_20190928T022549_T51TYJ_S_AAAC.tif
Write Success
Success: 2019-10-26 21:20:03: 20190928T022549_20190928T022549_T51TYJ_S_AAAD.tif
Write Success
Success: 2019-10-26 21:20:03: 20190928T022549_20190928T022549_T51TYJ_S_AAAE.tif
Write Success
。。。。。。。。。。。。。。
Success: 2019-10-26 21:22:50: 20190928T022549_20190928T022549_T51TYJ_S_AHAQ.tif
Write Success
Success: 2019-10-26 21:22:53: 20190928T022549_20190928T022549_T51TYJ_S_AHAR.tif
Write Success
。。。。。。。。。。。。。。
Success: 2019-10-26 21:23:51: 20190928T022549_20190928T022549_T51TYJ_S_AQAL.tif
Write Success
Success: 2019-10-26 21:23:51: 20190928T022549_20190928T022549_T51TYJ_S_AQAM.tif
Write Success
Success: 2019-10-26 21:23:51: 20190928T022549_20190928T022549_T51TYJ_S_AQAN.tif
Write Success
Success: 2019-10-26 21:23:53: 20190928T022549_20190928T022549_T51TYJ_S_AQAO.tif
Success All 

Process finished with exit code 0

7 spark批量写入到hbase

对一个文件夹内的所有文件存入到hbase中，行键为文件名，列族:列为cf1:img

主要用到了sc.binaryFiles这个函数，将文件夹下的文件以二进制方式读取，一个文件对应rdd的一个元素。

rdd的每个元素是一个键值对（二元组），类型为(String, org.apache.spark.input.PortableDataStream)：第一个元素 x._1 为文件的路径，第二个元素 x._2 为PortableDataStream对象，可以通过toArray()方法转换为字节数组。

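下面两种把字符串转为字节数组的写法，在JVM默认字符集为UTF-8时结果相同：getBytes使用平台默认字符集，而Bytes.toBytes固定使用UTF-8，所以当默认字符集不是UTF-8（例如中文Windows的GBK）时，含中文的字符串得到的结果会不同。b2的方法更加通用，对于整数、小数也能使用。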

val b1= "字符串".getBytes

 val b2=Bytes.toBytes("字符串")

b1.mkString == b2.mkString          //res10: Boolean = true

ReadHbase4.scala

import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.{SparkConf, SparkContext}
//import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import java.text.SimpleDateFormat
/**
 * Spark job that loads every .tif file of a local directory into HBase.
 * The file name becomes the row key and the file bytes go into cf1:img.
 */
object ReadHbase4{
  def main(args: Array[String]): Unit = {
    val conf=new SparkConf().setMaster("local[2]").setAppName("testhbase")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc=new SparkContext(conf)
    // Stop the context even when the job fails so its resources are released.
    try {
      // Name of the target HBase table.
      val tablename = "test"

      val hbaseConf = HBaseConfiguration.create()
      // Per the original author, the two settings below are not needed in Windows
      // local mode but are required when running on a cluster.
      // ZooKeeper quorum address (alternatively put hbase-site.xml on the classpath):
      // hbaseConf.set("hbase.zookeeper.quorum","localhost")
      // ZooKeeper client port, default 2181:
      // hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
      hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

      // Job setup; TableOutputFormat here is the one from org.apache.hadoop.hbase.mapred.
      val jobConf = new JobConf(hbaseConf)
      jobConf.setOutputFormat(classOf[TableOutputFormat])

      // binaryFiles yields one (path, PortableDataStream) pair per file in the directory.
      val rdd2=sc.binaryFiles("E:\\data\\ser\\sample")

      val rdd3=rdd2.filter(_._1.endsWith(".tif")).map{ case (path, stream) =>
        // The last path segment (the file name) serves as row key.
        val rowkey=path.split('/').last
        val put = new Put(Bytes.toBytes(rowkey))
        // family, qualifier, value (the raw bytes of the file)
        put.addColumn(Bytes.toBytes("cf1"),Bytes.toBytes("img"),stream.toArray())

        // Timestamp for the progress line printed below.
        val sf:SimpleDateFormat =new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val strsystime:String = sf.format(System.currentTimeMillis())

        println(s"$strsystime : $rowkey : success")
        (new ImmutableBytesWritable, put)
      }
      rdd3.saveAsHadoopDataset(jobConf)
    } finally {
      sc.stop()
    }
  }
}

输出信息

对于220个2M大小的文件，可以看到前三个比较慢一点，后面的210+个用时四秒写入成功。

可以看到比第六节的快不少，用java方式的比较原始，还是用spark快一些。

参考

https://blog.csdn.net/login_sonata/article/details/53440059

独孤尚亮dugushangliang

关注

3
点赞
踩
3

收藏

觉得还不错? 一键收藏
打赏
3
评论
HBase读写tif文件/其他格式文件

目录1 导入包2 写入到hbase中的函数3 从hbase到本地4 main函数5 完整代码6 java批量写入7 spark批量写入到hbase写入的基本思路是将文件读取为字节型数组，再插入hbase。使用java实现将本地图片写入到hbase中，从hbase中读取并写入到本地。以tif格式为例。写到hbase中再写到本地得到的图和原始图是一致...
复制链接

扫一扫