Three ways to import data into HBase
1. Put objects: open an RPC connection to the cluster and insert data one row at a time, or a few rows per request.
2. Buffered writes: cache mutations in client memory and send them to the server in a single request once the buffer reaches a threshold.
3. Bulkload: for large amounts of data already sitting in HDFS; text, sequence, or CSV files are converted into HFile files, which are then handed directly to the table's regions.
Method 1: Put
Maven dependencies for connecting to HBase from Java (for reference only):
<dependencies>
    <dependency>
        <groupId>org.apache.zookeeper</groupId>
        <artifactId>zookeeper</artifactId>
        <version>3.4.6</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>2.0.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>2.0.4</version>
    </dependency>
    <!-- MapReduce integration, used for importing data into HBase -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-mapreduce</artifactId>
        <version>2.0.4</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- Phoenix: SQL layer for working with HBase -->
    <dependency>
        <groupId>org.apache.phoenix</groupId>
        <artifactId>phoenix-core</artifactId>
        <version>5.0.0-HBase-2.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.5.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.6</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <!-- bind to the packaging phase -->
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Create a utility class for connecting to HBase:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class HbaseUtils {
    public static Connection getHbaseConnection() throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Point the client at the cluster's ZooKeeper quorum
        conf.set("hbase.zookeeper.quorum", "doit01:2181,doit02:2181,doit03:2181");
        return ConnectionFactory.createConnection(conf);
    }
}
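A Connection is heavyweight and thread-safe, so real code usually creates one and shares it. It also implements Closeable, so a try-with-resources block (a sketch, not part of the original examples) makes the cleanup automatic:

try (Connection connect = HbaseUtils.getHbaseConnection();
     Table table = connect.getTable(TableName.valueOf("doit02"))) {
    // work with the table here; both handles are closed automatically
}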
Task 1: read a table's column families from a Java program
In the HBase shell, create the table with one column family:
create "doit02","f1"
In the Java program, connect through the HbaseUtils helper (HDFS, ZooKeeper, and the HBase service must all be running):
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.util.Bytes;

public class Test1 {
    public static void main(String[] args) throws Exception {
        // Get a connection from the helper class
        Connection connect = HbaseUtils.getHbaseConnection();
        // First, get a handle on the table
        Table table = connect.getTable(TableName.valueOf("doit02"));
        // Then fetch the table's descriptor...
        TableDescriptor descriptor = table.getDescriptor();
        // ...and from it the column-family descriptors
        ColumnFamilyDescriptor[] cfs = descriptor.getColumnFamilies();
        for (ColumnFamilyDescriptor columnFamilyDescriptor : cfs) {
            // Print each column family's name
            byte[] name = columnFamilyDescriptor.getName();
            System.out.println(Bytes.toString(name));
        }
        table.close();
        connect.close();
    }
}
Console output:
f1
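The same listing can also be obtained without opening a Table, via the Admin API; a minimal sketch (assuming HBase 2.x, where Admin exposes getDescriptor):

Admin admin = connect.getAdmin();
TableDescriptor descriptor = admin.getDescriptor(TableName.valueOf("doit02"));
for (ColumnFamilyDescriptor cf : descriptor.getColumnFamilies()) {
    System.out.println(Bytes.toString(cf.getName()));
}
admin.close();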
Task 2: insert a single value (one Put) into HBase from Java
public class Test1 {
    public static void main(String[] args) throws Exception {
        // Get a connection from the helper class
        Connection connect = HbaseUtils.getHbaseConnection();
        // Get a handle on the table
        Table table = connect.getTable(TableName.valueOf("doit02"));
        // The Put is created with the row key it targets
        Put put = new Put(Bytes.toBytes("rk001"));
        // addColumn takes: column family, qualifier, value
        put.addColumn("f1".getBytes(), "name".getBytes(), Bytes.toBytes("shizijun"));
        table.put(put);
        table.close();
        connect.close();
    }
}
Verify in the HBase shell (the first scan ran before the Java program, the second after):
hbase(main):003:0> scan "doit02"
ROW COLUMN+CELL
0 row(s)
Took 0.3089 seconds
hbase(main):004:0> scan "doit02"
ROW COLUMN+CELL
rk001 column=f1:name, timestamp=1586312591078, value=shizijun
1 row(s)
Took 0.0620 seconds
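The row can also be read back from Java with a Get; in this minimal sketch, the class name Test1Read is just for illustration:

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class Test1Read {
    public static void main(String[] args) throws Exception {
        Connection connect = HbaseUtils.getHbaseConnection();
        Table table = connect.getTable(TableName.valueOf("doit02"));
        // Fetch the whole row by its row key
        Result result = table.get(new Get(Bytes.toBytes("rk001")));
        // Pull out the f1:name cell and decode it
        byte[] value = result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("name"));
        System.out.println(Bytes.toString(value)); // prints: shizijun
        table.close();
        connect.close();
    }
}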
Task 3: insert multiple Puts into HBase in one call
public class Test1 {
    public static void main(String[] args) throws Exception {
        // Get a connection from the helper class
        Connection connect = HbaseUtils.getHbaseConnection();
        // Get a handle on the table
        Table table = connect.getTable(TableName.valueOf("doit02"));
        List<Put> list = new ArrayList<>();
        Put put1 = new Put("rk002".getBytes());
        put1.addColumn("f1".getBytes(), "name".getBytes(), "zhangliya".getBytes());
        Put put2 = new Put("rk002".getBytes());
        put2.addColumn("f1".getBytes(), "age".getBytes(), Bytes.toBytes(34));
        Put put3 = new Put("rk002".getBytes());
        put3.addColumn("f1".getBytes(), "gender".getBytes(), Bytes.toBytes("F"));
        list.add(put1);
        list.add(put2);
        list.add(put3);
        // Insert all of the queued Puts in one call
        table.put(list);
        table.close();
        connect.close();
    }

    // In IDEA, Ctrl+Alt+M (Extract Method) wraps selected code into a method like this one;
    // it is not called above and is kept only to show the extracted single-Put logic
    private static void putOneValue(Connection connect, Table table) throws IOException {
        // The Put is created with its row key
        Put put = new Put(Bytes.toBytes("rk001"));
        // addColumn: column family, qualifier, value
        put.addColumn("f1".getBytes(), "name".getBytes(), Bytes.toBytes("shizijun"));
        table.put(put);
        table.close();
        connect.close();
    }
}
Verify in the HBase shell:
hbase(main):008:0> scan "doit02"
ROW COLUMN+CELL
rk001 column=f1:name, timestamp=1586312591078, value=shizijun
rk002 column=f1:age, timestamp=1586320890300, value=\x00\x00\x00"
rk002 column=f1:gender, timestamp=1586320890300, value=F
rk002 column=f1:name, timestamp=1586320890300, value=zhangliya
2 row(s)
Took 0.0819 seconds
The f1:age cell looks garbled because Bytes.toBytes(34) stores the int as four raw bytes (0x00 0x00 0x00 0x22), and the shell renders the last byte, 0x22, as a double quote. Reading the cell back in Java with Bytes.toInt(...) recovers the number 34.
Method 2: buffered writes with BufferedMutator
Compared with method 1 the code looks almost identical, but the underlying behavior is quite different: mutations are buffered on the client and flushed in batches, which does make it somewhat faster.
public class Test2 {
    public static void main(String[] args) throws Exception {
        Connection conn = HbaseUtils.getHbaseConnection();
        // A BufferedMutator queues mutations client-side instead of sending each one immediately
        BufferedMutator bm = conn.getBufferedMutator(TableName.valueOf("doit02"));
        List<Put> list = new ArrayList<>();
        Put put1 = new Put("rk001".getBytes());
        put1.addColumn("f1".getBytes(), "name".getBytes(), "shizijun".getBytes());
        Put put2 = new Put("rk001".getBytes());
        put2.addColumn("f1".getBytes(), "age".getBytes(), "27".getBytes());
        Put put3 = new Put("rk001".getBytes());
        put3.addColumn("f1".getBytes(), "gender".getBytes(), "M".getBytes());
        list.add(put1);
        list.add(put2);
        list.add(put3);
        // Queue the mutations; they are sent when the buffer fills or on close()
        bm.mutate(list);
        bm.close();
        conn.close();
    }
}
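By default the buffer is flushed when it reaches its size threshold or when the mutator is closed. The threshold can be tuned through BufferedMutatorParams; a sketch (the 4 MB figure is an arbitrary illustrative choice):

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.BufferedMutatorParams;
import org.apache.hadoop.hbase.client.Connection;

public class Test2Tuned {
    public static void main(String[] args) throws Exception {
        Connection conn = HbaseUtils.getHbaseConnection();
        // Flush automatically once roughly 4 MB of mutations have accumulated
        BufferedMutatorParams params =
                new BufferedMutatorParams(TableName.valueOf("doit02"))
                        .writeBufferSize(4 * 1024 * 1024);
        try (BufferedMutator bm = conn.getBufferedMutator(params)) {
            // bm.mutate(...) calls go here
            bm.flush(); // optional explicit flush at a checkpoint; close() flushes the rest
        }
        conn.close();
    }
}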
Verify on the cluster (writing the same row and column again simply stores a newer version, so the scan shows the updated values):
hbase(main):013:0> scan "doit02"
ROW COLUMN+CELL
rk001 column=f1:age, timestamp=1586323875282, value=27
rk001 column=f1:gender, timestamp=1586323875282, value=M
rk001 column=f1:name, timestamp=1586323875282, value=shizijun
rk002 column=f1:age, timestamp=1586323840464, value=27
rk002 column=f1:gender, timestamp=1586323840464, value=M
rk002 column=f1:name, timestamp=1586323840464, value=shizijun
2 row(s)
Took 0.0921 seconds
Method 3: bulk import (bulkload)
First create a directory on the local filesystem for the CSV file:
[root@doit01 /]# mkdir /csv
Write the file to be imported:
[root@doit01 /]# vi teacher.csv
t001,szj,23,beijing
t002,zly,22,neimenggu
t003,fj,35,usa
t004,dlrb,28,china
t005,ltf,27,xinlitun
t006,lgb,54,baoding
Upload the file to HDFS:
[root@doit01 csv]# hdfs dfs -put ./teacher.csv /date/csv/
Create the target table in the HBase shell:
create "teacher","f"
From the regular Linux shell (not the HBase shell), run ImportTsv to convert the CSV into HFiles; the -Dimporttsv.bulk.output flag makes it emit HFiles instead of writing Puts directly. (Note that the column mapping below doesn't match the CSV's actual meaning: the second field is really a name and the third an age, so they end up mislabeled as f:city and f:province. The load mechanics are the same either way, as the final scan shows.)
[root@doit01 csv]# hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
-Dimporttsv.separator=, \
-Dimporttsv.columns='HBASE_ROW_KEY,f:city,f:province,f:address' \
-Dimporttsv.bulk.output=/date/output \
teacher \
/date/csv/
Then load the generated HFiles into the HBase table:
[root@doit01 csv]# hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /date/output teacher
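The same load can also be driven from Java; a minimal sketch, assuming HBase 2.x, where the loader class lives in org.apache.hadoop.hbase.tool:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;

public class BulkLoadDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "doit01:2181,doit02:2181,doit03:2181");
        TableName tn = TableName.valueOf("teacher");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin();
             Table table = conn.getTable(tn);
             RegionLocator locator = conn.getRegionLocator(tn)) {
            // Hands each HFile under /date/output to the region owning its key range
            new LoadIncrementalHFiles(conf).doBulkLoad(
                    new Path("/date/output"), admin, table, locator);
        }
    }
}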
Verify in the HBase shell:
hbase(main):004:0> scan "teacher"
ROW COLUMN+CELL
t001 column=f:address, timestamp=1586338522928, value=beijing
t001 column=f:city, timestamp=1586338522928, value=szj
t001 column=f:province, timestamp=1586338522928, value=23
t002 column=f:address, timestamp=1586338522928, value=neimenggu
t002 column=f:city, timestamp=1586338522928, value=zly
t002 column=f:province, timestamp=1586338522928, value=22
t003 column=f:address, timestamp=1586338522928, value=usa
t003 column=f:city, timestamp=1586338522928, value=fj
t003 column=f:province, timestamp=1586338522928, value=35
t004 column=f:address, timestamp=1586338522928, value=china
t004 column=f:city, timestamp=1586338522928, value=dlrb
t004 column=f:province, timestamp=1586338522928, value=28
t005 column=f:address, timestamp=1586338522928, value=xinlitun
t005 column=f:city, timestamp=1586338522928, value=ltf
t005 column=f:province, timestamp=1586338522928, value=27
t006 column=f:address, timestamp=1586338522928, value=baoding
t006 column=f:city, timestamp=1586338522928, value=lgb
t006 column=f:province, timestamp=1586338522928, value=54
6 row(s)
Took 0.3793 seconds