Three ways to import data into HBase
1. Put objects: open an RPC connection to the cluster and insert data one row at a time, or a few rows per request.
2. Buffered writes: cache mutations in client memory and send them to the server in a single request once the buffer reaches a threshold.
3. Bulkload: for large amounts of data already sitting in HDFS; text, sequence, or CSV files are converted into HFile files, which are then handed directly to the table's regions.
Method 1: Put
Maven dependencies for connecting to HBase from Java (for reference only):
<dependencies>
    <dependency>
        <groupId>org.apache.zookeeper</groupId>
        <artifactId>zookeeper</artifactId>
        <version>3.4.6</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>2.0.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.7</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>2.0.4</version>
    </dependency>
    <!-- MapReduce integration, used for importing data into HBase -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-mapreduce</artifactId>
        <version>2.0.4</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- Phoenix: SQL layer for working with HBase -->
    <dependency>
        <groupId>org.apache.phoenix</groupId>
        <artifactId>phoenix-core</artifactId>
        <version>5.0.0-HBase-2.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.5.1</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.6</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <!-- bind to the packaging phase -->
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Create a utility class for connecting to HBase:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class HbaseUtils {
    public static Connection getHbaseConnection() throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Point the client at the cluster's ZooKeeper quorum
        conf.set("hbase.zookeeper.quorum", "doit01:2181,doit02:2181,doit03:2181");
        return ConnectionFactory.createConnection(conf);
    }
}
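A Connection is heavyweight and thread-safe, so real code usually creates one and shares it. It also implements Closeable, so a try-with-resources block (a sketch, not part of the original examples) makes the cleanup automatic:

try (Connection connect = HbaseUtils.getHbaseConnection();
     Table table = connect.getTable(TableName.valueOf("doit02"))) {
    // work with the table here; both handles are closed automatically
}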
Task 1: read a table's column families from a Java program
In the HBase shell, create the table with one column family:
create "doit02","f1"
In the Java program, connect through the HbaseUtils helper (HDFS, ZooKeeper, and the HBase service must all be running):
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.util.Bytes;

public class Test1 {
    public static void main(String[] args) throws Exception {
        // Get a connection from the helper class
        Connection connect = HbaseUtils.getHbaseConnection();
        // First, get a handle on the table
        Table table = connect.getTable(TableName.valueOf("doit02"));
        // Then fetch the table's descriptor...
        TableDescriptor descriptor = table.getDescriptor();
        // ...and from it the column-family descriptors
        ColumnFamilyDescriptor[] cfs = descriptor.getColumnFamilies();
        for (ColumnFamilyDescriptor columnFamilyDescriptor : cfs) {
            // Print each column family's name
            byte[] name = columnFamilyDescriptor.getName();
            System.out.println(Bytes.toString(name));
        }
        table.close();
        connect.close();
    }
}
Console output:
f1
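The same listing can also be obtained without opening a Table, via the Admin API; a minimal sketch (assuming HBase 2.x, where Admin exposes getDescriptor):

Admin admin = connect.getAdmin();
TableDescriptor descriptor = admin.getDescriptor(TableName.valueOf("doit02"));
for (ColumnFamilyDescriptor cf : descriptor.getColumnFamilies()) {
    System.out.println(Bytes.toString(cf.getName()));
}
admin.close();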
Task 2: insert a single value (one Put) into HBase from Java
public class Test1 {
    public static void main(String[] args) throws Exception {
        // Get a connection from the helper class
        Connection connect = HbaseUtils.getHbaseConnection();
        // Get a handle on the table
        Table table = connect.getTable(TableName.valueOf("doit02"));
        // The Put is created with the row key it targets
        Put put = new Put(Bytes.toBytes("rk001"));
        // addColumn takes: column family, qualifier, value
        put.addColumn("f1".getBytes(), "name".getBytes(), Bytes.toBytes("shizijun"));
        table.put(put);
        table.close();
        connect.close();
    }
}
Verify in the HBase shell (the first scan ran before the Java program, the second after):
hbase(main):003:0> scan "doit02"
ROW COLUMN+CELL
0 row(s)
Took 0.3089 seconds
hbase(main):004:0> scan "doit02"
ROW COLUMN+CELL
rk001 column=f1:name, timestamp=1586312591078, value=shizijun
1 row(s)
Took 0.0620 seconds
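The row can also be read back from Java with a Get; in this minimal sketch, the class name Test1Read is just for illustration:

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class Test1Read {
    public static void main(String[] args) throws Exception {
        Connection connect = HbaseUtils.getHbaseConnection();
        Table table = connect.getTable(TableName.valueOf("doit02"));
        // Fetch the whole row by its row key
        Result result = table.get(new Get(Bytes.toBytes("rk001")));
        // Pull out the f1:name cell and decode it
        byte[] value = result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("name"));
        System.out.println(Bytes.toString(value)); // prints: shizijun
        table.close();
        connect.close();
    }
}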
Task 3: insert multiple Puts into HBase in one call
public class Test1 {
    public static void main(String[] args) throws Exception {
        // Get a connection from the helper class
        Connection connect = HbaseUtils.getHbaseConnection();
        // Get a handle on the table
        Table table = connect.getTable(TableName.valueOf("doit02"));
        List<Put> list = new ArrayList<>();
        Put put1 = new Put("rk002".getBytes());
        put1.addColumn("f1".getBytes(), "name".getBytes(), "zhangliya".getBytes());
        Put put2 = new Put("rk002".getBytes());
        put2.addColumn("f1".getBytes(), "age".getBytes(), Bytes.toBytes(34));
        Put put3 = new Put("rk002".getBytes());
        put3.addColumn("f1".getBytes(), "gender".getBytes(), Bytes.toBytes("F"));
        list.add(put1);
        list.add(put2);
        list.add(put3);
        // Insert all of the queued Puts in one call
        table.put(list);
        table.close();
        connect.close();
    }

    // In IDEA, Ctrl+Alt+M (Extract Method) wraps selected code into a method like this one;
    // it is not called above and is kept only to show the extracted single-Put logic
    private static void putOneValue(Connection connect, Table table) throws IOException {
        // The Put is created with its row key
        Put put = new Put(Bytes.toBytes("rk001"));
        // addColumn: column family, qualifier, value
        put.addColumn("f1".getBytes(), "name".getBytes(), Bytes.toBytes("shizijun"));
        table.put(put);
        table.close();
        connect.close();
    }
}
Verify in the HBase shell:
hbase(main):008:0> scan "doit02"
ROW COLUMN+CELL
rk001 column=f1:name, timestamp=1586312591078, value=shizijun
rk002 column=f1:age, timestamp=1586320890300, value=\x00\x00\x00"
rk002 column=f1:gender, timestamp=1586320890300, value=F
rk002 column=f1:name, timestamp=1586320890300, value=zhangliya
2 row(s)
Took 0.0819 seconds
The f1:age cell looks garbled because Bytes.toBytes(34) stores the int as four raw bytes (0x00 0x00 0x00 0x22), and the shell renders the last byte, 0x22, as a double quote. Reading the cell back in Java with Bytes.toInt(...) recovers the number 34.
Method 2: buffered writes with BufferedMutator
Compared with method 1 the code looks almost identical, but the underlying behavior is quite different: mutations are buffered on the client and flushed in batches, which does make it somewhat faster.
public class Test2 {
    public static void main(String[] args) throws Exception {
        Connection conn = HbaseUtils.getHbaseConnection();
        // A BufferedMutator queues mutations client-side instead of sending each one immediately
        BufferedMutator bm = conn.getBufferedMutator(TableName.valueOf("doit02"));
        List<Put> list = new ArrayList<>();
        Put put1 = new Put("rk001".getBytes());
        put1.addColumn("f1".getBytes(), "name".getBytes(), "shizijun".getBytes());
        Put put2 = new Put("rk001".getBytes());
        put2.addColumn("f1".getBytes(), "age".getBytes(), "27".getBytes());
        Put put3 = new Put("rk001".getBytes());
        put3.addColumn("f1".getBytes(), "gender".getBytes(), "M".getBytes());
        list.add(put1);
        list.add(put2);
        list.add(put3);
        // Queue the mutations; they are sent when the buffer fills or on close()
        bm.mutate(list);
        bm.close();
        conn.close();
    }
}
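By default the buffer is flushed when it reaches its size threshold or when the mutator is closed. The threshold can be tuned through BufferedMutatorParams; a sketch (the 4 MB figure is an arbitrary illustrative choice):

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.BufferedMutatorParams;
import org.apache.hadoop.hbase.client.Connection;

public class Test2Tuned {
    public static void main(String[] args) throws Exception {
        Connection conn = HbaseUtils.getHbaseConnection();
        // Flush automatically once roughly 4 MB of mutations have accumulated
        BufferedMutatorParams params =
                new BufferedMutatorParams(TableName.valueOf("doit02"))
                        .writeBufferSize(4 * 1024 * 1024);
        try (BufferedMutator bm = conn.getBufferedMutator(params)) {
            // bm.mutate(...) calls go here
            bm.flush(); // optional explicit flush at a checkpoint; close() flushes the rest
        }
        conn.close();
    }
}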
Verify on the cluster (writing the same row and column again simply stores a newer version, so the scan shows the updated values):
hbase(main):013:0> scan "doit02"
ROW COLUMN+CELL
rk001 column=f1:age, timestamp=1586323875282, value=27
rk001 column=f1:gender, timestamp=1586323875282, value=M
rk001 column=f1:name, timestamp=1586323875282, value=shizijun
rk002 column=f1:age, timestamp=1586323840464, value=27
rk002 column=f1:gender, timestamp=1586323840464, value=M
rk002 column=f1:name, timestamp=1586323840464, value=shizijun
2 row(s)
Took 0.0921 seconds
Method 3: bulk import (bulkload)
First create a directory on the local filesystem for the CSV file:
[root@doit01 /]# mkdir /csv
Write the file to be imported:
[root@doit01 /]# vi teacher.csv
t001,szj,23,beijing
t002,zly,22,neimenggu
t003,fj,35,usa
t004,dlrb,28,china
t005,ltf,27,xinlitun
t006,lgb,54,baoding
Upload the file to HDFS:
[root@doit01 csv]# hdfs dfs -put ./teacher.csv /date/csv/
Create the target table in the HBase shell:
create "teacher","f"
From the regular Linux shell (not the HBase shell), run ImportTsv to convert the CSV into HFiles; the -Dimporttsv.bulk.output flag makes it emit HFiles instead of writing Puts directly. (Note that the column mapping below doesn't match the CSV's actual meaning: the second field is really a name and the third an age, so they end up mislabeled as f:city and f:province. The load mechanics are the same either way, as the final scan shows.)
[root@doit01 csv]# hbase org.apache.hadoop.hbase.mapreduce.ImportTsv \
-Dimporttsv.separator=, \
-Dimporttsv.columns='HBASE_ROW_KEY,f:city,f:province,f:address' \
-Dimporttsv.bulk.output=/date/output \
teacher \
/date/csv/
Then load the generated HFiles into the HBase table:
[root@doit01 csv]# hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /date/output teacher
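The same load can also be driven from Java; a minimal sketch, assuming HBase 2.x, where the loader class lives in org.apache.hadoop.hbase.tool:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.tool.LoadIncrementalHFiles;

public class BulkLoadDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "doit01:2181,doit02:2181,doit03:2181");
        TableName tn = TableName.valueOf("teacher");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin();
             Table table = conn.getTable(tn);
             RegionLocator locator = conn.getRegionLocator(tn)) {
            // Hands each HFile under /date/output to the region owning its key range
            new LoadIncrementalHFiles(conf).doBulkLoad(
                    new Path("/date/output"), admin, table, locator);
        }
    }
}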
Verify in the HBase shell:
hbase(main):004:0> scan "teacher"
ROW COLUMN+CELL
t001 column=f:address, timestamp=1586338522928, value=beijing
t001 column=f:city, timestamp=1586338522928, value=szj
t001 column=f:province, timestamp=1586338522928, value=23
t002 column=f:address, timestamp=1586338522928, value=neimenggu
t002 column=f:city, timestamp=1586338522928, value=zly
t002 column=f:province, timestamp=1586338522928, value=22
t003 column=f:address, timestamp=1586338522928, value=usa
t003 column=f:city, timestamp=1586338522928, value=fj
t003 column=f:province, timestamp=1586338522928, value=35
t004 column=f:address, timestamp=1586338522928, value=china
t004 column=f:city, timestamp=1586338522928, value=dlrb
t004 column=f:province, timestamp=1586338522928, value=28
t005 column=f:address, timestamp=1586338522928, value=xinlitun
t005 column=f:city, timestamp=1586338522928, value=ltf
t005 column=f:province, timestamp=1586338522928, value=27
t006 column=f:address, timestamp=1586338522928, value=baoding
t006 column=f:city, timestamp=1586338522928, value=lgb
t006 column=f:province, timestamp=1586338522928, value=54
6 row(s)
Took 0.3793 seconds