Prerequisites
This example uses Maven to manage dependencies; the pom.xml is shown below.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.huadian.bigdata</groupId>
<artifactId>hadoop</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>aliyun</id>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
</repository>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<repository>
<id>jboss</id>
<url>http://repository.jboss.com/nexus/content/groups/public</url>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
<hadoop.version>2.7.3</hadoop.version>
<hive.version>1.2.1</hive.version>
<hbase.version>1.2.0-cdh5.7.6</hbase.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- Hive Client -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-service</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.0.0</version>
</plugin>
<!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.20.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
At runtime you also need the hbase-site.xml configuration file from your cluster; place it under src/main/resources so that HBaseConfiguration.create() can find it on the classpath.
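If you would rather not ship a config file, the same settings can also be supplied in code. Below is a minimal sketch; the ZooKeeper host names are hypothetical, so replace them with your cluster's actual quorum:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
public class ConfSketch {
    public static void main(String[] args) {
        //HBaseConfiguration.create() merges hbase-default.xml and any hbase-site.xml on the classpath
        Configuration conf = HBaseConfiguration.create();
        //programmatic alternative (host names here are hypothetical)
        conf.set( "hbase.zookeeper.quorum", "node1,node2,node3" );
        conf.set( "hbase.zookeeper.property.clientPort", "2181" );
        System.out.println( conf.get( "hbase.zookeeper.quorum" ) );
    }
}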
Querying a single row (Get)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
public class A_GetDemo {
public static void main(String[] args) {
//1. Load the configuration (reads hbase-site.xml from the classpath)
Configuration conf = HBaseConfiguration.create();
//System.out.println(conf);
Connection conn = null;
Table table = null;
try {
//2. Open the connection
conn = ConnectionFactory.createConnection( conf );
//3. Get a handle to the HBase table; it supports CRUD operations on the table's data
table = conn.getTable( TableName.valueOf( "namespace:tableName" ) );
//4. Create a Get object for the target row key
Get get = new Get( Bytes.toBytes( "rowkey" ) );
//5. Fetch the row by its row key
Result result = table.get( get );
//6. Parse the result
System.out.println("RowKey:"+Bytes.toString( result.getRow() ));
for (Cell cell:result.rawCells()) {
String cf = Bytes.toString(CellUtil.cloneFamily( cell ));
String field = Bytes.toString(CellUtil.cloneQualifier( cell ));
String value = Bytes.toString(CellUtil.cloneValue( cell ));
System.out.println(cf+":"+field +"->" +value);
}
} catch (IOException e) {
e.printStackTrace();
}finally {
//Release resources
IOUtils.closeStream( table );
IOUtils.closeStream( conn);
}
}
}
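A Get can also be narrowed to specific columns so the whole row is not transferred. A small sketch under the same connection setup; the row key and column names are illustrative:
private static void getColumn(Table table) throws IOException {
    Get get = new Get( Bytes.toBytes( "100001" ) );
    //only fetch info:name instead of the entire row
    get.addColumn( Bytes.toBytes( "info" ), Bytes.toBytes( "name" ) );
    Result result = table.get( get );
    if (result.isEmpty()) {
        System.out.println( "row not found" );
        return;
    }
    byte[] value = result.getValue( Bytes.toBytes( "info" ), Bytes.toBytes( "name" ) );
    System.out.println( "info:name -> " + Bytes.toString( value ) );
}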
Scanning a range of rows (Scan)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
public class B_ScanDemo {
public static void main(String[] args) {
//1. Load the configuration
Configuration conf = HBaseConfiguration.create();
//System.out.println(conf);
Connection conn = null;
Table table = null;
try {
//2. Open the connection
conn = ConnectionFactory.createConnection( conf );
//3. Get a handle to the HBase table; it supports CRUD operations on the table's data
table = conn.getTable( TableName.valueOf( "ns1:sale_orders" ) );
//4. Scan the data
scanData(table);
} catch (IOException e) {
e.printStackTrace();
}finally {
IOUtils.closeStream( table );
IOUtils.closeStream( conn);
}
}
private static void scanData(Table table) {
//Create a Scan instance
Scan scan = new Scan();
/**
 * Scan range
 * Row keys are matched by prefix (lexicographic byte order)
 * The returned range is [startRow, stopRow); the narrower the range, the faster the scan
 */
scan.setStartRow( Bytes.toBytes( "startRowKey" ) );
scan.setStopRow( Bytes.toBytes( "stopRowKey" ) );
/**
 * Restrict the scan to specific column families and columns
 */
scan.addFamily( Bytes.toBytes("info") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("date") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("order_amt") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("order_id") );
//Parse the results
try {
ResultScanner scanner = table.getScanner( scan );
for (Result result:scanner) {
//Parse each row
System.out.println("RowKey:"+Bytes.toString( result.getRow() ));
for (Cell cell:result.rawCells()) {
String cf = Bytes.toString(CellUtil.cloneFamily( cell ));
String field = Bytes.toString(CellUtil.cloneQualifier( cell ));
String value = Bytes.toString(CellUtil.cloneValue( cell ));
System.out.println(cf+":"+field +"->" +value);
}
System.out.println("---------------------------------");
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
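Because row keys sort lexicographically, scanning everything that shares a prefix is a common shortcut. Below is a sketch using Scan#setRowPrefixFilter from the HBase 1.x client API, which derives the exclusive stop row from the prefix automatically; the prefix value is illustrative:
private static void scanByPrefix(Table table) throws IOException {
    Scan scan = new Scan();
    //equivalent to setStartRow(prefix) plus the matching exclusive stop row
    scan.setRowPrefixFilter( Bytes.toBytes( "434017_" ) );
    try (ResultScanner scanner = table.getScanner( scan )) {
        for (Result result : scanner) {
            System.out.println( "RowKey:" + Bytes.toString( result.getRow() ) );
        }
    }
}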
Filtering query results
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
public class B_ScanDemo {
public static void main(String[] args) {
//1. Load the configuration
Configuration conf = HBaseConfiguration.create();
//System.out.println(conf);
Connection conn = null;
Table table = null;
try {
//2. Open the connection
conn = ConnectionFactory.createConnection( conf );
//3. Get a handle to the HBase table; it supports CRUD operations on the table's data
table = conn.getTable( TableName.valueOf( "ns1:sale_orders" ) );
//4. Scan the data
scanData(table);
} catch (IOException e) {
e.printStackTrace();
}finally {
IOUtils.closeStream( table );
IOUtils.closeStream( conn);
}
}
private static void scanData(Table table) {
//Create a Scan instance
Scan scan = new Scan();
/**
 * Scan range
 * Row keys are matched by prefix (lexicographic byte order)
 * The returned range is [startRow, stopRow); the narrower the range, the faster the scan
 */
scan.setStartRow( Bytes.toBytes( "434017_2015-04-21 00:00:00" ) );
scan.setStopRow( Bytes.toBytes( "434017_2015-04-22 00:00:00" ) );
/**
 * Restrict the scan to specific column families and columns
 */
scan.addFamily( Bytes.toBytes("info") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("date") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("order_amt") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("order_id") );
/**
 * Set a filter via scan.setFilter( filter ).
 * A value filter: a row matches only when the given column's value satisfies the comparison
 */
Filter filter = new SingleColumnValueFilter(
Bytes.toBytes("columnFamily"),
Bytes.toBytes("qualifierToFilter"),
CompareFilter.CompareOp.GREATER_OR_EQUAL, //greater than or equal
Bytes.toBytes("valueToCompare")
);
scan.setFilter( filter );
/*
To combine multiple filters:
FilterList filterList = new FilterList();
filterList.addFilter( filter );
scan.setFilter( filterList );
*/
try {
ResultScanner scanner = table.getScanner( scan );
for (Result result:scanner) {
//Parse each row
System.out.println("RowKey:"+Bytes.toString( result.getRow() ));
for (Cell cell:result.rawCells()) {
String cf = Bytes.toString(CellUtil.cloneFamily( cell ));
String field = Bytes.toString(CellUtil.cloneQualifier( cell ));
String value = Bytes.toString(CellUtil.cloneValue( cell ));
System.out.println(cf+":"+field +"->" +value);
}
System.out.println("---------------------------------");
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
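One caveat: by default SingleColumnValueFilter still lets through rows that do not contain the tested column at all, and the comparison is byte-lexicographic, so string-encoded numbers only compare correctly when zero-padded. The sketch below, using the same illustrative columns, shows setFilterIfMissing together with a FilterList that ANDs two conditions:
private static void applyFilters(Scan scan) {
    SingleColumnValueFilter amtFilter = new SingleColumnValueFilter(
            Bytes.toBytes( "info" ), Bytes.toBytes( "order_amt" ),
            CompareFilter.CompareOp.GREATER_OR_EQUAL, Bytes.toBytes( "100" ) );
    //skip rows that lack info:order_amt entirely (the default is to include them)
    amtFilter.setFilterIfMissing( true );
    SingleColumnValueFilter idFilter = new SingleColumnValueFilter(
            Bytes.toBytes( "info" ), Bytes.toBytes( "order_id" ),
            CompareFilter.CompareOp.NOT_EQUAL, Bytes.toBytes( "" ) );
    idFilter.setFilterIfMissing( true );
    //MUST_PASS_ALL = logical AND; MUST_PASS_ONE = logical OR
    FilterList filterList = new FilterList( FilterList.Operator.MUST_PASS_ALL );
    filterList.addFilter( amtFilter );
    filterList.addFilter( idFilter );
    scan.setFilter( filterList );
}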
Tuning scan performance
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
public class B_ScanDemo {
public static void main(String[] args) {
//1. Load the configuration
Configuration conf = HBaseConfiguration.create();
//System.out.println(conf);
Connection conn = null;
Table table = null;
try {
//2. Open the connection
conn = ConnectionFactory.createConnection( conf );
//3. Get a handle to the HBase table; it supports CRUD operations on the table's data
table = conn.getTable( TableName.valueOf( "ns1:sale_orders" ) );
//4. Scan the data
scanData(table);
} catch (IOException e) {
e.printStackTrace();
}finally {
IOUtils.closeStream( table );
IOUtils.closeStream( conn);
}
}
private static void scanData(Table table) {
//Create a Scan instance
Scan scan = new Scan();
/**
 * Scan range
 * Row keys are matched by prefix (lexicographic byte order)
 * The returned range is [startRow, stopRow); the narrower the range, the faster the scan
 */
scan.setStartRow( Bytes.toBytes( "434017_2015-04-21 00:00:00" ) );
scan.setStopRow( Bytes.toBytes( "434017_2015-04-22 00:00:00" ) );
/**
 * Restrict the scan to specific column families and columns
 */
scan.addFamily( Bytes.toBytes("info") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("date") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("order_amt") );
scan.addColumn( Bytes.toBytes("info"),Bytes.toBytes("order_id") );
/**
 * Set a filter via scan.setFilter( filter ).
 * A value filter: a row matches only when the given column's value satisfies the comparison
 */
Filter filter = new SingleColumnValueFilter(
Bytes.toBytes("info"),
Bytes.toBytes("date"),
CompareFilter.CompareOp.GREATER_OR_EQUAL, //greater than or equal (byte-lexicographic, which works for this date format)
Bytes.toBytes("2015-04-21 07:35:10")
);
scan.setFilter( filter );
/*
To combine multiple filters:
FilterList filterList = new FilterList();
filterList.addFilter( filter );
scan.setFilter( filterList );
*/
/*****************************Scan tuning*****************************/
//Option 1: batch size -- how many columns of one row each Result carries; default -1 (unset).
//Batching is incompatible with row-level filters such as the SingleColumnValueFilter above,
//so it stays commented out here; use it only on filter-free scans over very wide rows
//scan.setBatch( 2 );
//Option 2: caching -- how many rows each RPC fetches from the RegionServer (the default varies
//by HBase version). Do not set it too large, since the fetched rows are buffered in client memory.
//For example, with caching=1000 one RPC pulls 1000 rows from the RegionServer; if the client
//only consumes 100 now, the remaining 900 are served from the client-side cache on later calls
scan.setCaching( 1 );
//Option 3: whether blocks read by this scan are kept in the RegionServer block cache.
//For a one-off scan like this one, disable it so the scan does not evict hot data
scan.setCacheBlocks( false );
try {
ResultScanner scanner = table.getScanner( scan );
for (Result result:scanner) {
//Parse each row
System.out.println("RowKey:"+Bytes.toString( result.getRow() ));
for (Cell cell:result.rawCells()) {
String cf = Bytes.toString(CellUtil.cloneFamily( cell ));
String field = Bytes.toString(CellUtil.cloneQualifier( cell ));
String value = Bytes.toString(CellUtil.cloneValue( cell ));
System.out.println(cf+":"+field +"->" +value);
}
System.out.println("---------------------------------");
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
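As a rule of thumb (a hedged sketch, not a universal recipe): for a one-off batch scan, raise caching moderately, cap the bytes a single RPC may return, leave batching to filter-free scans over very wide rows, and keep block caching off:
private static void tuneBatchScan(Scan scan) {
    //fewer RPC round-trips; the fetched rows wait in client memory
    scan.setCaching( 100 );
    //cap the bytes a single RPC may return so wide rows cannot exhaust the client heap
    scan.setMaxResultSize( 2 * 1024 * 1024 );
    //a one-off scan should not evict hot data from the RegionServer block cache
    scan.setCacheBlocks( false );
}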
Creating a compressed, pre-split table
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
public class C_CreateTableDemo {
public static void main(String[] args) {
//1. Load the configuration
Configuration conf = HBaseConfiguration.create();
//System.out.println(conf);
Connection conn = null;
Table table = null;
HBaseAdmin admin = null;
try {
//2. Open the connection
conn = ConnectionFactory.createConnection( conf );
admin = (HBaseAdmin) conn.getAdmin();
//Create the namespace (createNamespace throws an exception if it already exists)
String namespace = "ns2";
NamespaceDescriptor nsDesc = NamespaceDescriptor.create( namespace ).build();
admin.createNamespace( nsDesc );
/**
 * Create the table: name and column family,
 * with compression, pre-split regions, and a BLOCKCACHE setting
 */
String tableName = namespace+":stu_info";
//If the table already exists, drop it
if(admin.tableExists( tableName )){
//disable it first
admin.disableTable( tableName );
//then delete it
admin.deleteTable( tableName );
}
//HTableDescriptor: table name + column families
//a. table descriptor
HTableDescriptor desc = new HTableDescriptor( TableName.valueOf( tableName ) );
//b. column family descriptor
HColumnDescriptor family = new HColumnDescriptor( "info" );
//Column family properties
//Property 1: whether blocks read from this family are cached; often false for user-defined tables
family.setBlockCacheEnabled( false );
//Property 2: compression (SNAPPY support must be installed on the cluster)
family.setCompressionType( Compression.Algorithm.SNAPPY );
//Property 3: min/max versions kept per cell; reads return the newest version by default.
//The default max is 1, so only one version of each cell is retained
family.setVersions( 1,3 );
//Attach the column family to the table
desc.addFamily( family );
//Pre-split the table into five regions at these split keys
byte[][] splitKeys = {
Bytes.toBytes("1"),Bytes.toBytes("3"),
Bytes.toBytes("5"),Bytes.toBytes("8")
};
admin.createTable( desc,splitKeys );
} catch (IOException e) {
e.printStackTrace();
}finally {
IOUtils.closeStream( admin);
IOUtils.closeStream( conn);
}
}
}
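To confirm the pre-splits took effect, the regions can be listed right after creation. A sketch using HBaseAdmin#getTableRegions from the 1.x API (HRegionInfo is covered by the org.apache.hadoop.hbase.* import above):
private static void printRegions(HBaseAdmin admin, String tableName) throws IOException {
    for (HRegionInfo region : admin.getTableRegions( TableName.valueOf( tableName ) )) {
        System.out.println( "start=" + Bytes.toString( region.getStartKey() )
                + " end=" + Bytes.toString( region.getEndKey() ) );
    }
}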
Inserting and deleting data
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class D_PutDemo {
public static void main(String[] args) {
//1. Load the configuration
Configuration conf = HBaseConfiguration.create();
//System.out.println(conf);
Connection conn = null;
Table table = null;
try {
//2. Open the connection
conn = ConnectionFactory.createConnection( conf );
//3. Get a handle to the HBase table; it supports CRUD operations on the table's data
table = conn.getTable( TableName.valueOf( "ns2:stu_info" ) );
//putData(table);  //run this first to insert the sample row
delete(table);
} catch (IOException e) {
e.printStackTrace();
}finally {
IOUtils.closeStream( table );
IOUtils.closeStream( conn);
}
}
/**
 * Insert data into the table.
 * Use case: store all columns of one record (one row) in HBase at once,
 * e.g. adding a user (id, name, age)
 */
private static void putData(Table table) {
//Mock data
HashMap<String, String> stuMap = new HashMap<>();
stuMap.put( "id","1122" );
stuMap.put( "name","zs22" );
stuMap.put( "age","1722" );
stuMap.put( "address","zhejaingjiaxing22" );
//Insert the data with a Put
//shell equivalent: put 'ns2:stu_info', rowkey, 'CF:Column', value
Put put = new Put(Bytes.toBytes( "100001" ));
byte [] cf = Bytes.toBytes( "info" );
for (Map.Entry<String,String> entry:stuMap.entrySet()) {
put.addColumn( cf, Bytes.toBytes( entry.getKey() ), Bytes.toBytes( entry.getValue() ) );
}
try {
//Single-row insert
table.put( put );
//For multi-row inserts use Table#put(List<Put>);
//see the putBatch sketch after this method
} catch (IOException e) {
e.printStackTrace();
}
}
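/**
 * A sketch of the multi-row variant mentioned above: Table#put(List<Put>) sends
 * all Puts in one batch, which is far cheaper than one RPC per row.
 * Row keys and values below are illustrative.
 */
private static void putBatch(Table table) {
    List<Put> puts = new ArrayList<>();
    for (int i = 1; i <= 3; i++) {
        Put put = new Put( Bytes.toBytes( "10000" + i ) );
        put.addColumn( Bytes.toBytes( "info" ), Bytes.toBytes( "name" ), Bytes.toBytes( "user" + i ) );
        puts.add( put );
    }
    try {
        table.put( puts );
    } catch (IOException e) {
        e.printStackTrace();
    }
}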
//Deleting data
/**
 * An HBase table has no immediate physical delete (running the command does not
 * remove the data right away); instead the cell is marked with a tombstone, so
 * queries no longer see it.
 *
 * The actual removal happens when each region runs a major compaction.
 * To keep small files from piling up, HBase merges small store files into
 * larger ones when needed; this process is called compaction.
 * MajorCompaction: rewrites all store files of a store into one and purges
 * deleted and expired cells
 * minorCompaction: merges a few small store files without purging deletes
 *
 * @param table
 */
private static void delete(Table table){
Delete delete = new Delete(Bytes.toBytes( "100001" ));
//delete.addFamily( Bytes.toBytes( "info" ) );  //would delete the whole column family
delete.addColumn( Bytes.toBytes( "info" ),Bytes.toBytes( "id" ) );  //deletes only the newest version of info:id
try {
table.delete( delete );
} catch (IOException e) {
e.printStackTrace();
}
}
}
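Also note the difference between the two column-delete calls: Delete#addColumn (singular, used above) marks only the newest version of a cell, so an older version can resurface after the delete, while Delete#addColumns (plural) marks every version. A short sketch:
private static void deleteAllVersions(Table table) throws IOException {
    Delete delete = new Delete( Bytes.toBytes( "100001" ) );
    //addColumns (plural) marks every version of info:name for deletion,
    //unlike addColumn (singular), which only marks the newest version
    delete.addColumns( Bytes.toBytes( "info" ), Bytes.toBytes( "name" ) );
    table.delete( delete );
}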