背景:
由于数据量非常大,客户需要对过期的数据进行清理,例如:hbase表中有2017年与2016年的数据,现在需要将2016年的数据进行清理,即批量删除操作。又因为hbase在删除方面功能较弱,仅提供单行删除功能。
- 接到这个要求后,通过查找资料,想到三条路线:
- 利用hbase shell命令来删除;
- 利用hbase-java的api;
- 利用容灾备份export及import工具;
- 利用mapreduce来删除
- 本文主要使用java的api进行清理,步骤如下
第一节 清理代码
本次操作的数据如图1所示,可以发现,表rowkey中含有日期时间,所以在批量删除时采用rowkey的时间来过滤。
代码第一版,以A表为例,删除2016年的数据,则在下面代码中的args[0]设为:2016即可
package Test;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.SubstringComparator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Deletes rows from the fixed HBase table "A" whose rowkey contains a given
 * substring (e.g. a year such as "2016"), using a full scan with a
 * RowFilter + SubstringComparator, then a batched delete.
 */
public class ClearDatafromHBase {

    private static Configuration config = HBaseConfiguration.create();
    private static HTable tableEvent = null;

    /**
     * Builds one Delete per row produced by the scanner.
     * The scanner is always closed, even if iteration throws.
     *
     * @param rs open scanner over the rows to remove; closed by this method
     * @return list of Delete operations, one per scanned row (possibly empty)
     */
    public static List<Delete> getDeleteList(ResultScanner rs) {
        List<Delete> list = new ArrayList<Delete>();
        try {
            for (Result result : rs) {
                list.add(new Delete(result.getRow()));
            }
        } finally {
            rs.close();
        }
        return list;
    }

    /**
     * Scans table "A" for rowkeys containing {@code string} and deletes the
     * matching rows in one batch call. IOExceptions are logged, not rethrown,
     * so a failed scan/delete leaves the table untouched rather than aborting
     * the JVM.
     *
     * @param string substring to match against each rowkey (e.g. "2016")
     */
    public static void deleteRowkeyOfEvent(String string) {
        try {
            tableEvent = new HTable(config, "A");
            RowFilter rf = new RowFilter(CompareFilter.CompareOp.EQUAL,
                    new SubstringComparator(string));
            Scan scan = new Scan();
            scan.setFilter(rf);
            ResultScanner scanner = tableEvent.getScanner(scan);
            List<Delete> list = getDeleteList(scanner);
            // delete(List) with an empty list is a wasted RPC; skip it.
            if (!list.isEmpty()) {
                tableEvent.delete(list);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (null != tableEvent) {
                try {
                    tableEvent.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void main(String[] args) throws IOException {
        // BUGFIX: the original read args[0] unconditionally and crashed with
        // ArrayIndexOutOfBoundsException when no argument was supplied.
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException(
                    "usage: ClearDatafromHBase <rowkey-substring>");
        }
        // BUGFIX: the methods are static; no instance is needed.
        deleteRowkeyOfEvent(args[0]);
    }
}
代码第二版,能够指定表名和时间,运行方法与第一版相同,只是参数[0]表示表名,参数[1]表示时间
package Test;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.filter.SubstringComparator;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Created by lihao on 2017/8/9.
* Clear the databases of timeout from hbase
*/
/**
 * Created by lihao on 2017/8/9.
 * Clear the databases of timeout from hbase.
 *
 * Deletes expired rows from an HBase table in the "razor" namespace by
 * matching a substring (typically a year, e.g. "2016") against the rowkey.
 *
 * Usage: args[0] = table name (namespace prefix added automatically),
 *        args[1] = rowkey time substring to match and delete.
 */
public class ClearDatafromHBase {

    private static String nsPrefix = "razor:";
    private static Logger logger = LoggerFactory.getLogger(ClearDatafromHBase.class);
    private static HConnection hBaseConn;
    private static Configuration config = null;
    private static HTableInterface table = null;

    static {
        try {
            config = HBaseConfiguration.create();
            // Shared connection for the lifetime of the process.
            hBaseConn = HConnectionManager.createConnection(config);
        } catch (IOException e) {
            logger.error("创建HBase连接失败", e);
        }
    }

    /**
     * Opens the target table over the shared connection and stores it in the
     * static {@code table} field used by the delete path.
     *
     * @param tableName fully-qualified table name (including namespace prefix)
     * @throws IOException if the table cannot be opened
     */
    public static void init(String tableName) throws IOException {
        table = hBaseConn.getTable(tableName);
        TableName name = table.getName();
        System.out.println(name);
    }

    /**
     * Builds one Delete per row produced by the scanner.
     * The scanner is always closed, even if iteration throws.
     *
     * @param rs open scanner over the rows to remove; closed by this method
     * @return list of Delete operations, one per scanned row (possibly empty)
     */
    public static List<Delete> getDeleteList(ResultScanner rs) {
        List<Delete> list = new ArrayList<Delete>();
        try {
            for (Result result : rs) {
                list.add(new Delete(result.getRow()));
            }
        } finally {
            rs.close();
        }
        return list;
    }

    /**
     * Creates a scan whose RowFilter keeps only rows whose rowkey contains
     * {@code string}.
     *
     * @param string substring to match against each rowkey
     * @return configured Scan ready to hand to a table scanner
     */
    public static Scan getScannerByRowkey(String string) {
        Scan scan = new Scan();
        RowFilter rf = new RowFilter(CompareFilter.CompareOp.EQUAL,
                new SubstringComparator(string));
        scan.setFilter(rf);
        return scan;
    }

    /**
     * Runs the scan, collects the matching rows and deletes them in one batch.
     * Closes {@code table} in all cases (the later {@link #close} call is then
     * a harmless no-op double-close guard).
     *
     * @param scan  pre-filtered scan selecting the rows to delete
     * @param table open table to scan and delete from; closed on return
     * @param date  time substring being cleared (informational)
     */
    public void clearData(Scan scan, HTableInterface table, String date) {
        ResultScanner resultScan = null;
        try {
            resultScan = table.getScanner(scan);
            List<Delete> list = getDeleteList(resultScan);
            // delete(List) with an empty list is a wasted RPC; skip it.
            if (!list.isEmpty()) {
                table.delete(list);
            }
        } catch (IOException e) {
            logger.error("清理数据失败, date={}", date, e);
        } finally {
            if (null != table) {
                try {
                    table.close();
                } catch (IOException e) {
                    logger.error("关闭表失败", e);
                }
            }
        }
    }

    /**
     * Convenience entry point: scans the statically initialized table for
     * rowkeys containing {@code string} and deletes the matches.
     */
    public void deleteOfHbase(String string) {
        clearData(getScannerByRowkey(string), table, string);
    }

    /** Closes the table quietly; safe to call with null or an already-closed table. */
    public static void close(HTableInterface table) {
        if (table != null) {
            try {
                table.close();
            } catch (IOException e) {
                logger.error("关闭表失败", e);
            }
        }
    }

    public static void main(String[] args) {
        // BUGFIX: validate BEFORE dereferencing args[0]/args[1]; the original
        // read args[0] first, so a missing argument threw
        // ArrayIndexOutOfBoundsException and the guard never ran. Two
        // arguments (table name and time) are required, so check length < 2.
        if (args == null || args.length < 2) {
            logger.error("输入参数错误");
            throw new RuntimeException("输入参数错误");
        }
        String tablename = nsPrefix + args[0];
        String time = args[1];
        logger.info("开始清理数据");
        try {
            init(tablename);
            ClearDatafromHBase dh = new ClearDatafromHBase();
            dh.deleteOfHbase(time);
            logger.info("数据清理结束");
        } catch (IOException e) {
            logger.error("清理数据失败", e);
        } finally {
            close(table);
        }
    }
}
第二节 按照时间戳清理
如果rowkey中不含有时间标志的话,可以根据时间戳的范围进行删除。
- example of code
/**
 * Deletes every row of {@code tableName} whose cells carry a timestamp in
 * the half-open range [minTime, maxTime). All failures are printed and
 * swallowed; the table and connection are released in all cases.
 *
 * @param tableName name of the table to purge
 * @param minTime   inclusive lower timestamp bound (ms)
 * @param maxTime   exclusive upper timestamp bound (ms)
 */
public static void deleteTimeRange(String tableName, Long minTime, Long maxTime) {
    Table target = null;
    Connection conn = null;
    try {
        // Restrict the scan to the requested timestamp window.
        Scan rangeScan = new Scan();
        rangeScan.setTimeRange(minTime, maxTime);
        conn = HBaseOperator.getHbaseConnection();
        target = conn.getTable(TableName.valueOf(tableName));
        ResultScanner scanner = target.getScanner(rangeScan);
        List<Delete> pending = getDeleteList(scanner);
        if (pending.size() > 0) {
            target.delete(pending);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close the table before the connection, ignoring close failures.
        if (null != target) {
            try {
                target.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Collects one Delete operation per row yielded by the scanner.
 * The scanner is closed unconditionally via finally.
 *
 * @param rs open scanner; closed by this method
 * @return deletes for every scanned row (possibly empty)
 */
private static List<Delete> getDeleteList(ResultScanner rs) {
    List<Delete> deletes = new ArrayList<>();
    try {
        for (Result row : rs) {
            deletes.add(new Delete(row.getRow()));
        }
    } finally {
        rs.close();
    }
    return deletes;
}
参考文献:
- http://blog.csdn.net/qq_27593415/article/details/53167980
- http://blog.csdn.net/u011518678/article/details/50805036
- http://bbs.csdn.net/topics/390630222
- http://blog.csdn.net/songchunhong/article/details/51898143
- http://www.aboutyun.com/thread-8306-1-1.html
- http://blog.csdn.net/qqpy789/article/details/52486964
第三节 总结
- 在这个过程中总结的相关知识点
-
这里是列表文本hbase连接对表的操作
-
hbase shell命令的使用
-
hbase filter的使用
-
hbase容灾备份
-
hbase的mapreduce任务
本文仅工作之余所做初版,后期会进行修改及更新操作,如有转载,请标明出处.