(Which jars you can use directly changes quite a bit between HBase versions. I'm on HBase 1.3.3; installation is covered in my previous post:)
https://blog.csdn.net/qq_40304825/article/details/91335063
If your Linux box is command-line only, I recommend FileZilla for transferring files between Windows and the Linux VM.
1. Results
A 256 MB log file (.log) on the Hadoop server
About 700,000 rows landed in HBase
The run output is shown below
Not bad, I think!
2. Requirements
Parse the files stored on Hadoop into HBase via Java code, and count the number of INFO and ERROR entries.
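The parsing code below splits each line on single spaces and assumes the usual Hadoop daemon log layout: a yyyy-MM-dd date (used to validate the line), a timestamp, the log level (which becomes the column qualifier), and the message (of which only the first token ends up as the cell value). Roughly a line like this purely illustrative one:

2019-06-10 10:31:05,002 INFO org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Roll Edit Log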
3. Overall steps
(1) Import the correct jar packages (this matters enough to say it three times: import the correct jar packages!)
You can load these jars from HBase's own lib directory, or download them from the mvnrepository site (though I often pick the wrong ones there, which gets painful).
(2) Create a Maven project
If you want to pull the jars in through a dependency repository, create a Maven project (a minimal pom sketch follows below); if you import them by hand, any plain Java project will do.
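For the Maven route, a minimal dependency sketch looks roughly like this. The hbase-client version matches my 1.3.3 install; the hadoop-client version is an assumption on my part, so match it to whatever your cluster actually runs:

<dependencies>
    <!-- HBase client API; match the version to your cluster (1.3.3 here) -->
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.3.3</version>
    </dependency>
    <!-- Hadoop client for HDFS access; 2.7.3 is an assumed version, use your cluster's -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.3</version>
    </dependency>
</dependencies>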
(3) The code
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class DataStorage {
    // HBaseConfiguration.create() is very different from a plain new Configuration():
    // it also loads hbase-default.xml and hbase-site.xml from the classpath
    private static Configuration configuration;
    TableName tableName;
    // The table name is passed to HTableDescriptor, which describes the table
    HTableDescriptor hTableDescriptor;
    // Connection, the API recommended since HBase 1.x; it manages its own pooling
    static Connection connection = null;
    // Thread pool: too many threads are hard to manage, too few defeats the purpose
    static ExecutorService executor;
    // Static initializer: startup is a bit slower, but afterwards all operations
    // share one connection and one configuration
    static {
        try {
            configuration = HBaseConfiguration.create();
            // The same parameters you set when configuring HBase; follow your hbase-site.xml
            configuration.set("hbase.zookeeper.quorum", "192.168.138.128");
            configuration.set("hbase.zookeeper.property.clientPort", "2181");
            // 60000 is the classic HMaster port; match this to your own hbase-site.xml
            configuration.set("hbase.master", "192.168.138.128:60000");
            // Create the thread pool, and build the connection on top of it
            executor = Executors.newFixedThreadPool(20);
            connection = ConnectionFactory.createConnection(configuration, executor);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    // Constructor: set the name of the table to operate on
    public DataStorage(String tableName, String[] familyName) {
        // Table name
        this.tableName = TableName.valueOf(tableName);
        // Table descriptor
        this.hTableDescriptor = new HTableDescriptor(this.tableName);
        // Column families
        for (String family : familyName)
            this.hTableDescriptor.addFamily(new HColumnDescriptor(family));
    }
    // Insert buffered data into HBase
    private void insertBufferToHbase(String family, BufferedReader bufferedReader) throws IOException {
        // HBase buffers writes client-side; bestBatchPutSize was tuned against that buffer size
        int bestBatchPutSize = 3177;
        // If Mutation is unfamiliar, read it as Put: Put extends Mutation
        List<Mutation> mutations = new ArrayList<Mutation>();
        // Correspondingly, this could also be a plain Table
        BufferedMutator table = connection.getBufferedMutator(hTableDescriptor.getTableName());
        try {
            while (true) {
                try {
                    // dataToPut (defined below) turns one line into a Put;
                    // it throws IndexOutOfBoundsException on EOF, when readLine() returns null
                    Put put = dataToPut(family, bufferedReader.readLine());
                    // Only add non-empty data
                    if (put != null)
                        mutations.add(put);
                    // Once the batch is full, write it out
                    if (mutations.size() == bestBatchPutSize) {
                        table.mutate(mutations);
                        table.flush();
                        mutations.clear();
                    }
                } catch (IndexOutOfBoundsException e) {
                    break;
                }
            }
            // Write whatever is left
            table.mutate(mutations);
            table.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always close streams, tables, and the like, or the thread pool may suffer
            if (null != table) {
                try {
                    table.close();
                } catch (IOException e) {
                    // ignore failures on close
                }
            }
            if (null != bufferedReader) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    // ignore failures on close
                }
            }
        }
    }
    // Read data from Hadoop and store it in HBase
    public void saveDataFromHadoopToHbase(String family, String path) {
        // The .log files all live in one directory, so we list it and iterate
        URI uri;
        FileSystem fileSystem;
        FSDataInputStream fsDataInputStream = null;
        BufferedReader bufferedReader = null;
        FileStatus[] fileStatus = null;
        try {
            uri = new URI(path);
            fileSystem = FileSystem.get(uri, configuration);
            // List the files under the Hadoop directory
            fileStatus = fileSystem.listStatus(new Path(uri));
            for (FileStatus file : fileStatus) {
                // Only process log files
                if (file.getPath().getName().contains(".log")) {
                    // Open the log file as a stream
                    fsDataInputStream = fileSystem.open(file.getPath());
                    bufferedReader = new BufferedReader(new InputStreamReader(fsDataInputStream));
                    // Hand the stream over for insertion into HBase
                    this.insertBufferToHbase(family, bufferedReader);
                }
            }
        } catch (URISyntaxException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the streams and buffers
            try {
                if (null != fsDataInputStream) {
                    fsDataInputStream.close();
                }
                if (null != bufferedReader) {
                    bufferedReader.close();
                }
            } catch (IOException e) {
                // ignore failures on close
            }
        }
    }
    // Simple insertion: write a list of Puts into HBase
    public void insertPutToHbase(List<Put> puts) {
        if (puts == null) {
            System.out.println("puts is null");
            return;
        }
        Table table = null;
        try {
            table = connection.getTable(hTableDescriptor.getTableName());
            table.put(puts);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (table != null)
                    table.close();
            } catch (IOException e) {
                // ignore failures on close
            }
        }
    }
    // Data cleaning: turn a well-formed line into a Put
    private Put dataToPut(String family, String data) {
        // data == null means EOF; the exception breaks the read loop above
        if (data == null) {
            throw new IndexOutOfBoundsException();
        }
        String[] fragment = data.split(" ");
        Put put = null;
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
        // The line needs at least date, time, level, and message fields
        if (fragment.length >= 4) {
            try {
                // Lines whose first field is not a date are silently dropped
                simpleDateFormat.parse(fragment[0]);
                // Rowkey: date + time, plus a constant suffix
                put = new Put(Bytes.toBytes(fragment[0] + fragment[1] + 5));
                // Column qualifier is the log level (INFO/ERROR), value is the message field
                put.addColumn(Bytes.toBytes(family), Bytes.toBytes(fragment[2]), Bytes.toBytes(fragment[3]));
                // Skip the WAL for faster writes, at the cost of durability
                put.setDurability(Durability.SKIP_WAL);
            } catch (ParseException e) {
                // not a data line; put stays null
            }
        }
        return put;
    }
    // Shut down the connection and the thread pool
    public void close() {
        try {
            if (null != connection) {
                connection.close();
            }
            if (null != executor) {
                executor.shutdown();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // Create the table
    public void createTable() throws IOException {
        Admin admin = connection.getAdmin();
        try {
            if (admin.tableExists(hTableDescriptor.getTableName())) {
                System.out.println("table already exists!");
            } else {
                admin.createTable(hTableDescriptor);
                System.out.println("create table successfully");
            }
        } finally {
            try {
                admin.close();
            } catch (IOException e) {
                // ignore failures on close
            }
        }
    }
    // Query data by column qualifier
    public int queryByColumn(String family, String column, boolean isPrint) {
        Table table = null;
        int count = 0;
        try {
            // Again operate through the shared connection
            table = connection.getTable(hTableDescriptor.getTableName());
            // Set up the filter; the rest is a normal HBase scan
            SingleColumnValueFilter filter = new SingleColumnValueFilter(Bytes.toBytes(family),
                    Bytes.toBytes(column), CompareFilter.CompareOp.NOT_EQUAL, Bytes.toBytes(" "));
            filter.setFilterIfMissing(true);
            Scan scan = new Scan();
            scan.setFilter(filter);
            ResultScanner rs = table.getScanner(scan);
            for (Result r : rs) {
                String rowkey = new String(r.getRow());
                String value = new String(r.getValue(Bytes.toBytes(family), Bytes.toBytes(column)));
                // Printing is optional: with this much data it hurts readability
                if (isPrint) {
                    System.out.println("rowkey:" + rowkey + "\tvalue:" + value);
                }
                count++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (null != table) {
                try {
                    table.close();
                } catch (IOException e) {
                    // ignore failures on close
                }
            }
        }
        return count;
    }
    public static void main(String[] args) {
        String tableName = "log";
        String[] family = {"style"};
        int count;
        DataStorage dataStorage = new DataStorage(tableName, family);
        String path = "hdfs://192.168.138.128:9000/Logs/";
        long time = System.currentTimeMillis();
        dataStorage.saveDataFromHadoopToHbase(family[0], path);
        long time1 = System.currentTimeMillis();
        System.out.println("Loading Hadoop files into HBase took: " + (time1 - time) + " ms");
        count = dataStorage.queryByColumn(family[0], "ERROR", false);
        long time2 = System.currentTimeMillis();
        System.out.println("ERROR row count: " + count);
        System.out.println("Counting ERROR took: " + (time2 - time1) + " ms");
        count = dataStorage.queryByColumn(family[0], "INFO", false);
        long time3 = System.currentTimeMillis();
        System.out.println("INFO row count: " + count);
        System.out.println("Counting INFO took: " + (time3 - time2) + " ms");
        // Shut down the connection and thread pool
        dataStorage.close();
    }
}
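To actually run this end to end, two prerequisites are worth spelling out; the paths and names here follow the code above, so adjust them to your setup. The log files must sit under /Logs on HDFS, which you can arrange with the standard HDFS shell (replace /path/to/local/logs with wherever your logs live):

hdfs dfs -mkdir -p /Logs
hdfs dfs -put /path/to/local/logs/*.log /Logs/

Also note that main() never calls createTable(), so the log table with its style column family has to exist already: either call dataStorage.createTable() once before loading, or create it from the HBase shell with create 'log','style'.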
Getting the versions right ate up a lot of time early on, and now I want to dig into coprocessors; I expect to cover that in a future post.