References:
http://www.cnblogs.com/lion.net/p/3932741.html
http://www.linuxidc.com/Linux/2013-11/92556.htm
Step 1: Querying the Hive data requires hive-contrib.jar, so add it to the session first.
hive> add jar /opt/cloudera/parcels/CDH-5.5.4-1.cdh5.5.4.p0.9/lib/hive/lib/hive-contrib.jar;
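To confirm the jar was registered in the current session, the Hive CLI can list the session's added resources (a quick sanity check, not a required step):
hive> list jars;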
Step 2: Create the Hive log table and sync it to HBase.
1) Create RowSequence.java, a Java class that generates auto-increment IDs, and add it to the Hive environment.
Create a Maven project in Eclipse and use Maven to package it as a jar; the build needs hadoop-common-2.2.0.jar and hive-exec-0.13.1.jar as dependencies. Export the result as hive_udf.jar.
package org.rowsequence;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.LongWritable;

/**
 * RowSequence: a stateful UDF that returns an increasing sequence number.
 * Marked non-deterministic so Hive does not cache or constant-fold the result.
 */
@Description(name = "row_sequence",
    value = "_FUNC_() - Returns a generated row sequence number starting from 1")
@UDFType(deterministic = false)
public class RowSequence extends UDF {

  // Counter state; one instance exists per task, so the sequence is
  // only unique within a single mapper/reducer.
  private LongWritable result = new LongWritable();

  public RowSequence() {
    result.set(0);
  }

  // Called once per row: increment the counter and return it.
  public LongWritable evaluate() {
    result.set(result.get() + 1);
    return result;
  }
}
2) Create the log table in Hive
First, add our custom hive_udf.jar to the Hive runtime environment:
hive> ADD JAR /home/hadoop/hive-0.13.1/lib/hive_udf.jar;
Next, register the auto-increment primary-key function:
hive> create temporary function row_sequence as 'org.rowsequence.RowSequence';
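To check that the function was registered, DESCRIBE FUNCTION prints the description string declared in the @Description annotation:
hive> describe function row_sequence;
-- row_sequence() - Returns a generated row sequence number starting from 1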
Next, create the table in Hive that will collect the Nginx logs (nginx_accesslog):
hive> create table nginx_accesslog (
        ipaddress STRING, identd STRING, user STRING, finishtime STRING,
        requestline STRING, returncode INT, size INT, referer STRING, agent STRING)
      ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe'
      WITH SERDEPROPERTIES (
        'serialization.format' = 'org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol',
        'quote.delim' = '("|\\[|\\])',
        'field.delim' = ' ',
        'serialization.null.format' = '-')
      STORED AS TEXTFILE LOCATION '/user/hive/warehouse/nginxlog';
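The delimiters above (space-separated fields, quotes and brackets as quote delimiters, '-' as the null marker) are aimed at the standard Nginx combined log format. A hypothetical example of the kind of line this table is meant to parse:
192.168.1.100 - - [05/Feb/2016:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 3456 "http://example.com/" "Mozilla/5.0"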
Next, upload the Nginx log to /opt.
Then load the log file into Hive:
hive> load data local inpath '/opt/access_log' overwrite into table nginx_accesslog;
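To verify the load, and to see row_sequence at work over real rows, a quick check (on a single-mapper job the sequence column should read 1 through 5):
hive> select * from nginx_accesslog limit 5;
hive> select row_sequence(), ipaddress, returncode from nginx_accesslog limit 5;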
3) Create a table in Hive that HBase can recognize. Once the statement runs, you can see that the h2b_nginx_accesslog table has been created automatically in HBase.
hive> CREATE TABLE h2b_nginx_accesslog (
        key int, ipaddress string, identd string, user string, finishtime string,
        requestline string, returncode string, size string, referer string, agent string)
      STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
      WITH SERDEPROPERTIES ("hbase.columns.mapping" =
        ":key,log:ipaddress,log:identd,log:user,log:finishtime,log:requestline,log:returncode,log:size,log:referer,log:agent")
      TBLPROPERTIES ("hbase.table.name" = "h2b_nginx_accesslog");
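In the mapping, :key binds the Hive key column to the HBase row key, and the remaining columns land in the log column family. To inspect how Hive recorded the storage handler and the mapping:
hive> describe formatted h2b_nginx_accesslog;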
4) Load the Hive log table (nginx_accesslog) into h2b_nginx_accesslog; the data is synced to HBase automatically.
hive> insert overwrite table h2b_nginx_accesslog
      select a.* from (
        select row_sequence(), ipaddress, identd, user, finishtime,
               requestline, returncode, size, referer, agent
        from nginx_accesslog) a;
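Once the insert finishes, the rows can be queried from Hive through the storage handler (and equally from the HBase shell). One caveat: each RowSequence instance counts independently per task, so a multi-mapper insert can emit duplicate keys, and duplicate HBase row keys silently overwrite one another; if the counts below differ, that is the likely cause.
hive> select * from h2b_nginx_accesslog limit 10;
hive> select count(*) from nginx_accesslog;
hive> select count(*) from h2b_nginx_accesslog;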