Add the jar files under the lib directory of the Hive installation to your Java project.
A simple UDF example
1. Write a Java class that extends UDF and overloads the evaluate method:
package com.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLowerCase extends UDF {

    // Hive passes a NULL column value in as null; guard against it so the
    // query does not fail with a NullPointerException on such rows
    public String evaluate(String field) {
        if (field == null) {
            return null;
        }
        return field.toLowerCase();
    }
}
2. Build the class into a jar and upload it to the server.
3. Add the jar to Hive's classpath:
hive>add JAR /home/hadoop/udf.jar;
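To confirm the jar was registered in the current session, you can list the jars Hive has loaded:
hive>list jars;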
4. Create a temporary function bound to the Java class you just wrote:
hive>create temporary function tolowercase as 'com.hive.udf.ToLowerCase';
5. The custom function tolowercase can now be used in HQL:
hive>select tolowercase('ANGEL') from dual;
(This assumes a one-row helper table named dual exists; since Hive 0.13 the FROM clause can simply be omitted: select tolowercase('ANGEL');)
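A temporary function disappears when the session ends. Hive 0.13+ also supports permanent functions backed by a jar on HDFS; a minimal sketch, assuming the jar has been copied to a hypothetical HDFS path /user/hadoop/udf.jar:
hive>create function tolowercase as 'com.hive.udf.ToLowerCase' using jar 'hdfs:///user/hadoop/udf.jar';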
Developing a JSON-parsing UDF
rating.json
{"movie":"260","rate":"4","timeStamp":"978300760","uid":"1"}
{"movie":"1028","rate":"5","timeStamp":"978301777","uid":"1"}
{"movie":"1029","rate":"5","timeStamp":"978302205","uid":"1"}
{"movie":"1207","rate":"4","timeStamp":"978300719","uid":"1"}
{"movie":"2028","rate":"5","timeStamp":"978301619","uid":"1"}
{"movie":"531","rate":"4","timeStamp":"978302149","uid":"1"}
{"movie":"3114","rate":"4","timeStamp":"978302174","uid":"1"}
{"movie":"608","rate":"4","timeStamp":"978301398","uid":"1"}
{"movie":"1246","rate":"4","timeStamp":"978302091","uid":"1"}
{"movie":"1357","rate":"5","timeStamp":"978298709","uid":"2"}
{"movie":"3068","rate":"4","timeStamp":"978299000","uid":"2"}
{"movie":"1537","rate":"4","timeStamp":"978299620","uid":"2"}
{"movie":"647","rate":"3","timeStamp":"978299351","uid":"2"}
{"movie":"2194","rate":"4","timeStamp":"978299297","uid":"2"}
{"movie":"648","rate":"4","timeStamp":"978299913","uid":"2"}
{"movie":"2268","rate":"5","timeStamp":"978299297","uid":"2"}
{"movie":"2628","rate":"3","timeStamp":"978300051","uid":"2"}
{"movie":"1103","rate":"3","timeStamp":"978298905","uid":"2"}
{"movie":"2916","rate":"3","timeStamp":"978299809","uid":"2"}
Create the rat_json table and load the data into it:

create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
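A quick sanity check that each JSON line landed in the single line column:
hive>select * from rat_json limit 3;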
1. Write a Java class that extends UDF and overloads the evaluate method:

package com.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.codehaus.jackson.map.ObjectMapper;

public class JsonParser extends UDF {

    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            // Map the JSON line onto the bean, then flatten it into a tab-separated string
            MovieRateBean bean = objectMapper.readValue(jsonLine, MovieRateBean.class);
            return bean.toString();
        } catch (Exception e) {
            // A malformed line falls through to the empty string instead of failing the whole query
            e.printStackTrace();
        }
        return "";
    }
}
package com.hive.udf;

public class MovieRateBean {

    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    // Tab-separated output; the split(...,'\t') calls in the HQL below rely on this format
    @Override
    public String toString() {
        return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
    }
}
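Before packaging, it can help to smoke-test the parser off-cluster. A minimal sketch (the JsonParserTest class is illustrative, not part of the project above; it only needs the Jackson jar on the classpath):

package com.hive.udf;

public class JsonParserTest {
    public static void main(String[] args) {
        JsonParser parser = new JsonParser();
        // One line taken from rating.json; expected output: 260	4	978300760	1
        String line = "{\"movie\":\"260\",\"rate\":\"4\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}";
        System.out.println(parser.evaluate(line));
    }
}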
2. Build the two classes into a jar and upload it to the server.
3. Add the jar to Hive's classpath:
hive>add JAR /home/hadoop/JsonParser.jar;
4. Create a temporary function bound to the Java class:
hive>create temporary function JsonParser as 'com.hive.udf.JsonParser';
5. The custom function JsonParser can now be used in HQL:
hive>select JsonParser(line) from rat_json;
Split the parsed JSON strings into separate columns in another table:

drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';

insert overwrite table t_rating
select split(JsonParser(line),'\t')[0] as movieid,
       split(JsonParser(line),'\t')[1] as rate,
       split(JsonParser(line),'\t')[2] as timestring,
       split(JsonParser(line),'\t')[3] as uid
from rat_json limit 10;
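To spot-check the result:
hive>select * from t_rating limit 5;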
Using Hive's built-in JSON function instead:

select get_json_object(line,'$.movie') as movie,get_json_object(line,'$.rate') as rate from rat_json limit 10;
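The built-in json_tuple UDTF parses each line only once, so it is usually preferable to calling get_json_object repeatedly on the same column:
select json_tuple(line,'movie','rate','timeStamp','uid') as (movieid,rate,timestring,uid) from rat_json limit 10;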
-----------
Transform example:
1. Load rating.json into the raw Hive table rat_json (as above):
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
2. Parse the JSON into four fields and insert them into the table t_rating:
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as timestring,
       get_json_object(line,'$.uid') as uid
from rat_json;
3. Use TRANSFORM plus a Python script to convert the Unix timestamp into a weekday.
First write the Python script (Python 2 syntax):
vi weekday_mapper.py

#!/usr/bin/python
# Read tab-separated rows from stdin and replace the Unix timestamp
# with its ISO weekday (1 = Monday ... 7 = Sunday)
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])
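Because TRANSFORM simply pipes rows through stdin/stdout, the script can be smoke-tested from the shell before involving Hive, e.g. with one hand-written row (the weekday printed depends on your local timezone):
printf '1193\t5\t978300760\t1\n' | python weekday_mapper.py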
Save the file, then register it with Hive so it is shipped to the task nodes:
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>create table u_data_new as
select
  transform (movieid, rate, timestring, uid)
  using 'python weekday_mapper.py'
  as (movieid, rate, weekday, uid)
from t_rating;

select distinct(weekday) from u_data_new limit 10;