hive UDF开发实例

把 hive 安装目录下的lib 添加到java项目中


简单UDF示例
1、先开发一个java类,继承UDF,并重载evaluate方法

package com.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLowerCase extends UDF {

	public String evaluate(String field) {
		String result = field.toLowerCase();
		return result;
	}

}

2、打成jar包上传到服务器

3、将jar包添加到hive的classpath

hive>add JAR /home/hadoop/udf.jar;

4.创建临时函数与开发好的java class关联

hive>create temporary function tolowercase as 'com.hive.udf.ToLowerCase'
5、即可在hql中使用自定义的函数  tolowercase

hive>select tolowercase('ANGEL') from dual;

Json数据解析UDF开发

rating.json

{"movie":"260","rate":"4","timeStamp":"978300760","uid":"1"}
{"movie":"1028","rate":"5","timeStamp":"978301777","uid":"1"}
{"movie":"1029","rate":"5","timeStamp":"978302205","uid":"1"}
{"movie":"1207","rate":"4","timeStamp":"978300719","uid":"1"}
{"movie":"2028","rate":"5","timeStamp":"978301619","uid":"1"}
{"movie":"531","rate":"4","timeStamp":"978302149","uid":"1"}
{"movie":"3114","rate":"4","timeStamp":"978302174","uid":"1"}
{"movie":"608","rate":"4","timeStamp":"978301398","uid":"1"}
{"movie":"1246","rate":"4","timeStamp":"978302091","uid":"1"}
{"movie":"1357","rate":"5","timeStamp":"978298709","uid":"2"}
{"movie":"3068","rate":"4","timeStamp":"978299000","uid":"2"}
{"movie":"1537","rate":"4","timeStamp":"978299620","uid":"2"}
{"movie":"647","rate":"3","timeStamp":"978299351","uid":"2"}
{"movie":"2194","rate":"4","timeStamp":"978299297","uid":"2"}
{"movie":"648","rate":"4","timeStamp":"978299913","uid":"2"}
{"movie":"2268","rate":"5","timeStamp":"978299297","uid":"2"}
{"movie":"2628","rate":"3","timeStamp":"978300051","uid":"2"}
{"movie":"1103","rate":"3","timeStamp":"978298905","uid":"2"}
{"movie":"2916","rate":"3","timeStamp":"978299809","uid":"2"}


创建rat_json表并且导入
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;

开发一个java类,继承UDF,并重载evaluate方法

package com.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.codehaus.jackson.map.ObjectMapper;

public class JsonParser extends UDF {

	public String evaluate(String jsonLine) {
		ObjectMapper objectMapper = new ObjectMapper();

		try {
			MovieRateBean bean = objectMapper.readValue(jsonLine, MovieRateBean.class);
			return bean.toString();

		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return "";
	}

}

package com.hive.udf;

public class MovieRateBean {
	private String movie;
	private String rate;
	private String timeStamp;
	private String uid;

	public String getMovie() {
		return movie;
	}

	public void setMovie(String movie) {
		this.movie = movie;
	}

	public String getRate() {
		return rate;
	}

	public void setRate(String rate) {
		this.rate = rate;
	}

	public String getTimeStamp() {
		return timeStamp;
	}

	public void setTimeStamp(String timeStamp) {
		this.timeStamp = timeStamp;
	}

	public String getUid() {
		return uid;
	}

	public void setUid(String uid) {
		this.uid = uid;
	}

	@Override
	public String toString() {
		return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
	}

}

2、打成jar包上传到服务器
3、将jar包添加到hive的classpath
hive>add JAR /home/hadoop/JsonParser.jar;
4、创建临时函数与开发好的java class关联
Hive>create temporary function JsonParser as 'com.hive.udf.JsonParser';
5、即可在hql中使用自定义的函数JsonParser

select JsonParser(line) from rat_json;


把存储json字符串数据分割存储到其他表
drop table if exists t_rating;
create table t_rating(movieid string,rate int,timestring string,uid string)
row format delimited fields terminated by '\t';
insert overwrite table t_rating
select split(JsonParser(line),'\t')[0]as movieid,split(JsonParser(line),'\t')[1] as rate,split(JsonParser(line),'\t')[2] as timestring,split(JsonParser(line),'\t')[3] as uid from rat_json limit 10;

使用hive内置jason函数
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate  from rat_json limit 10;

-----------
transform案例:


1、先加载rating.json文件到hive的一个原始表 rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;


2、需要解析json数据成四个字段,插入一张新的表 t_rating
insert overwrite table t_rating
select get_json_object(line,'$.movie') as moive,get_json_object(line,'$.rate') as rate  from rat_json;


3、使用transform+python的方式去转换unixtime为weekday
先编辑一个python脚本文件
########python######代码
vi weekday_mapper.py
#!/bin/python
import sys
import datetime


for line in sys.stdin:
  line = line.strip()
  movieid, rating, unixtime,userid = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([movieid, rating, str(weekday),userid])


保存文件
然后,将文件加入hive的classpath:
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>create TABLE u_data_new as
SELECT
  TRANSFORM (movieid, rate, timestring,uid)
  USING 'python weekday_mapper.py'
  AS (movieid, rate, weekday,uid)
FROM t_rating;


select distinct(weekday) from u_data_new limit 10;




  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值