- 现有原始 json 数据(rating.json)如下
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
- 创建json表并将数据导入进去
create table json(data string);
load data local inpath '/home/hadoop/json.txt' into table json;
- 创建 rate 这张表用来存储解析 json 出来的字段:
create table rate(movie int, rate int, unixtime int, userid int) row format delimited fields
terminated by '\t';
insert overwrite table rate
select
get_json_object(data,'$.movie') as moive,
get_json_object(data,'$.rate') as rate,
get_json_object(data,'$.timeStamp') as unixtime,
get_json_object(data,'$.uid') as userid
from json;
hive (default)> select * from rate;
OK
rate.movie rate.rate rate.unixtime rate.userid
1193 5 978300760 1
661 3 978302109 1
914 3 978301968 1
3408 4 978300275 1
2355 5 978824291 1
1197 3 978302268 1
1287 5 978302039 1
2804 5 978300719 1
594 4 978302268 1
-
将结果数据保存到rate.txt 用作本地测试
-
使用 transform+python 的方式去转换 unixtime 为 weekday 保存为weekday_mapper.py 使用
python3的语法
python2报错,不知道什么原因
import sys
import datetime
def main():
try:
for line in sys.stdin:
line = line.strip()
# print(line)
movie, rate, unixtime, userid = line.split('\t')
weekday = datetime.datetime.fromtimestamp(
float(unixtime)).isoweekday()
# print str(weekday)
print('\t'.join([movie, rate, str(weekday), userid]))
except Exception as e:
print(e)
return None
if __name__ == '__main__':
main()
- 本地测试
python weekday_mapper.py < rate.txt
- 创建最后的用来存储调用 python 脚本解析出来的数据的表:lastjsontable
create table lastjsontable(movie int, rate int, weekday int, userid int) row format delimited
fields terminated by '\t';
- 上传到hive环境中
hive (default)> add file /Users/liuzhiwei/Desktop/weekday_mapper.py;
Added resources: [/Users/liuzhiwei/Desktop/weekday_mapper.py]
- 执行
hive (default)> insert into table lastjsontable select transform(movie,rate,unixtime,userid)
> using 'python weekday_mapper.py' as(movie,rate,weekday,userid) from rate;
- 结果
hive (default)> select * from lastjsontable;
OK
lastjsontable.movie lastjsontable.rate lastjsontable.weekday lastjsontable.userid
1193 5 1 1
661 3 1 1
914 3 1 1
3408 4 1 1
2355 5 7 1
1197 3 1 1
1287 5 1 1
2804 5 1 1
594 4 1 1
Time taken: 0.074 seconds, Fetched: 9 row(s)
hive (default)>
one more something
transform 不要多用
转自:https://blog.csdn.net/purisuit_knowledge/article/details/81565295