Parsing JSON with a UDF

Hive is a data warehouse tool built on Hadoop. It maps structured data files onto database tables and provides SQL-like query capabilities. This article walks through some basic Hive operations; corrections are welcome if you spot any mistakes.


Common syntax

#Show information
show tables;
show databases;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
#Create a database
create database test_db;
#Drop a database
drop database db_name;
#Drop a table
drop table table_name;
#Rename a table
ALTER TABLE table_name RENAME TO new_table_name;
#Truncate a table's data
truncate table table_name;

CREATE TABLE syntax

CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
  [(col_name data_type [COMMENT col_comment], ...)]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...)
    [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [ROW FORMAT row_format]
  [STORED AS file_format]
  [LOCATION hdfs_path]

Creating an internal (managed) table

create table if not exists my_tb(id int, name string)
row format delimited fields terminated by ',';

Creating an external table

#An external table must specify the path where its data lives
create external table if not exists my_ex_tb(id int, name string)
row format delimited fields terminated by ','
location 'hdfs://192.168.38.3:9000/externdb/my_ex_tb/';

When a table is dropped, a managed (internal) table's metadata and data are deleted together, whereas dropping an external table removes only the metadata and leaves the data files in place. A quick way to check this is sketched below.
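A minimal sketch for verifying this, reusing the my_ex_tb table and HDFS path from the example above; the Table Type field printed by desc formatted tells managed and external tables apart:

-- Managed or external? Look at the "Table Type" field (MANAGED_TABLE vs EXTERNAL_TABLE)
desc formatted my_ex_tb;
-- Dropping the external table removes only its metadata...
drop table my_ex_tb;
-- ...the data files are still on HDFS (dfs commands can be run from the Hive CLI)
dfs -ls hdfs://192.168.38.3:9000/externdb/my_ex_tb/;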

Loading data into a table's directory

#Append to the table (into) or overwrite the whole table (overwrite)
load data local inpath '/root/1.txt' into table my_ex_tb;
load data local inpath '/root/1.txt' overwrite into table my_ex_tb;

Creating a partitioned table

create table if not exists my_par_tb(id int, name string)
partitioned by(country string)
row format delimited fields terminated by ',';
load data local inpath '/root/1.txt' into table my_par_tb partition(country='China');
load data local inpath '/root/1.txt.us' into table my_par_tb partition(country='US');
#Contents of 1.txt
1,张三
2,李四
3,王五
#Contents of 1.txt.us
1,张三
2,李四
3,王五
#Output of "select * from my_par_tb"
1 张三 China
2 李四 China
3 王五 China
1 张三 US
2 李四 US
3 王五 US
#Query the data of a single partition
select * from my_par_tb where country='China';
1 张三 China
2 李四 China
3 王五 China

Adding and dropping partitions

#Add partitions
alter table my_par_tb add partition(country='Eng') partition(country='Ame');
#Drop partitions
alter table my_par_tb drop partition(country='Eng'), partition(country='Ame');
#Show the table's partitions
show partitions my_par_tb;
country=China
country=US

Creating a bucketed table

create table if not exists my_buck_tb(id int, name string)
clustered by(id) sorted by(id)
into 4 buckets
row format delimited fields terminated by ',';
#Enable bucketing
set hive.enforce.bucketing=true;
#Use as many reducers as buckets, so the rows selected from the source table are split across one output file per bucket
set mapreduce.job.reduces=4;
#Fill the bucketed table from a query against my_tb
insert into table my_buck_tb
#cluster by(id) partitions and sorts the map output by id (cluster by is equivalent to distribute by + sort by; see the sketch after this block)
select id,name from my_tb cluster by(id);
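As a rough illustration of that equivalence, the same insert can be written with distribute by + sort by instead of cluster by (a sketch reusing my_tb and my_buck_tb from this section):

set hive.enforce.bucketing=true;
set mapreduce.job.reduces=4;
-- distribute rows across reducers by id, then sort within each reducer: the same effect as cluster by(id)
insert into table my_buck_tb
select id, name from my_tb
distribute by (id) sort by (id);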

Saving query results

By default query results are only printed to the screen; they can also be saved into a table or to a directory.

#Save the results into a newly created table
create table tmp_tb as select * from my_tb;
#Save the results into an existing table
insert into table tmp_tb select * from my_tb;
#Save the results to a directory (local or on HDFS)
#Local
insert overwrite local directory '/root/out_tb/'
select * from my_tb;
#HDFS
insert overwrite directory '/out_tb/'
select * from my_tb;

Join operations

Data in table a:
1,张三
2,李四
3,c
4,a
5,e
6,r
Data in table b:
1,绿间
3,青峰
4,黑子
9,红发
Create the tables:
create table a(id int,name string)
row format delimited fields terminated by ',';
create table b(id int,name string)
row format delimited fields terminated by ',';
Load the data:
load data local inpath '/root/a.txt' into table a;
load data local inpath '/root/b.txt' into table b;
#Inner join (intersection)
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 张三 | 1 | 绿间 |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
+-------+---------+-------+---------+--+
#Left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 张三 | 1 | 绿间 |
| 2 | 李四 | NULL | NULL |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
| 5 | e | NULL | NULL |
| 6 | r | NULL | NULL |
+-------+---------+-------+---------+--+
#Right join
select * from a right join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 张三 | 1 | 绿间 |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
| NULL | NULL | 9 | 红发 |
+-------+---------+-------+---------+--+
#Full outer join
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 张三 | 1 | 绿间 |
| 2 | 李四 | NULL | NULL |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
| 5 | e | NULL | NULL |
| 6 | r | NULL | NULL |
| NULL | NULL | 9 | 红发 |
+-------+---------+-------+---------+--+
#Left semi join (returns only the left table's columns for matching rows; see the note after this block)
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 1 | 张三 |
| 3 | c |
| 4 | a |
+-------+---------+--+
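For comparison, the rows kept by a left semi join can also be selected with an IN-subquery filter (supported in Hive 0.13+), which behaves like the left semi join above; a small sketch using the same tables a and b:

-- keep the rows of a whose id appears in b
select a.id, a.name
from a
where a.id in (select id from b);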

SELECT query syntax

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_name [..join..on(a.id=b.id)]
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[CLUSTER BY col_list
  | [DISTRIBUTE BY col_list] [SORT BY | ORDER BY col_list]
]
[LIMIT number]
#cluster by(id) partitions and sorts the map output by id (cluster by is equivalent to distribute by + sort by)
select id, name from my_tb cluster by(id);

User-defined functions (UDFs)

Hive built-in functions

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
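The full list is in the manual linked above; built-ins can also be inspected and tried directly from the CLI, for example:

show functions;
describe function extended upper;
-- a few common built-ins applied to the my_tb table used earlier
select upper(name), length(name), concat(name, '_x') from my_tb limit 3;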
    
    

POM file


    
    
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.xiaojie.mm</groupId>
  <artifactId>my_hive</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <hadoop.version>2.6.5</hadoop.version>
    <hive.version>1.2.1</hive.version>
  </properties>

  <dependencies>
    <!-- Hadoop -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.5</version>
    </dependency>
    <!-- Hive -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>1.2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-metastore</artifactId>
      <version>1.2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-pdk</artifactId>
      <version>0.10.0</version>
    </dependency>
    <dependency>
      <groupId>javax.jdo</groupId>
      <artifactId>jdo2-api</artifactId>
      <version>2.3-eb</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.1.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.7</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.7</version>
      <scope>system</scope>
      <systemPath>/home/miao/apps/install/jdk1.7.0_45/lib/tools.jar</systemPath>
    </dependency>
  </dependencies>
</project>

A custom function that converts uppercase to lowercase

package com.xiaojie.mm;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLower extends UDF {
    // Implement evaluate (the method can be overloaded for other argument types)
    public String evaluate(String field) {
        return field.toLowerCase();
    }
}

Export the jar and copy it to the machine where Hive runs

scp tolower.jar mini1:/root/apps/

Registering the custom function in the Hive client

#Step 1
add JAR /root/apps/tolower.jar;
#Step 2: the string in quotes is the fully qualified class name (a temporary function is valid only in the current session)
create temporary function tolower as 'com.xiaojie.mm.ToLower';
#Step 3: use it
select * from a;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 7 | AAAAA |
| 1 | 张三 |
| 2 | 李四 |
| 3 | c |
| 4 | a |
| 5 | e |
| 6 | r |
+-------+---------+--+
select id,tolower(name) from a;
+-----+--------+--+
| id | _c1 |
+-----+--------+--+
| 7 | aaaaa |
| 1 | 张三 |
| 2 | 李四 |
| 3 | c |
| 4 | a |
| 5 | e |
| 6 | r |
+-----+--------+--+

A custom function that looks up the region a phone number belongs to

package com.xiaojie.mm;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

public class GetProvince extends UDF {
    public static HashMap<String, String> provinceMap = new HashMap<String, String>();

    static {
        provinceMap.put("183", "hangzhou");
        provinceMap.put("186", "nanjing");
        provinceMap.put("187", "suzhou");
        provinceMap.put("188", "ningbo");
    }

    public String evaluate(int phonenumber) {
        String phone_num = String.valueOf(phonenumber);
        // Take the first three digits of the phone number
        String phone = phone_num.substring(0, 3);
        return provinceMap.get(phone) == null ? "未知" : provinceMap.get(phone);
    }
}

Original data:
+----------------------+---------------------+--+
| flow_province.phone | flow_province.flow |
+----------------------+---------------------+--+
| 1837878 | 12m |
| 1868989 | 13m |
| 1878989 | 14m |
| 1889898 | 15m |
| 1897867 | 16m |
| 1832323 | 78m |
| 1858767 | 88m |
| 1862343 | 99m |
| 1893454 | 77m |
+----------------------+---------------------+--+
After calling the custom function (registered here as getpro):
select phone,getpro(phone),flow from flow_province;
+----------+-----------+-------+--+
| phone | _c1 | flow |
+----------+-----------+-------+--+
| 1837878 | hangzhou | 12m |
| 1868989 | nanjing | 13m |
| 1878989 | suzhou | 14m |
| 1889898 | ningbo | 15m |
| 1897867 | 未知 | 16m |
| 1832323 | hangzhou | 78m |
| 1858767 | 未知 | 88m |
| 1862343 | nanjing | 99m |
| 1893454 | 未知 | 77m |
+----------+-----------+-------+--+

A custom function for parsing JSON data

#Create the table
create table json_tb(line string);
#Load the data
load data local inpath '/root/test_data/a.json' into table json_tb;
#Show the raw data
select line from json_tb limit 10;
+----------------------------------------------------------------+--+
| json_tb.line |
+----------------------------------------------------------------+--+
| { "movie": "1193", "rate": "5", "timeStamp": "978300760", "uid": "1"} |
| { "movie": "661", "rate": "3", "timeStamp": "978302109", "uid": "1"} |
| { "movie": "914", "rate": "3", "timeStamp": "978301968", "uid": "1"} |
| { "movie": "3408", "rate": "4", "timeStamp": "978300275", "uid": "1"} |
| { "movie": "2355", "rate": "5", "timeStamp": "978824291", "uid": "1"} |
| { "movie": "1197", "rate": "3", "timeStamp": "978302268", "uid": "1"} |
| { "movie": "1287", "rate": "5", "timeStamp": "978302039", "uid": "1"} |
| { "movie": "2804", "rate": "5", "timeStamp": "978300719", "uid": "1"} |
| { "movie": "594", "rate": "4", "timeStamp": "978302268", "uid": "1"} |
| { "movie": "919", "rate": "4", "timeStamp": "978301368", "uid": "1"} |
+----------------------------------------------------------------+--+

#The custom function
package com.xiaojie.mm;

import org.apache.hadoop.hive.ql.exec.UDF;

import parquet.org.codehaus.jackson.map.ObjectMapper;

public class JsonParse extends UDF {
    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieBean bean = objectMapper.readValue(jsonLine, MovieBean.class);
            return bean.toString();
        } catch (Exception e) {
            // fall through and return an empty string for lines that fail to parse
        }
        return "";
    }
}

package com.xiaojie.mm;

public class MovieBean {
    // movie id
    private String movie;
    // movie rating
    private String rate;
    // rating timestamp
    private String timeStamp;
    // user id
    private String uid;

    public String getMovie() {
        return movie;
    }
    public void setMovie(String movie) {
        this.movie = movie;
    }
    public String getRate() {
        return rate;
    }
    public void setRate(String rate) {
        this.rate = rate;
    }
    public String getTimeStamp() {
        return timeStamp;
    }
    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }
    public String getUid() {
        return uid;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return this.movie + "\t" + this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }
}

#Build the jar, upload it to the machine where Hive runs, and create the function
add JAR /root/test_data/json_parse.jar;
create temporary function json_parse as 'com.xiaojie.mm.JsonParse';
#Use the custom JSON parsing function
select json_parse(line) from json_tb limit 10;
+---------------------+--+
| _c0 |
+---------------------+--+
| 1193 5 978300760 1 |
| 661 3 978302109 1 |
| 914 3 978301968 1 |
| 3408 4 978300275 1 |
| 2355 5 978824291 1 |
| 1197 3 978302268 1 |
| 1287 5 978302039 1 |
| 2804 5 978300719 1 |
| 594 4 978302268 1 |
| 919 4 978301368 1 |
+---------------------+--+
#Save the parsed fields into a newly created table
create table json_parse_tb as
select split(json_parse(line),'\t')[0] as movieid,
       split(json_parse(line),'\t')[1] as rate,
       split(json_parse(line),'\t')[2] as time,
       split(json_parse(line),'\t')[3] as userid
from json_tb limit 100;
#The built-in JSON function
select get_json_object(line,'$.movie') as movieid,
       get_json_object(line,'$.rate') as rate,
       get_json_object(line,'$.timeStamp') as time,
       get_json_object(line,'$.uid') as userid
from json_tb limit 10;
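Besides get_json_object, the built-in json_tuple UDTF extracts several keys in a single pass; a small sketch against the same json_tb table:

select t.movieid, t.rate, t.time, t.userid
from json_tb
lateral view json_tuple(line, 'movie', 'rate', 'timeStamp', 'uid') t as movieid, rate, time, userid
limit 10;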

TRANSFORM (calling custom scripts)

Hive's TRANSFORM keyword lets you call your own script from SQL. It is a good fit when you need functionality Hive does not provide and you do not want to write a UDF.

A custom Python script (vim time_parse.py)

#!/bin/python
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rate, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rate, str(weekday), userid])

Add the Python file to Hive's working directory

add file time_parse.py;

Calling the custom Python code with TRANSFORM

#The columns listed inside TRANSFORM(...) are read from json_parse_tb and piped to the script
create TABLE json_parse_time_tb as
SELECT
  TRANSFORM (movieid, rate, time, userid)
  USING 'python time_parse.py'
  AS (movieid, rate, weekday, userid)
FROM json_parse_tb;

Inspecting the new table

select * from json_parse_time_tb;
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| json_parse_time_tb.movieid | json_parse_time_tb.rate | json_parse_time_tb.weekday | json_parse_time_tb.userid |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| 1690 | 3 | 1 | 2 |
| 589 | 4 | 1 | 2 |
| 3471 | 5 | 1 | 2 |
| 1834 | 4 | 1 | 2 |
| 2490 | 3 | 1 | 2 |
| 2278 | 3 | 1 | 2 |
| 110 | 5 | 1 | 2 |
| 3257 | 3 | 1 | 2 |
| 3256 | 2 | 1 | 2 |
| 3255 | 4 | 1 | 2 |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+

Case study

Original data (username, month, clicks)

A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5

Goal: compute each user's clicks per month, together with the cumulative click total.

Step 1: create the table and load the data

#Create the table
create table click_tb(username string,month string,click int)
row format delimited fields terminated by ',';
#Load the data
load data local inpath '/root/test_data/click.txt' into table click_tb;

Step 2: compute each user's clicks per month

select username, month, sum(click) as click_count from click_tb group by username, month;
+-----------+----------+--------------+--+
| username | month | click_count |
+-----------+----------+--------------+--+
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| B | 2015-01 | 30 |
| B | 2015-02 | 15 |
+-----------+----------+--------------+--+

Step 3: inner join the aggregated result with itself, pairing every month of a user with every other month of the same user

select * from
(select username,month,sum(click) as click_count from click_tb group by username,month) A
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) B
on
A.username=B.username;
+-------------+----------+----------------+-------------+----------+----------------+--+
| a.username | a.month | a.click_count | b.username | b.month | b.click_count |
+-------------+----------+----------------+-------------+----------+----------------+--+
| A | 2015-01 | 33 | A | 2015-01 | 33 |
| A | 2015-01 | 33 | A | 2015-02 | 10 |
| A | 2015-02 | 10 | A | 2015-01 | 33 |
| A | 2015-02 | 10 | A | 2015-02 | 10 |
| B | 2015-01 | 30 | B | 2015-01 | 30 |
| B | 2015-01 | 30 | B | 2015-02 | 15 |
| B | 2015-02 | 15 | B | 2015-01 | 30 |
| B | 2015-02 | 15 | B | 2015-02 | 15 |
+-------------+----------+----------------+-------------+----------+----------------+--+

Step 4: produce the final result

For each (a.username, a.month) group, min(a.click_count) is simply that month's own click count, while sum(b.click_count) over the joined rows with b.month <= a.month gives the running total (for A in 2015-02, for example: 33 + 10 = 43).

select a.username,a.month, min(a.click_count) as click_count, sum(b.click_count) as sum_count from
(select username, month, sum(click) as click_count from click_tb group by username, month) a
inner join
(select username, month, sum(click) as click_count from click_tb group by username, month) b
on
a.username=b.username
where b.month<=a.month
group by a.username,a.month
order by a.username,a.month;
+-------------+----------+--------------+------------+--+
| a.username | a.month | click_count | sum_count |
+-------------+----------+--------------+------------+--+
| A | 2015-01 | 33 | 33 |
| A | 2015-02 | 10 | 43 |
| B | 2015-01 | 30 | 30 |
| B | 2015-02 | 15 | 45 |
+-------------+----------+--------------+------------+--+
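As an aside, and not part of the original walkthrough: on Hive 0.11+ the same cumulative total can also be computed with a window function instead of the self-join; a sketch against the same click_tb table:

select username, month, click_count,
       sum(click_count) over (partition by username order by month) as sum_count
from (select username, month, sum(click) as click_count
      from click_tb
      group by username, month) t;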

Reposted from: https://www.cnblogs.com/52mm/p/p17.html
