Hive使用语句

一、HIVE基础

1.1 建表与插入数据

建表

-- 新建Hive表: --
drop table if exists dev.table;
create table if not exists dev.dtable(
 datag_time string comment '字段名称',
 group_type  char(10) comment '字段名称',
 rationality_index  double comment '字段名称',
 deep_value_index double comment '字段名称'
)
comment '表名称注释'
 partitioned by (
  dt string comment 'partition : date'
 )
 row format delimited
   fields terminated by '\001'
 tblproperties (
  'author'='tian'
 );

插入数据

--插入数据
set hive.exec.dynamic.partition=true;  
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE app.* PARTITION (dt)
select 
distinct *, *,*,*,*,dt
from *.*
where  name is not NULL AND name like '%桌%' AND dt='2023-02-25'
LIMIT 2000000;

本地数据插入HIVE表:

-- 新建Hive表:留资并下单客户 --
drop table if exists app.*;
create table if not exists app.*(
 touch_cus_time string comment '客户触达时间',
 customer_no  string comment '客户C码',
)
comment '新建Hive表:留资并下单客户'
 partitioned by (
  dt string comment 'partition : date'
 )
 row format delimited
   fields terminated by ','
 tblproperties (
  'author'=''
 );
 LOAD DATA LOCAL INPATH '本地路径.csv' OVERWRITE INTO TABLE app.app_xxxx PARTITION(dt='2023-05-23');
 #建立表时,去掉EXCEL表头,然后用\t来进行分隔
1.2查询

统计表的行数:

select count(*) from 表名 where dt=''

查询系统日期:

SELECT sysdate(-2); --(-2)表示系统的前一天
1.3 做差集

1.4连接语句

两个表做JOIN

SELECT
*
FROM
表名 as ta
INNER JOIN 表名 as tb
ON ta.dt = tb.dt and ta.id = tb.id
WHERE ta.dt>='2022-01-01'
1.5 排序

分区排序,全局排序, MapReduce中的内部排序

https://blog.csdn.net/qq_43192537/article/details/102293995

按照某一字段排序:

SELECT date, product, revenue
FROM sales
ORDER BY revenue DESC;
1.6 修改表字段类型

https://blog.csdn.net/glittledream/article/details/84789571

Alter table 表名 change column 原字段名称 现字段名称 数据类型

1.7 update

https://my.oschina.net/u/2380815/blog/4453765

Hive0.14版本之前是不支持update和delete操作的,之后的Hive数据表必须要满足一定的条件,比如ORC存储、ACID支持等,才可以进行update和delete操作,本篇文章讲一下传统的hive数据表如果通过写SQL的方式实现数据的更新。

1.8 处理空值

https://blog.csdn.net/weixin_30416497/article/details/97950750

WHERE 字段名 IS NULL
1.9 where
不等值的判断
SELECT item_name,brand_name_full
FROM 表名
WHERE dt>="2021-05-25" and brand_name_full<>"NO BRAND"
LIMIT 3000
1.10 去重/模糊查询

(1) 去除重复项,模糊查询

select 
distinct sku_name
from 表名
where  sku_name is not NULL AND sku_name like '%桌%' AND dt='2023-02-25';

(2) 去重后统计数量

select count(distinct AccountID) from CharacterLogin where day="27" and month="10";

(3) 对某一列去重
distinct 只能加到最前面,同时存在多个列时,不能对单列去重复,所以可以使用row_number()等,具体如下:
对sku_name去重复

SELECT dropre.sku_name, 
    dropre.m1,
    dropre.m2,
    dropre.m3,
    dropre.mm
FROM
    (SELECT aa.sku_name,aa.
        m1,
        aa.m2,
        aa.m3,aa.mm,row_number()  
    over (partition by aa.sku_name  order BY aa.mm) as rn 
    FROM app.app_*** AS aa ) 
    AS dropre
WHERE  dropre.rn =1

https://www.cnblogs.com/rrttp/p/9026359.html

1.11 保留第一个数字

1.11 两个表的联合查询

https://www.cnblogs.com/likai198981/archive/2013/03/29/2989740.html

1.12 统计某个字段出现的次数

https://blog.csdn.net/love_java_cc/article/details/52234889

1.13 执行sql脚本
hive -f t.sql 
1.14 数据导出为txt

bin/hive -f sql.q >> res.csv

1.15 拼接一个字符串等
CONCAT(dt,"-01")
concat(col1,col2)
#GROUP BY 之后通过拼接存储
concat_ws('_',collect_set(name))
1.16 加工表
#!/bin/bash

start_date="2023-01-01"
end_date="20230501"



# for time_date in '2023-05-01' '2023-04-01' '2023-03-01' '2023-02-01' '2023-01-01' '2022-12-01' '2022-11-01' '2022-10-01'
# '2022-09-01' '2022-08-01' '2022-07-01' '2022-06-01' '2022-05-01' '2022-04-01' '2022-03-01' '2022-02-01'  '2022-01-01'
# do
function run_write(){

  time_date=$1
  echo $time_date
  
  set_dt_d=$time_date
  b_threem=$(date -d "${time_date}-3months" +%Y-%m-01)
  b_onem=$(date -d "${time_date}-1months" +%Y-%m-01)
  thism=$(date -d "${time_date}+1months" +%Y-%m-01)
  yy_ms=$(date -d "${time_date}-12months" +%Y-%m-01)
  yy_me=$(date -d "${time_date}-11months" +%Y-%m-01)
  yy_nextms=$(date -d "${time_date}-11months" +%Y-%m-01)
  yy_nextme=$(date -d "${time_date}-9months" +%Y-%m-01)
  retain_s=$(date -d "${yy_ms}" +%Y-01-01)
  retain_e=$(date -d "${retain_s}+12months" +%Y-%m-01)
  ty_his=$(date -d "${time_date}" +%Y-01-01)
  
  hive -e "
        set hive.exec.dynamic.partition=true; \
        set hive.exec.dynamic.partition.mode=nonstrict;  \

        INSERT INTO TABLE app.写入表 PARTITION (dt)  \
        SELECT  \
           customer_no,  \
           SUM(IF(dt>='$b_threem' AND dt<'$set_dt_d',total_amount,0)) b_threem_amount,  \
           SUM(IF(dt>='$b_onem' AND dt<'$set_dt_d',total_amount,0)) b_onem_amount,  \
           SUM(IF(dt>='$set_dt_d' AND dt<'$thism',total_amount,0 )) thism_amount,  \
           SUM(IF(dt>='$yy_ms' AND dt<'$yy_me',total_amount,0)) yy_m_amount, \
           SUM(IF(dt>='$yy_nextms' AND dt<'$yy_nextme',total_amount, 0)) yy_nextm_amount, \
           SUM(IF(dt>='$retain_s' AND dt<'$retain_e',total_amount, 0)) retain_amount,  \
           SUM(IF(dt>='$ty_his' AND dt<'$set_dt_d',total_amount ,0 )) ty_his_amount, \
           '$set_dt_d' AS dt \
        FROM \
            ( \
              SELECT \
                    id AS cust_no, \
                    SUM(total_amount) AS amount,  \
                    CONCAT(dt,'-01') AS dt \
                FROM  \
                    app.读表数据  \
                GROUP BY id,dt   \
            ) amount_data  \
        GROUP BY customer_no   
  "
}
# done

while [[ "$start_date" -le "$end_date" ]] ;
do
  stat_date_month=`date -d "${start_date}" +%Y-%m-%d`
  echo $stat_date_month
  run_write $stat_date_month
  start_date=$(date -d "${start_date}+1months" +%Y%m%d)
done
1.17 RANK
rank() over(partition BY x order by x, x DESC) AS priority

二、HIVE和ES数据相互导入

HIVE表插入数据

set hive.exec.dynamic.partition=true;  
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE app.app_*_* PARTITION (dt)
select 
distinct sku_name, item_first*,item_second_*,item_third_*,jd_*,dt
from xxx.xxx_*
where  sku_name is not NULL AND dt='2023-02-25';

添加jar文件

--添加jar文件
add jar hdfs://xxxx/xxx/xx/elasticsearch-hadoop-hive-8.6.2.jar;

基于HIVE建立ES链接

--基于HIVE,建立ES连接
DROP TABLE IF EXISTS xxx.xxx_hive_es;
CREATE EXTERNAL TABLE xxx.xxx_hive_es(
sku_name string,
xx string,
xx string,
xx string,
prc float
)STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES('es.resource'='es名称',
'es.nodes'='esxxx.xxx-xxx-hb.xxx.com',
'es.port'='xxx',
'es.mapping.name' = 'sku_xxx:sku_xxx, item_first_cate_xxx:item_first_cate_xxx',
'es.mapping.id' = 'sku_xxx',
'es.write.operation'='upsert',
'es.index.auto.create'='TRUE',
'es.field.read.empty.as.null' ='TRUE',
'es.net.http.auth.user'='xxx',
'es.net.http.auth.pass'='=xxxx+1'
);

在ES中插入数据

INSERT OVERWRITE TABLE xxx.xxx_hive_es
SELECT 
    dropre.sku_xxx,
    dropre.item_xxx,
    dropre.item_xxx,
    dropre.item_xxx,
    dropre.xxprc
FROM
    (SELECT aa.sku_xxx,aa.
        item_xxx,
        aa.item_xxx,
        aa.item_xxx,
        aa.xxprc,
        aa.dt,
        row_number()
    over (partition by aa.sku_xxx order BY aa.xxprc) as rn 
    FROM app.app_*_* AS aa  WHERE aa.dt='2023-02-25') 
    AS dropre
WHERE  dropre.rn =1 AND dropre.sku_name<>'';

二、HIVE时间日期处理

对年取整数归0

SELECT trunc('2021-12-17','YYYY');
得到2021-01-01

对月取整数归0

SELECT trunc(current_date(),'MM');
得到2021-01-01

上月1号

 SELECT trunc(add_months(CURRENT_TIMESTAMP,-1),'MM');

日期减一个月

SELECT add_months(CURRENT_DATE,-1)

当前日期

SELECT CURRENT_DATE

去年的一月

SELECT
  date_add(concat(year(current_date) - 1, '-01-01'), 0) AS january,
  date_add(concat(year(current_date) - 1, '-02-01'), 0) AS february,
  date_add(concat(year(current_date) - 1, '-03-01'), 0) AS march
FROM
  your_table

参考资料

https://stackoverflow.com/questions/28674753/hive-runtime-error-while-processing-row-in-hive(向量化参数调优)
https://codeantenna.com/a/BD4HSLCWqZ (对某一列去重)
https://blog.csdn.net/weixin_42303955/article/details/113076713 (sh脚本编写HIVE)
https://blog.csdn.net/jiegedalao/article/details/110855846(for循环参考链接)

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值