一、HIVE基础
1.1 建表与插入数据
建表
-- 新建Hive表: --
drop table if exists dev.table;
create table if not exists dev.dtable(
datag_time string comment '字段名称',
group_type char(10) comment '字段名称',
rationality_index double comment '字段名称',
deep_value_index double comment '字段名称'
)
comment '表名称注释'
partitioned by (
dt string comment 'partition : date'
)
row format delimited
fields terminated by '\001'
tblproperties (
'author'='tian'
);
插入数据
--插入数据
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE app.* PARTITION (dt)
select
distinct *, *,*,*,*,dt
from *.*
where name is not NULL AND name like '%桌%' AND dt='2023-02-25'
LIMIT 2000000;
本地数据插入HIVE表:
-- 新建Hive表:留资并下单客户 --
drop table if exists app.*;
create table if not exists app.*(
touch_cus_time string comment '客户触达时间',
customer_no string comment '客户C码',
)
comment '新建Hive表:留资并下单客户'
partitioned by (
dt string comment 'partition : date'
)
row format delimited
fields terminated by ','
tblproperties (
'author'=''
);
LOAD DATA LOCAL INPATH '本地路径.csv' OVERWRITE INTO TABLE app.app_xxxx PARTITION(dt='2023-05-23');
#建立表时,去掉EXCEL表头,然后用\t来进行分隔
1.2查询
统计表的行数:
select count(*) from 表名 where dt='';
查询系统日期:
SELECT sysdate(-2); --(-2)表示系统的前一天
1.3 做差集
1.4连接语句
两个表做JOIN
SELECT
*
FROM
表名 as ta
INNER JOIN 表名 as tb
ON ta.dt = tb.dt and ta.id = tb.id
WHERE ta.dt>='2022-01-01'
1.5 排序
分区排序,全局排序, MapReduce中的内部排序
https://blog.csdn.net/qq_43192537/article/details/102293995
按照某一字段排序:
SELECT date, product, revenue
FROM sales
ORDER BY revenue DESC;
1.6 修改表字段类型
https://blog.csdn.net/glittledream/article/details/84789571
Alter table 表名 change column 原字段名称 现字段名称 数据类型
1.7 update
https://my.oschina.net/u/2380815/blog/4453765
Hive0.14版本之前是不支持update和delete操作的,之后的Hive数据表必须要满足一定的条件,比如ORC存储、ACID支持等,才可以进行update和delete操作,本篇文章讲一下传统的hive数据表如果通过写SQL的方式实现数据的更新。
1.8 处理空值
https://blog.csdn.net/weixin_30416497/article/details/97950750
WHERE 字段名 IS NULL
1.9 where
不等值的判断
SELECT item_name,brand_name_full
FROM 表名
WHERE dt>="2021-05-25" and brand_name_full<>"NO BRAND"
LIMIT 3000
1.10 去重/模糊查询
(1) 去除重复项,模糊查询
select
distinct sku_name
from 表名
where sku_name is not NULL AND sku_name like '%桌%' AND dt='2023-02-25';
(2) 去重后统计数量
select count(distinct AccountID) from CharacterLogin where day="27" and month="10";
(3) 对某一列去重
distinct 只能加到最前面,同时存在多个列时,不能对单列去重复,所以可以使用row_number()等,具体如下:
对sku_name去重复
SELECT dropre.sku_name,
dropre.m1,
dropre.m2,
dropre.m3,
dropre.mm
FROM
(SELECT aa.sku_name,aa.
m1,
aa.m2,
aa.m3,aa.mm,row_number()
over (partition by aa.sku_name order BY aa.mm) as rn
FROM app.app_*** AS aa )
AS dropre
WHERE dropre.rn =1
https://www.cnblogs.com/rrttp/p/9026359.html
1.11 保留第一个数字
1.11 两个表的联合查询
https://www.cnblogs.com/likai198981/archive/2013/03/29/2989740.html
1.12 统计某个字段出现的次数
https://blog.csdn.net/love_java_cc/article/details/52234889
1.13 执行sql脚本
hive -f t.sql
1.14 数据导出为txt
bin/hive -f sql.q >> res.csv
1.15 拼接一个字符串等
CONCAT(dt,"-01")
concat(col1,col2)
#GROUP BY 之后通过拼接存储
concat_ws('_',collect_set(name))
1.16 加工表
#!/bin/bash
start_date="2023-01-01"
end_date="20230501"
# for time_date in '2023-05-01' '2023-04-01' '2023-03-01' '2023-02-01' '2023-01-01' '2022-12-01' '2022-11-01' '2022-10-01'
# '2022-09-01' '2022-08-01' '2022-07-01' '2022-06-01' '2022-05-01' '2022-04-01' '2022-03-01' '2022-02-01' '2022-01-01'
# do
function run_write(){
time_date=$1
echo $time_date
set_dt_d=$time_date
b_threem=$(date -d "${time_date}-3months" +%Y-%m-01)
b_onem=$(date -d "${time_date}-1months" +%Y-%m-01)
thism=$(date -d "${time_date}+1months" +%Y-%m-01)
yy_ms=$(date -d "${time_date}-12months" +%Y-%m-01)
yy_me=$(date -d "${time_date}-11months" +%Y-%m-01)
yy_nextms=$(date -d "${time_date}-11months" +%Y-%m-01)
yy_nextme=$(date -d "${time_date}-9months" +%Y-%m-01)
retain_s=$(date -d "${yy_ms}" +%Y-01-01)
retain_e=$(date -d "${retain_s}+12months" +%Y-%m-01)
ty_his=$(date -d "${time_date}" +%Y-01-01)
hive -e "
set hive.exec.dynamic.partition=true; \
set hive.exec.dynamic.partition.mode=nonstrict; \
INSERT INTO TABLE app.写入表 PARTITION (dt) \
SELECT \
customer_no, \
SUM(IF(dt>='$b_threem' AND dt<'$set_dt_d',total_amount,0)) b_threem_amount, \
SUM(IF(dt>='$b_onem' AND dt<'$set_dt_d',total_amount,0)) b_onem_amount, \
SUM(IF(dt>='$set_dt_d' AND dt<'$thism',total_amount,0 )) thism_amount, \
SUM(IF(dt>='$yy_ms' AND dt<'$yy_me',total_amount,0)) yy_m_amount, \
SUM(IF(dt>='$yy_nextms' AND dt<'$yy_nextme',total_amount, 0)) yy_nextm_amount, \
SUM(IF(dt>='$retain_s' AND dt<'$retain_e',total_amount, 0)) retain_amount, \
SUM(IF(dt>='$ty_his' AND dt<'$set_dt_d',total_amount ,0 )) ty_his_amount, \
'$set_dt_d' AS dt \
FROM \
( \
SELECT \
id AS cust_no, \
SUM(total_amount) AS amount, \
CONCAT(dt,'-01') AS dt \
FROM \
app.读表数据 \
GROUP BY id,dt \
) amount_data \
GROUP BY customer_no
"
}
# done
while [[ "$start_date" -le "$end_date" ]] ;
do
stat_date_month=`date -d "${start_date}" +%Y-%m-%d`
echo $stat_date_month
run_write $stat_date_month
start_date=$(date -d "${start_date}+1months" +%Y%m%d)
done
1.17 RANK
rank() over(partition BY x order by x, x DESC) AS priority
二、HIVE和ES数据相互导入
HIVE表插入数据
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE app.app_*_* PARTITION (dt)
select
distinct sku_name, item_first*,item_second_*,item_third_*,jd_*,dt
from xxx.xxx_*
where sku_name is not NULL AND dt='2023-02-25';
添加jar文件
--添加jar文件
add jar hdfs://xxxx/xxx/xx/elasticsearch-hadoop-hive-8.6.2.jar;
基于HIVE建立ES链接
--基于HIVE,建立ES连接
DROP TABLE IF EXISTS xxx.xxx_hive_es;
CREATE EXTERNAL TABLE xxx.xxx_hive_es(
sku_name string,
xx string,
xx string,
xx string,
prc float
)STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES('es.resource'='es名称',
'es.nodes'='esxxx.xxx-xxx-hb.xxx.com',
'es.port'='xxx',
'es.mapping.name' = 'sku_xxx:sku_xxx, item_first_cate_xxx:item_first_cate_xxx',
'es.mapping.id' = 'sku_xxx',
'es.write.operation'='upsert',
'es.index.auto.create'='TRUE',
'es.field.read.empty.as.null' ='TRUE',
'es.net.http.auth.user'='xxx',
'es.net.http.auth.pass'='=xxxx+1'
);
在ES中插入数据
INSERT OVERWRITE TABLE xxx.xxx_hive_es
SELECT
dropre.sku_xxx,
dropre.item_xxx,
dropre.item_xxx,
dropre.item_xxx,
dropre.xxprc
FROM
(SELECT aa.sku_xxx,aa.
item_xxx,
aa.item_xxx,
aa.item_xxx,
aa.xxprc,
aa.dt,
row_number()
over (partition by aa.sku_xxx order BY aa.xxprc) as rn
FROM app.app_*_* AS aa WHERE aa.dt='2023-02-25')
AS dropre
WHERE dropre.rn =1 AND dropre.sku_name<>'';
二、HIVE时间日期处理
对年取整数归0
SELECT trunc('2021-12-17','YYYY');
得到2021-01-01
对月取整数归0
SELECT trunc(current_date(),'MM');
得到2021-01-01
上月1号
SELECT trunc(add_months(CURRENT_TIMESTAMP,-1),'MM');
日期减一个月
SELECT add_months(CURRENT_DATE,-1)
当前日期
SELECT CURRENT_DATE
去年的一月
SELECT
date_add(concat(year(current_date) - 1, '-01-01'), 0) AS january,
date_add(concat(year(current_date) - 1, '-02-01'), 0) AS february,
date_add(concat(year(current_date) - 1, '-03-01'), 0) AS march
FROM
your_table
参考资料
https://stackoverflow.com/questions/28674753/hive-runtime-error-while-processing-row-in-hive(向量化参数调优)
https://codeantenna.com/a/BD4HSLCWqZ (对某一列去重)
https://blog.csdn.net/weixin_42303955/article/details/113076713 (sh脚本编写HIVE)
https://blog.csdn.net/jiegedalao/article/details/110855846(for循环参考链接)