【Hive】Common Commands Cheat Sheet

This post collects common data operations in Hive: creating tables, exporting data to the local filesystem, classifying and aggregating by field values, handling dates, and deduplicating by time. It also shares tuning parameters for speeding up Hive queries, plus a template for writing and invoking Python UDFs.
  • Create a table
use database_name;
drop table if exists new_table_name;
create table new_table_name as 
select * from origin_table_name;
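Note that plain CTAS like the above cannot create a partitioned (or external) table. A minimal two-step sketch for the partitioned case, with hypothetical column names:

create table new_table_name (id string, label string)
partitioned by (dt string);

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table new_table_name partition (dt)
select id, label, dt from origin_table_name;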
  • Export data from the Hive warehouse to a local file
hive -e "set hive.cli.print.header=true;select * from database_name.table_name;" > ./data.csv
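Note that the redirected output is tab-delimited despite the .csv extension. Alternatively, the export can be done from inside Hive with an explicit delimiter (the local path below is just an example):

insert overwrite local directory '/tmp/table_name_export'
row format delimited fields terminated by ','
select * from database_name.table_name;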
  • Classify and aggregate by field value
  case
    when label in ('G', '1', 1) then '1'
    when label in ('B', '0', 0) then '0'
    else NULL
  end as new_label,
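For context, a minimal sketch of this expression inside a complete query (table and column names are hypothetical):

select
    id,
    case
        when label in ('G', '1', 1) then '1'
        when label in ('B', '0', 0) then '0'
        else NULL
    end as new_label
from database_name.origin_table_name;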
  • Extract the month from a date string
substr(cast(dt as string), 1, 7),
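For example:

select substr(cast('2023-05-12' as string), 1, 7);  -- returns '2023-05'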
  • Split each of the last six months into 4 weeks
case
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 8
    then concat(substr(dt, 1, 7), '-', '01')
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 15
    then concat(substr(dt, 1, 7), '-', '02')
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 22
    then concat(substr(dt, 1, 7), '-', '03')
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 32
    then concat(substr(dt, 1, 7), '-', '04')
end,
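For example, with target_dt = '2023-06-15', a row with dt = '2023-05-12' falls inside the six-month window, and its day of month (12) is below 15, so it is labeled '2023-05-02', i.e. the second week of May 2023.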
  • Fetch rows and deduplicate by time
select
    id,
    to_date(apply_time) as apply_time,
    apply_channel
from
    (select
        id,
        apply_time,
        apply_channel,
        row_number() over (distribute by id, apply_channel sort by apply_time desc) as rn
    from
        database_name.user_table_name
    ) as t0
where
    t0.rn = 1
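The distribute by / sort by form inside over() is Hive-specific; the same window can be written with the more common syntax:

row_number() over (partition by id, apply_channel order by apply_time desc) as rn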
  • Select everything except certain columns
select `(pin|dt)?+.+` from database_name.table_name;
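On Hive 0.13 and later, this regex-style column selection only works when quoted identifiers are disabled:

set hive.support.quoted.identifiers=none;
select `(pin|dt)?+.+` from database_name.table_name;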

Tuning parameters

-- Run independent query stages in parallel
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;

-- Fraction of memory available to map-side aggregation hash tables
set hive.map.aggr.hash.percentmemory=0.1;

-- Container memory and JVM heap: moderate profile
set mapreduce.map.memory.mb=5200;
set mapreduce.map.java.opts=-Xmx4500M;
set mapreduce.reduce.memory.mb=5200;
set mapreduce.reduce.java.opts=-Xmx4500M;
-- Larger profile for heavy jobs (these override the values above if both are run)
set mapreduce.map.memory.mb=10240;
set mapreduce.map.java.opts=-Xmx10240m;
set mapreduce.reduce.memory.mb=20480;
set mapreduce.reduce.java.opts=-Xmx20480m;

-- Lift the limit on split metadata size
set mapreduce.jobtracker.split.metainfo.maxsize=-1;

-- Small split sizes to spawn more mappers
set mapred.max.split.size=1000000;
set mapred.min.split.size.per.node=1000000;
set mapred.min.split.size.per.rack=1000000;
set mapreduce.input.fileinputformat.split.maxsize=1000000;

-- Compress intermediate map output and final output
set mapred.compress.map.output=true;
set mapred.output.compress=true;
set hive.exec.compress.output=true;

-- Answer metadata-only queries directly from table statistics
set hive.compute.query.using.stats=true;

-- Skew-join handling
set hive.optimize.skewjoin=true;
set hive.skewjoin.key=500000;
set hive.skewjoin.mapjoin.map.tasks=10000;
set hive.skewjoin.mapjoin.min.split=33554432;
set hive.auto.convert.join=false;

-- Sort rows by dynamic partition column before writing
set hive.optimize.sort.dynamic.partition=true;

UDF

Python UDF template

Warning: the UDF script must not contain Chinese (non-ASCII) comments.

# -*- coding:utf-8 -*-
import sys

# First error message encountered; surfaced in the output rather than swallowed
error_msg = None

try:
    import numpy as np
except Exception as e:
    error_msg = "import error: " + str(e)


try:
    # define helper functions and globals here
    pass
except Exception as e:
    error_msg = "define error: " + str(e) if error_msg is None else error_msg

for line in sys.stdin:
    try:
        # strip only the trailing newline so empty leading/trailing fields survive
        items = line.rstrip('\n').split('\t')
        print('\t'.join(items))
    except Exception as e:
        error_msg = "process error: " + str(e) if error_msg is None else error_msg

    # surface the first recorded error in the query output for debugging
    if error_msg:
        print(error_msg)
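Before registering the script in Hive, it can be smoke-tested locally by piping a few tab-separated lines into it, e.g. cat sample.tsv | python udf.py.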

Python UDF invocation template

use database_name;
-- ship the UDF script to the distributed cache first
add file udf.py;
drop table if exists table_name;
create table table_name as
select
    transform({select_cols})
    using 'python udf.py'
    as ({new_cols})
from origin_table_name;
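Note that transform passes the selected columns to the script as tab-separated lines (NULL is serialized as \N), and output columns come back as strings unless explicit types are given in the as (...) clause.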