【Hive】Common Commands Cheat Sheet

This post collects common data operations in Hive: creating tables, exporting data to the local filesystem, classifying and aggregating by field values, handling dates, and deduplicating by time. It also shares tuning parameters for speeding up Hive queries, plus a template for writing and invoking Python UDFs.
  • Create a table
use database_name;
drop table if exists new_table_name;
create table new_table_name as 
select * from origin_table_name;
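Note that plain CTAS like the above cannot create a partitioned (or external) table. A minimal two-step sketch for the partitioned case, with hypothetical column names:

create table new_table_name (id string, label string)
partitioned by (dt string);

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table new_table_name partition (dt)
select id, label, dt from origin_table_name;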
  • Export data from the Hive warehouse to a local file
hive -e "set hive.cli.print.header=true;select * from database_name.table_name;" > ./data.csv
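Note that the redirected output is tab-delimited despite the .csv extension. Alternatively, the export can be done from inside Hive with an explicit delimiter (the local path below is just an example):

insert overwrite local directory '/tmp/table_name_export'
row format delimited fields terminated by ','
select * from database_name.table_name;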
  • Classify and aggregate by field value
  case
    when label in ('G', '1', 1) then '1'
    when label in ('B', '0', 0) then '0'
    else NULL
  end as new_label,
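For context, a minimal sketch of this expression inside a complete query (table and column names are hypothetical):

select
    id,
    case
        when label in ('G', '1', 1) then '1'
        when label in ('B', '0', 0) then '0'
        else NULL
    end as new_label
from database_name.origin_table_name;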
  • Extract the month from a date string
substr(cast(dt as string), 1, 7),
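For example:

select substr(cast('2023-05-12' as string), 1, 7);  -- returns '2023-05'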
  • Split each of the last six months into 4 weeks
case
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 8
    then concat(substr(dt, 1, 7), '-', '01')
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 15
    then concat(substr(dt, 1, 7), '-', '02')
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 22
    then concat(substr(dt, 1, 7), '-', '03')
  when dt >= concat(substr(add_months('${target_dt}', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 32
    then concat(substr(dt, 1, 7), '-', '04')
end,
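For example, with target_dt = '2023-06-15', a row with dt = '2023-05-12' falls inside the six-month window, and its day of month (12) is below 15, so it is labeled '2023-05-02', i.e. the second week of May 2023.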
  • Fetch rows and deduplicate by time
select
    id,
    to_date(apply_time) as apply_time,
    apply_channel
from
    (select
        id,
        apply_time,
        apply_channel,
        row_number() over (distribute by id, apply_channel sort by apply_time desc) as rn
    from
        database_name.user_table_name
    ) as t0
where
    t0.rn = 1
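The distribute by / sort by form inside over() is Hive-specific; the same window can be written with the more common syntax:

row_number() over (partition by id, apply_channel order by apply_time desc) as rn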
  • Select everything except certain columns
select `(pin|dt)?+.+` from database_name.table_name;
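On Hive 0.13 and later, this regex-style column selection only works when quoted identifiers are disabled:

set hive.support.quoted.identifiers=none;
select `(pin|dt)?+.+` from database_name.table_name;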

Tuning parameters

-- Run independent query stages in parallel
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;

-- Fraction of memory available to map-side aggregation hash tables
set hive.map.aggr.hash.percentmemory=0.1;

-- Container memory and JVM heap: moderate profile
set mapreduce.map.memory.mb=5200;
set mapreduce.map.java.opts=-Xmx4500M;
set mapreduce.reduce.memory.mb=5200;
set mapreduce.reduce.java.opts=-Xmx4500M;
-- Larger profile for heavy jobs (these override the values above if both are run)
set mapreduce.map.memory.mb=10240;
set mapreduce.map.java.opts=-Xmx10240m;
set mapreduce.reduce.memory.mb=20480;
set mapreduce.reduce.java.opts=-Xmx20480m;

-- Lift the limit on split metadata size
set mapreduce.jobtracker.split.metainfo.maxsize=-1;

-- Small split sizes to spawn more mappers
set mapred.max.split.size=1000000;
set mapred.min.split.size.per.node=1000000;
set mapred.min.split.size.per.rack=1000000;
set mapreduce.input.fileinputformat.split.maxsize=1000000;

-- Compress intermediate map output and final output
set mapred.compress.map.output=true;
set mapred.output.compress=true;
set hive.exec.compress.output=true;

-- Answer metadata-only queries directly from table statistics
set hive.compute.query.using.stats=true;

-- Skew-join handling
set hive.optimize.skewjoin=true;
set hive.skewjoin.key=500000;
set hive.skewjoin.mapjoin.map.tasks=10000;
set hive.skewjoin.mapjoin.min.split=33554432;
set hive.auto.convert.join=false;

-- Sort rows by dynamic partition column before writing
set hive.optimize.sort.dynamic.partition=true;

UDF

Python UDF template

Warning: the UDF script must not contain Chinese (non-ASCII) comments.

# -*- coding:utf-8 -*-
import sys

# First error message encountered; surfaced in the output rather than swallowed
error_msg = None

try:
    import numpy as np
except Exception as e:
    error_msg = "import error: " + str(e)


try:
    # define helper functions and globals here
    pass
except Exception as e:
    error_msg = "define error: " + str(e) if error_msg is None else error_msg

for line in sys.stdin:
    try:
        # strip only the trailing newline so empty leading/trailing fields survive
        items = line.rstrip('\n').split('\t')
        print('\t'.join(items))
    except Exception as e:
        error_msg = "process error: " + str(e) if error_msg is None else error_msg

    # surface the first recorded error in the query output for debugging
    if error_msg:
        print(error_msg)
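Before registering the script in Hive, it can be smoke-tested locally by piping a few tab-separated lines into it, e.g. cat sample.tsv | python udf.py.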

Python UDF invocation template

use database_name;
-- ship the UDF script to the distributed cache first
add file udf.py;
drop table if exists table_name;
create table table_name as
select
    transform({select_cols})
    using 'python udf.py'
    as ({new_cols})
from origin_table_name;
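Note that transform passes the selected columns to the script as tab-separated lines (NULL is serialized as \N), and output columns come back as strings unless explicit types are given in the as (...) clause.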