Hive的with as 使用

该脚本使用 with as 定义名为 tmp 的公用表表达式(CTE,并非物理临时表),从多个源表中提取数据,经过数据清洗和聚合后,将结果插入目标表`sink_batch_feature_result`的年分区中。过程涉及时间窗口过滤、row_number 排名取前十、空值过滤等步骤,主要用于分析用户的"职位不合适"负反馈行为。
摘要由CSDN通过智能技术生成

创建tmp表

#!/bin/bash
# Batch job: compute the 30-day negative-feedback (jobtype-mismatch) user
# feature and load it into sink_batch_feature_result via hive -e.
#
# Usage: <script> <begin_date> <end_date>
# Dates are expected as yyyyMMdd strings -- TODO confirm with the scheduler.

# Fail fast when the two required date arguments are missing; without this
# check the script would build and submit SQL with empty date predicates.
if [ $# -lt 2 ]; then
    echo "Usage: $0 <begin_date:yyyyMMdd> <end_date:yyyyMMdd>" >&2
    exit 1
fi

begin_date=$1
end_date=$2
target_table=default_catalog.default_database.sink_batch_feature_result

# Partition year: first 4 characters of begin_date (assumes yyyyMMdd input).
v_dt_year=${begin_date:0:4}

# Quoted so the log line is emitted verbatim (no word splitting / globbing).
echo "$v_dt_year,$begin_date,$end_date"


query_sql="
-- Stage 1 (CTE tmp): one row per user carrying a JSON-like feature string.
-- NOTE(review): the statement mixes Hive-style and Flink-style syntax
-- (LISTAGG, default_catalog sink table) -- confirm which engine runs it.
with tmp as
 (  select
    topic ,
    key ,
    send_time ,
    _key_suffix ,
    _expire ,
    _ignore_value ,
    feature ,
    _feature_suffix ,
    val
from(
    select
        'user' as topic,
        key,
        -- now_timestemp: presumably a registered UDF producing the send
        -- timestamp for this key (note the spelling) -- TODO confirm.
        now_timestemp(key) as send_time,
        -- Day-of-month of end_date (chars 7-8 of yyyyMMdd) as key suffix.
        substr('${end_date}',7,2) as _key_suffix,
        -- Feature TTL: 2 days expressed in seconds.
        cast(3600*24*2 as bigint) as _expire,
        '' as _ignore_value,
        'user_c_neg_feedback_jd_mismatch_jobtype3_30d' as feature,
        '' as _feature_suffix,
        val
    from(
        select
            user_id as key,
            -- Fold the per-jobtype entries into one braced string per user.
            -- NOTE(review): the quote placement does not produce valid JSON
            -- (values are never closed; later keys embed stray quotes) --
            -- verify downstream consumers parse this as intended.
            concat('{', concat_ws(',', LISTAGG(concat('\"jtype3\":\"',cast(jd_subtype_id as char), ',\"fbcnt:\"',cast(cnt as char), ',\"fbdate:\"',cast(intervaldate as char)))), '}') as val
        from(
           select
                bb.jd_no,
                bb.user_id,
                bb.jd_subtype_id,
                -- Feedback count per (jd, user, jobtype, interval) group.
                count(bb.jd_no) as cnt,
                -- NOTE(review): datediff compares the 2-char day-of-month of
                -- end_date with the first 8 chars of c_inappropriate_time;
                -- the two operands are not the same date format -- confirm.
                datediff(substr('${end_date}',7,2),substring (bb.c_inappropriate_time,1,8)) as intervaldate,
                -- Rank each user's groups by feedback volume, highest first.
                row_number() over(partition by user_id order by count(jd_no) desc) as rank_no
            from
                -- Attach the jobtype dimension to the raw feedback events.
                (select
                    a.user_id,
                    a.jd_no,
                    a.c_inappropriate_time,
                    b.jd_subtype_id
                    -- Deduplicated feedback events (refer_type 9, reason 1)
                    -- inside the closed window [begin_date, end_date].
                    from(select
                            distinct
                            user_id,
                            jd_no,
                            c_inappropriate_time
                        from
                            myhive.dwb.dwb_pulsar_c_inappropriate_hour
                        where
                            dt >= ${begin_date}
                            and dt <= ${end_date}
                            and refer_type ='9'
                            and c_inappropriate_reason = '1'
                    )a
                left join
                -- JD dimension: distinct jobtype per jd_no for the year.
                (select
                    distinct jd_no,
                    jd_subtype_id
                from
                    myhive.dwd.dwd_cnt_jd_all
                where
                    dt = ${v_dt_year}
                )b
                on a.jd_no = b.jd_no)bb
            group by bb.jd_no,bb.user_id,bb.jd_subtype_id,datediff(substr('${end_date}',7,2),substring (bb.c_inappropriate_time,1,8))
        -- Keep only each user's top-10 groups before folding per user.
        )c where rank_no <= 10 group by user_id
    )d
-- Drop users whose folded value is empty.
)e where val <>'{}' and val is not null and val <>'')

-- Stage 2: enrich the per-user rows and load the year partition.
insert overwrite table  ${target_table} partition(dt_year='${v_dt_year}')

SELECT date_format(dwt.send_time, 'yyyyMMdd') AS send_time,
       df.provider_no,
       df.supplier_no,
       dwt.driver_no,
       df.driver_name,
       df.telephone,
       dwt.business_no,
       -- NOTE(review): alias vf is never joined in this statement; these
       -- two columns cannot resolve as written -- this query should fail.
       vf.vehicle_num,
       vf.vehicle_no,
       dwt.topic,
       dwt.work_end_time,
       dwt.key,
       dwt.feature,
       dwt.val,
       current_timestamp() as etl_date
  FROM tmp dwt
  -- NOTE(review): this compares the constant topic ('user') to user_id;
  -- dwt.key looks like the intended join column -- confirm.
  LEFT JOIN dwd.dwd_campus_service_info df
    ON dwt.topic= df.user_id
"

echo 'start execute hive sql !'

# Session settings: run independent stages in parallel and merge small
# output files so the partition is not left fragmented.
hive_sql="
use dwd;
set hive.exec.parallel=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
set hive.merge.mapfiles = true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task = 256000000;
set hive.merge.smallfiles.avgsize=16000000;

-- Recreate the target year partition so reruns are idempotent; the
-- if exists / if not exists guards keep the very first run from failing.
alter table ${target_table} drop if exists partition(dt_year='${v_dt_year}');
alter table ${target_table} add if not exists partition(dt_year='${v_dt_year}');
${query_sql};
"

# Quote the expansion: an unquoted echo word-splits the SQL onto one line
# and glob-expands tokens such as '*', mangling the logged statement.
echo "$hive_sql"

# Propagate hive's exit status so the calling scheduler can detect failure
# instead of always seeing this script exit 0.
if ! hive -e "$hive_sql"; then
    echo 'hive sql execute FAILED !' >&2
    exit 1
fi

echo 'hive sql is execute OK !'

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值