Hive表迁移至压缩表:
1、 压缩方式选择(经测试选择snappy)
2、 创建Hive内部表(主要是后面Rename的时候可以将location改成对应的位置)
CREATE **TABLE** `ods_adv.ods_advt_user_operation_basic_2_tmp`(
`android_id` string COMMENT '',
`google_id` string COMMENT '',
`bat_id` string COMMENT '',
`log_time` string COMMENT '',
`auto_time` bigint COMMENT '')
PARTITIONED BY (
`dt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.RCFileInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.RCFileOutputFormat'
LOCATION
'hdfs://ns/apps/hive/warehouse/ods_adv.db/ods_advt_user_operation_basic_2_tmp'
TBLPROPERTIES (
**'orc.compress'='SNAPPY'**
)
3、 将外部表(原来ODS数据表)的数据insert overwrite 第二步的表
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;
SET hive.merge.size.per.task = 512000000;
SET hive.merge.smallfiles.avgsize = 256000000;
SET hive.exec.compress.output = true;
set hive.auto.convert.join=true;
set hive.map.aggr=true
set hive.groupby.skewindata=true
SET parquet.compression = snappy;
SET hive.exec.dynamic.partition.mode = nonstrict;
SET hive.exec.dynamic.partition = true;
msck repair table ods_adv.ods_online_h5_basic_1;
insert overwrite table `ods_adv.ods_online_h5_basic_1_tmp` partition (dt)
select * from ods_adv.ods_online_h5_basic_1
WHERE dt between '2019-07-01' and '2019-07-31';
## where dt=to_date('${hivevar:data_date}')
4、 Rename外部表(原ODS表) (location的不会变得)
alter table ods_adv.ods_advt_natural_flow_basic_1_tmp2 rename to ods_adv.ods_advt_natural_flow_basic_1_tmp ;
5、 Rename 步骤二创建的表
6、 删除原ODS数据只保留至前7天
hdfs dfs -rm -r /apps/hive/ods_log_tmp/ods_adv/ods_sdk_fre_4/dt=2018-12-29
OOZIE:
<action name="fs-debd">
<fs>
<delete path='${nameNode}${delete_ods_data_path}/dt=${data_date_of_lastweek}'/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
7、 Msck 第六步的表
msck repair table ods_adv.ods_online_h5_basic_1;
8、 oozie调度开发(以mopub为例)
job.properties:
oozie.use.system.libpath=True
security_enabled=False
dryrun=False
workspace_52eceecc-0760-0201-c64a-0d1cd3a3b097=/user/hue/oozie/workspaces/advertisement/common/wf_kylin_build_time_cube
send_email=False
jobTracker=rm1
kylin_url=http://zzdz0420180014.hk.batmobi.cn:7070
nameNode=hdfs://ns
hive2_url=zzdz0420180015.hk.batmobi.cn:10000
wf_shell_path=/user/hue/oozie/workspaces/resources/common/script
data_date=2019-10-21
data_date_of_lastweek=2019-10-14
wf_data_path=/apps/hive/warehouse/ods_adv.db/ods_online_h5_m_1
oozie.wf.application.path=hdfs://ns/user/hue/oozie/workspaces/advertisement/online_mopub/workflows/wf_ods_online_mopub_user_event_info
oozie.libpath=hdfs://ns/user/hue/oozie/workspaces/advertisement/online_mopub/workflows/wf_ods_online_mopub_user_event_info/lib
wf_sql_path=/user/hue/oozie/workspaces/resources/advertisement/hiveserver2/online_mopub
delete_ods_data_path=/apps/hive/ods_log_tmp/ods_adv/ods_online_h5_m_1
workflow.xml:
<workflow-app name="wf_ods_online_mopub_user_event_info" xmlns="uri:oozie:workflow:0.5">
<start to="hive2-af51"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="hive2-af51" cred="hive2">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<prepare>
<delete path="${nameNode}${wf_data_path}/dt={data_date}/_SUCCESS"/>
</prepare>
<jdbc-url>jdbc:hive2://${hive2_url}/default</jdbc-url>
<script>${wf_sql_path}/hql_ods_online_h5_mopub_user_event_info.hql</script>
<param>data_date=${data_date}</param>
<file>/user/hue/oozie/workspaces/resources/advertisement/hiveserver2/online_mopub/hql_ods_online_h5_mopub_user_event_info.hql#hql_ods_online_h5_mopub_user_event_info.hql</file>
</hive2>
<ok to="decision-2ffc"/>
<error to="Email"/>
</action>
<action name="fs-c42f">
<fs>
<touchz path='${nameNode}${wf_data_path}/dt=${data_date}/_SUCCESS'/>
</fs>
<ok to="fs-debd"/>
<error to="Email"/>
</action>
<decision name="decision-2ffc">
<switch>
<case to="fs-c42f">
${fs:exists(concat(concat(wf_data_path,'/dt='),data_date))}</case>
<default to="End"/>
</switch>
</decision>
<action name="fs-debd">
<fs>
<delete path='${nameNode}${delete_ods_data_path}/dt=${data_date_of_lastweek}'/>
</fs>
<ok to="End"/>
<error to="Email"/>
</action>
<action name="Email">
<email xmlns="uri:oozie:email-action:0.2">
<to>zouyi@batmobi.net,lijialun@batmobi.net,moxuqiang@batmobi.net,chenhongwei@batmobi.net,chenhongwei8888@gmail.com</to>
<subject>wf_ods_online_mopub_user_event_info report</subject>
<body>wf_ods_online_mopub_user_event_info error</body>
<content_type>text/html; charset=UTF-8</content_type>
</email>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
job.properties:
oozie.use.system.libpath=True
security_enabled=False
dryrun=False
coord_kylin_url=http://zzdz0420180014.hk.batmobi.cn:7070
end_date=2050-04-21T00:20+0800
jobTracker=rm1
workspace_52eceecc-0760-0201-c64a-0d1cd3a3b097=/user/hue/oozie/workspaces/advertisement/common/wf_kylin_build_time_cube
hive2_url=zzdz0420180015.hk.batmobi.cn:10000
nameNode=hdfs://ns
coord_wf_shell_path=/user/hue/oozie/workspaces/resources/common/script
coord_data_path=/apps/hive/warehouse/ods_adv.db/ods_online_h5_m_1
oozie.coord.application.path=/user/hue/oozie/workspaces/advertisement/online_mopub/coordinators/coord_wf_ods_online_mopub_user_event_info
coord_wf_sql_path=/user/hue/oozie/workspaces/resources/advertisement/hiveserver2/online_mopub
coord_data_path_decision=/apps/hive/warehouse/ods_adv.db/ods_online_h5_m_1
coord_project_name=advt_online_mopub
coord_oozie_wf_applilcation_path=hdfs://ns/user/hue/oozie/workspaces/advertisement/online_mopub/workflows/wf_ods_online_mopub_user_event_info
coord_cube_name=cube_advt_online_mopub_adt_online_mopub_user_event_info_1_h
wf_application_path=hdfs://ns/user/hue/oozie/workspaces/advertisement/online_mopub/workflows/wf_ods_online_mopub_user_event_info
start_date=2019-10-21T14:00+0800
coord_delete_ods_data_path=/apps/hive/ods_log_tmp/ods_adv/ods_online_h5_m_1
coordinator.xml:
<coordinator-app name="coord_ods_online_mopub_user_event_info"
frequency="15 * * * *"
start="${start_date}" end="${end_date}" timezone="Asia/Shanghai"
xmlns="uri:oozie:coordinator:0.2"
>
<controls>
<timeout>-1</timeout>
<execution>FIFO</execution>
</controls>
<datasets>
<dataset name="wf_data_path_decision" frequency="${coord:minutes(5)}"
initial-instance="${start_date}" timezone="Asia/Shanghai">
<uri-template>${nameNode}/apps/hive/warehouse/ods_adv.db/ods_online_h5_m_1/dt=${YEAR}-${MONTH}-${DAY}</uri-template>
<done-flag> </done-flag>
</dataset>
</datasets>
<input-events>
<data-in name="wf_data_path_decision" dataset="wf_data_path_decision">
<instance>${coord:current(-119)}</instance>
</data-in>
</input-events>
<action>
<workflow>
<app-path>${wf_application_path}</app-path>
<configuration>
<property>
<name>wf_data_path_decision</name>
<value>${coord:dataIn('wf_data_path_decision')}</value>
</property>
<property>
<name>project_name</name>
<value>${coord_project_name}</value>
</property>
<property>
<name>data_time</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'HOUR'), 'yyyy-MM-dd HH:mm:ss')}</value>
</property>
<property>
<name>data_hour</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'HOUR'), 'HH')}</value>
</property>
<property>
<name>wf_data_path</name>
<value>${coord_data_path}</value>
</property>
<property>
<name>delete_ods_data_path</name>
<value>${coord_delete_ods_data_path}</value>
</property>
<property>
<name>data_date_of_lastweek</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -8, 'DAY'), 'yyyy-MM-dd')}</value>
</property>
<property>
<name>oozie.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.wf.application.path</name>
<value> ${coord_oozie_wf_applilcation_path}</value>
</property>
<property>
<name>cube_name</name>
<value>${coord_cube_name}</value>
</property>
<property>
<name>wf_sql_path</name>
<value>${coord_wf_sql_path}</value>
</property>
<property>
<name>kylin_url</name>
<value>${coord_kylin_url}</value>
</property>
<property>
<name>hive2_url</name>
<value>${hive2_url}</value>
</property>
<property>
<name>data_date</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'HOUR'), 'yyyy-MM-dd')}</value>
</property>
<property>
<name>data_before</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -2, 'DAY'), 'yyyy-MM-dd')}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>True</value>
</property>
<property>
<name>start_date</name>
<value>${start_date}</value>
</property>
<property>
<name>end_date</name>
<value>${end_date}</value>
</property>
<property>
<name>coord_oozie_wf_applilcation_path</name>
<value>hdfs://ns/user/hue/oozie/workspaces/advertisement/online_mopub/workflows/wf_ods_online_mopub_user_event_info</value>
</property>
<property>
<name>oozie.coord.application.path</name>
<value>/user/hue/oozie/workspaces/advertisement/online_mopub/coordinators/coord_wf_ods_online_mopub_user_event_info</value>
</property>
<property>
<name>wf_shell_path</name>
<value>${coord_wf_shell_path}</value>
</property>
</configuration>
</workflow>
</action>
</coordinator-app>
在bdt的coordinator.xml:
红框一:
<datasets>
<dataset name="wf_data_path_decision" frequency="${coord:minutes(5)}"
initial-instance="${start_date}" timezone="Asia/Shanghai">
<uri-template>${nameNode}${coord_data_path_decision}/dt=${YEAR}-${MONTH}-${DAY}</uri-template>
<done-flag>_SUCCESS</done-flag>
</dataset>
</datasets>
<input-events>
<data-in name="wf_data_path_decision" dataset="wf_data_path_decision">
<instance>${coord:current(-119)}</instance>
</data-in>
</input-events>
红框二:
<property>
<name>wf_data_path_decision</name>
<value>${coord:dataIn('wf_data_path_decision')}</value>
</property>
Oozie 两个数据原表(ods_1、ods_2)的:
job.Properties:
oozie.use.system.libpath=True
security_enabled=False
dryrun=False
workspace_52eceecc-0760-0201-c64a-0d1cd3a3b097=/user/hue/oozie/workspaces/advertisement/common/wf_kylin_build_time_cube
send_email=False
jobTracker=rm1
kylin_url=http://zzdz0420180014.hk.batmobi.cn:7070
nameNode=hdfs://ns
hive2_url=zzdz0420180015.hk.batmobi.cn:10000
wf_shell_path=/user/hue/oozie/workspaces/resources/common/script
oozie.wf.validate.ForkJoin = false
data_date=2019-10-23
data_date_of_lastweek=2018-01-01
wf_data_path=/apps/hive/warehouse/ods_adv.db/ods_advt_user_operation_basic_tmp
oozie.wf.application.path=hdfs://ns/user/hue/oozie/workspaces/advertisement/advt/workflows/wf_ods_advt_user_operation_basic
oozie.libpath=hdfs://ns/user/hue/oozie/workspaces/advertisement/advt/workflows/wf_ods_advt_user_operation_basic/lib
wf_sql_path=/user/hue/oozie/workspaces/resources/advertisement/hiveserver2/advt
delete_ods_data_path_1=/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_1
delete_ods_data_path_2=/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_2
workflow.xml:
<workflow-app name="wf_ods_advt_user_operation_basic" xmlns="uri:oozie:workflow:0.5">
<start to="hive2-af51"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="hive2-af51" cred="hive2">
<hive2 xmlns="uri:oozie:hive2-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<prepare>
<delete path="${nameNode}${wf_data_path}/dt={data_date}/_SUCCESS"/>
</prepare>
<jdbc-url>jdbc:hive2://${hive2_url}/default</jdbc-url>
<script>${wf_sql_path}/hql_ods_advt_user_operation_basic.hql</script>
<param>data_date=${data_date}</param>
<file>/user/hue/oozie/workspaces/resources/advertisement/hiveserver2/advt/hql_ods_advt_user_operation_basic.hql#hql_ods_advt_user_operation_basic.hql</file>
</hive2>
<ok to="decision-2ffc"/>
<error to="Email"/>
</action>
<action name="fs-c42f">
<fs>
<touchz path='${nameNode}${wf_data_path}/dt=${data_date}/_SUCCESS'/>
</fs>
<ok to="fork-0b64"/>
<error to="Email"/>
</action>
<decision name="decision-2ffc">
<switch>
<case to="fs-c42f">
${fs:exists(concat(concat(wf_data_path,'/dt='),data_date))}</case>
<default to="fork-0b64"/>
</switch>
</decision>
<action name="fs-debd">
<fs>
<delete path='${nameNode}${delete_ods_data_path_1}/dt=${data_date_of_lastweek}'/>
</fs>
<ok to="join-78d1"/>
<error to="Email"/>
</action>
<action name="fs-ddec">
<fs>
<delete path='${nameNode}${delete_ods_data_path_2}/dt=${data_date_of_lastweek}'/>
</fs>
<ok to="join-78d1"/>
<error to="Email"/>
</action>
<action name="Email">
<email xmlns="uri:oozie:email-action:0.2">
<to>zouyi@batmobi.net,lijialun@batmobi.net,moxuqiang@batmobi.net,chenhongwei@batmobi.net,chenhongwei8888@gmail.com</to>
<subject>wf_ods_advt_user_operation_basic report</subject>
<body>wf_ods_advt_user_operation_basic error</body>
<content_type>text/html; charset=UTF-8</content_type>
</email>
<ok to="join-78d1"/>
<error to="Kill"/>
</action>
<fork name="fork-0b64">
<path start="fs-debd" />
<path start="fs-ddec" />
</fork>
<join name="join-78d1" to="End"/>
<end name="End"/>
</workflow-app>
job.properties:
oozie.use.system.libpath=True
security_enabled=False
dryrun=False
coord_kylin_url=http://zzdz0420180014.hk.batmobi.cn:7070
end_date=2050-04-21T00:20+0800
jobTracker=rm1
workspace_52eceecc-0760-0201-c64a-0d1cd3a3b097=/user/hue/oozie/workspaces/advertisement/common/wf_kylin_build_time_cube
hive2_url=zzdz0420180015.hk.batmobi.cn:10000
nameNode=hdfs://ns
coord_wf_shell_path=/user/hue/oozie/workspaces/resources/common/script
oozie.wf.validate.ForkJoin=false
coord_data_path=/apps/hive/warehouse/ods_adv.db/ods_advt_user_operation_basic_tmp
oozie.coord.application.path=/user/hue/oozie/workspaces/advertisement/advt/coordinators/coord_wf_ods_advt_user_operation_basic
coord_wf_sql_path=/user/hue/oozie/workspaces/resources/advertisement/hiveserver2/advt
coord_data_path_decision_1=/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_1
coord_data_path_decision_2=/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_2
coord_project_name=advt_advt
coord_oozie_wf_applilcation_path=hdfs://ns/user/hue/oozie/workspaces/advertisement/advt/workflows/wf_ods_advt_user_operation_basic
coord_cube_name=cube_advt_advt_adt_advt_user_event_info_1_h
wf_application_path=hdfs://ns/user/hue/oozie/workspaces/advertisement/advt/workflows/wf_ods_advt_user_operation_basic
start_date=2019-10-23T00:00+0800
coord_delete_ods_data_path_1=/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_1
coord_delete_ods_data_path_2=/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_2
coordinator.xml:
<coordinator-app name="coord_wf_ods_advt_user_operation_basic"
frequency="15 0 * * *"
start="${start_date}" end="${end_date}" timezone="Asia/Shanghai"
xmlns="uri:oozie:coordinator:0.2"
>
<controls>
<timeout>-1</timeout>
<execution>FIFO</execution>
</controls>
<datasets>
<dataset name="wf_data_path_decision_1" frequency="${coord:minutes(5)}"
initial-instance="${start_date}" timezone="Asia/Shanghai">
<uri-template>${nameNode}/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_1/dt=${YEAR}-${MONTH}-${DAY}</uri-template>
<done-flag> </done-flag>
</dataset>
<dataset name="wf_data_path_decision_2" frequency="${coord:minutes(5)}"
initial-instance="${start_date}" timezone="Asia/Shanghai">
<uri-template>${nameNode}/apps/hive/ods_log_tmp/ods_adv/ods_advt_user_operation_basic_2/dt=${YEAR}-${MONTH}-${DAY}</uri-template>
<done-flag> </done-flag>
</dataset>
</datasets>
<input-events>
<data-in name="wf_data_path_decision_1" dataset="wf_data_path_decision_1">
<instance>${coord:current(-119)}</instance>
</data-in>
<data-in name="wf_data_path_decision_2" dataset="wf_data_path_decision_2">
<instance>${coord:current(-119)}</instance>
</data-in>
</input-events>
<action>
<workflow>
<app-path>${wf_application_path}</app-path>
<configuration>
<property>
<name>wf_data_path_decision_1</name>
<value>${coord:dataIn('wf_data_path_decision_1')}</value>
</property>
<property>
<name>wf_data_path_decision_2</name>
<value>${coord:dataIn('wf_data_path_decision_2')}</value>
</property>
<property>
<name>project_name</name>
<value>${coord_project_name}</value>
</property>
<property>
<name>data_time</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'HOUR'), 'yyyy-MM-dd HH:mm:ss')}</value>
</property>
<property>
<name>data_hour</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'HOUR'), 'HH')}</value>
</property>
<property>
<name>wf_data_path</name>
<value>${coord_data_path}</value>
</property>
<property>
<name>delete_ods_data_path_1</name>
<value>${coord_delete_ods_data_path_1}</value>
</property>
<property>
<name>delete_ods_data_path_2</name>
<value>${coord_delete_ods_data_path_2}</value>
</property>
<property>
<name>data_date_of_lastweek</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -8, 'DAY'), 'yyyy-MM-dd')}</value>
</property>
<property>
<name>oozie.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.wf.application.path</name>
<value> ${coord_oozie_wf_applilcation_path}</value>
</property>
<property>
<name>cube_name</name>
<value>${coord_cube_name}</value>
</property>
<property>
<name>wf_sql_path</name>
<value>${coord_wf_sql_path}</value>
</property>
<property>
<name>kylin_url</name>
<value>${coord_kylin_url}</value>
</property>
<property>
<name>hive2_url</name>
<value>${hive2_url}</value>
</property>
<property>
<name>data_date</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -1, 'HOUR'), 'yyyy-MM-dd')}</value>
</property>
<property>
<name>data_before</name>
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), -2, 'DAY'), 'yyyy-MM-dd')}</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>True</value>
</property>
<property>
<name>start_date</name>
<value>${start_date}</value>
</property>
<property>
<name>end_date</name>
<value>${end_date}</value>
</property>
<property>
<name>coord_oozie_wf_applilcation_path</name>
<value>hdfs://ns/user/hue/oozie/workspaces/advertisement/advt/workflows/wf_ods_advt_user_operation_basic</value>
</property>
<property>
<name>oozie.coord.application.path</name>
<value>/user/hue/oozie/workspaces/advertisement/advt/coordinators/coord_wf_ods_advt_user_operation_basic</value>
</property>
<property>
<name>wf_shell_path</name>
<value>${coord_wf_shell_path}</value>
</property>
</configuration>
</workflow>
</action>
</coordinator-app>