阿里数据湖基本操作(踩坑小能手)

数据湖笔记

DLA针对OSS建表文档:
Data Lake Analytics +OSS数据文件格式处理大全:

https://yq.aliyun.com/articles/623246

Data Lake 文档:

https://help.aliyun.com/product/70174.html?spm=a2c4g.11174283.6.540.71ff7a92YAvE4q

杀掉进程
kill query q201908151931zjkb589d03d0003344
查看分区
show partitions log_push_dla_cleaned_hour_1906  partition (pure_date =20190607)
转时间戳
UNIX_TIMESTAMP(date) as '推送时间戳'
unicode转中文
decode_hex_unicode(site_title)
清洗
/*+ run-async=true,addExtraExchangesBeforeTablescan=true*/
分析查询池子
/*+ pool-selector=medimum,  run-async=true */
/*+ pool-selector=dla-1408722774623865-eas6x7dh, run-async=true* /
spark
/*+ run-async=true,engine=spark, spark-mode=thrift-server*/
异步小查询
/*+run-async=true,pool-selector=5min */
修改分隔符
/*+ run-async=true, result-col-del=","*/
修改文件名
/*+ result-file-name=yourFile */
修改时间-n
to_char(sysdate() - interval 5 day, 'yyyyMMdd')
异步执行
/*+run-async=true*/   
select count(distinct dmp_tag) from table where pure_date  = 20190501 
查询状态
   show query_task 
    show query_task where task_name = 'SELECT' and id= 'q201905151559zjke18865d10023553'  
更新表
MSCK REPAIR TABLE table
创建临时表
  CREATE EXTERNAL TABLE   month_01_pv_pure_date (
    pure_date  int,
    pv string
    ) 
    ROW FORMA   T DELIMITED FIELDS TERMINATED BY '|' 
    STORED AS TEXTFILE 
    LOCATION  
    'oss://.../result.csv'
创建表然后insert
  CREATE external TABLE dmp_toutiao_mz (
    rmid string,
    mtype string,
    publisher_id int,
    campaign_id int,
    order_id int ,
    user_yearsold int,
    user_gender int ,
    dmp_tag string
    )
    STORED AS PARQUET
    LOCATION 'oss://a.../mz/'
    TBLPROPERTIES (
    'auto.create.location' = 'true'
    )
    /*+run-async=true*/   
    insert overwrite  `dmp_toutiao_mz`
    select 
    rmid,mtype,publisher_id,campaign_id,order_id,user_year    sold,user_gender,dmp_tag
    from tables
    where  pure_date >=20190401 and pure_date <= 20190430     and campaign_id in     (1,2,3)
    and mtype in ('idfa','ios','imei') 
    and valid = true;
数据湖传输表:
vertica:
create table data.bc_nx(
bc varchar(128),
nx varchar(128)
);
  copy DATA.bc_nx from local   'C:\Users\QuanSir\Desktop\bc64.txt' DELIMITER   E'\t' abort on error; 
     
  export to parquet (directory=   's3://.../bc_nx') 
  AS select * from data.bc_nx
数据湖:
SHOW CREATE TABLE  'addnewer_data_hb3'.bc_nx
LIKE MAPPING ('oss://.../bc_nx    ');(会出现一个创表语句如下copy运行就ok)
CREATE EXTERNAL TABLE `bc_nx` (
	`bc` string,
	`nx` string
)
STORED AS `PARQUET`
LOCATION 'oss://.../bc_nx'
TBLPROPERTIES (
	'create.table.like.file' =     'oss://.../bc_nx'
)
select * from `bc_nx`    
数据湖创建指定文件的表
show create table rm_report_basicinfo LIKE MAPPING ('oss://.../801073227520.parquet');
vertica传输到数据湖
vertica:
export to parquet (directory=   's3://.../bc_nx_id') 
AS select * from data.bc_nx;
数据湖:
SHOW CREATE TABLE rmos_order_2018
LIKE MAPPING ('oss://.../bc_nx_id');
数据湖传输vertica
CREATE  external TABLE data.(

) as copy from 's3://.../*'  parquet
CTAS
第一步 创表
create external  table dla_result.06_sum  
STORED AS parquet 
LOCATION  
'oss://.../06_sum/' --路径必须自己写
TBLPROPERTIES (
'auto.create.location' = 'true' --允许自动创建新目录
)like 
select pure_date, hour,count(*) from   log_push_dla_cleaned_hour_1906  where pure_date between 20190601 and 20190610 group by 1,2 

第二步 insert
/*+run-async=true*/
insert into dla_result.06_sum 
select pure_date, hour,count(*) from   log_push_dla_cleaned_hour_1906  where pure_date between 20190601 and 20190610 group by 1,2 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值