以下试验证明各自特点。
create table t1_t ( start_time timestamp ,cdr_id decimal(14,1)) stored as parquet;
create table t1_s ( start_time string ,cdr_id decimal(14,1)) stored as parquet;
1、转换
[cdh1:21000] > insert into t1_t(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205;
Query: insert into t1_t(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205
Inserted 93443540 row(s) in 15.66s
[cdh1:21000] >
[cdh1:21000] >
[cdh1:21000] > insert into t1_s(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205;
Query: insert into t1_s(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205
ERROR: AnalysisException: Possible loss of precision for target table 'db1.t1_s'.
Expression 'start_time' (type: TIMESTAMP) would need to be cast to STRING for column 'start_time'
[cdh1:21000] >
[cdh1:21000] > insert into t1_s(start_time,cdr_id) select cast(start_time as string),cdr_id from ds_clt_ps_cdr_all_20141205;
Query: insert into t1_s(start_time,cdr_id) select cast(start_time as string),cdr_id from ds_clt_ps_cdr_all_20141205
Inserted 93443540 row(s) in 146.34s
cast转换timestamp到string需要耗费10倍时间。
2、存储
1440945552 4322836656 /user/hive/warehouse/db1.db/t1_s
1123422908 3370268724 /user/hive/warehouse/db1.db/t1_t
string需要的存储空间大于timestamp。
3、入库
timestamp类型
Query: insert overwrite ds_clt_ps_cdr_all_20141205 partition (hour=03) select * from ds_clt_ps_cdr_all_20141205_03
Inserted 31114145 row(s) in 373.19s
5808422476 17425267428 /user/hive/warehouse/db1.db/ds_clt_ps_cdr_all_20141205/hour=3
string类型
[cdh1:21000] > insert into t1_ps partition (hour=03) select * from t1_allcol;
Query: insert into t1_ps partition (hour=03) select * from t1_allcol
Inserted 31114145 row(s) in 411.73s
6047395281 18142185843 /user/hive/warehouse/db1.db/t1_ps/hour=3
4、查询1
[cdh1:21000] > SELECT substr(cast(start_time as string),1,10) FROM t1_t
> group by substr(cast(start_time as string),1,10),
> cdr_id
> limit 10;
Query: select substr(cast(start_time as string),1,10) FROM t1_t
group by substr(cast(start_time as string),1,10),
cdr_id
limit 10
+-------------------------------------------+
| substr(cast(start_time as string), 1, 10) |
+-------------------------------------------+
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
+-------------------------------------------+
Fetched 10 row(s) in 234.14s
[cdh1:21000] > SELECT substr(start_time ,1,10) FROM t1_s
> group by substr(start_time ,1,10),
> cdr_id
> limit 10
> ;
Query: select substr(start_time ,1,10) FROM t1_s
group by substr(start_time ,1,10),
cdr_id
limit 10
+---------------------------+
| substr(start_time, 1, 10) |
+---------------------------+
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
+---------------------------+
Fetched 10 row(s) in 29.35s
由于需要cast转换,查询效率也接近10倍。
5、查询2
[cdh1:21000] > SELECT year(start_time) FROM t1_t
> group by year(start_time),cdr_id
> limit 10;
Query: select year(start_time) FROM t1_t
group by year(start_time),cdr_id
limit 10
+------------------+
| year(start_time) |
+------------------+
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
+------------------+
Fetched 10 row(s) in 28.31s
[cdh1:21000] > SELECT substr(start_time,1,4) FROM t1_s
> group by substr(start_time,1,4),cdr_id
> limit 10;
Query: select substr(start_time,1,4) FROM t1_s
group by substr(start_time,1,4),cdr_id
limit 10
+--------------------------+
| substr(start_time, 1, 4) |
+--------------------------+
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
+--------------------------+
Fetched 10 row(s) in 30.30s
使用各自特性,两者查询相差不大。
create table t1_t ( start_time timestamp ,cdr_id decimal(14,1)) stored as parquet;
create table t1_s ( start_time string ,cdr_id decimal(14,1)) stored as parquet;
1、转换
[cdh1:21000] > insert into t1_t(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205;
Query: insert into t1_t(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205
Inserted 93443540 row(s) in 15.66s
[cdh1:21000] >
[cdh1:21000] >
[cdh1:21000] > insert into t1_s(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205;
Query: insert into t1_s(start_time,cdr_id) select start_time,cdr_id from ds_clt_ps_cdr_all_20141205
ERROR: AnalysisException: Possible loss of precision for target table 'db1.t1_s'.
Expression 'start_time' (type: TIMESTAMP) would need to be cast to STRING for column 'start_time'
[cdh1:21000] >
[cdh1:21000] > insert into t1_s(start_time,cdr_id) select cast(start_time as string),cdr_id from ds_clt_ps_cdr_all_20141205;
Query: insert into t1_s(start_time,cdr_id) select cast(start_time as string),cdr_id from ds_clt_ps_cdr_all_20141205
Inserted 93443540 row(s) in 146.34s
cast转换timestamp到string需要耗费10倍时间。
2、存储
1440945552 4322836656 /user/hive/warehouse/db1.db/t1_s
1123422908 3370268724 /user/hive/warehouse/db1.db/t1_t
string需要的存储空间大于timestamp。
3、入库
timestamp类型
Query: insert overwrite ds_clt_ps_cdr_all_20141205 partition (hour=03) select * from ds_clt_ps_cdr_all_20141205_03
Inserted 31114145 row(s) in 373.19s
5808422476 17425267428 /user/hive/warehouse/db1.db/ds_clt_ps_cdr_all_20141205/hour=3
string类型
[cdh1:21000] > insert into t1_ps partition (hour=03) select * from t1_allcol;
Query: insert into t1_ps partition (hour=03) select * from t1_allcol
Inserted 31114145 row(s) in 411.73s
6047395281 18142185843 /user/hive/warehouse/db1.db/t1_ps/hour=3
4、查询1
[cdh1:21000] > SELECT substr(cast(start_time as string),1,10) FROM t1_t
> group by substr(cast(start_time as string),1,10),
> cdr_id
> limit 10;
Query: select substr(cast(start_time as string),1,10) FROM t1_t
group by substr(cast(start_time as string),1,10),
cdr_id
limit 10
+-------------------------------------------+
| substr(cast(start_time as string), 1, 10) |
+-------------------------------------------+
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
+-------------------------------------------+
Fetched 10 row(s) in 234.14s
[cdh1:21000] > SELECT substr(start_time ,1,10) FROM t1_s
> group by substr(start_time ,1,10),
> cdr_id
> limit 10
> ;
Query: select substr(start_time ,1,10) FROM t1_s
group by substr(start_time ,1,10),
cdr_id
limit 10
+---------------------------+
| substr(start_time, 1, 10) |
+---------------------------+
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
| 2014-12-05 |
+---------------------------+
Fetched 10 row(s) in 29.35s
由于需要cast转换,查询效率也接近10倍。
5、查询2
[cdh1:21000] > SELECT year(start_time) FROM t1_t
> group by year(start_time),cdr_id
> limit 10;
Query: select year(start_time) FROM t1_t
group by year(start_time),cdr_id
limit 10
+------------------+
| year(start_time) |
+------------------+
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
+------------------+
Fetched 10 row(s) in 28.31s
[cdh1:21000] > SELECT substr(start_time,1,4) FROM t1_s
> group by substr(start_time,1,4),cdr_id
> limit 10;
Query: select substr(start_time,1,4) FROM t1_s
group by substr(start_time,1,4),cdr_id
limit 10
+--------------------------+
| substr(start_time, 1, 4) |
+--------------------------+
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
| 2014 |
+--------------------------+
Fetched 10 row(s) in 30.30s
使用各自特性,两者查询相差不大。
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/10037372/viewspace-1471785/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/10037372/viewspace-1471785/