环境:
- hadoop1.2.1集群
- Hive1.2.0
- mysql server
- Redhat-6.6 Server.x86_64
1 导入作业数据集total.csv到Hive中,用日期做为分区表的分区ID。要求有描述型文字,Hive脚本和截图。
回购表数据:
tradedate:交易日期
tradetime:交易时间
securityid:股票ID
bidpx1:买一
bidsize1:买一交易量
offerpx1:卖一
bidsize1:卖一交易量
数据存储格式:
tradedate,tradetime,securityid,bidpx1,bidsize1,offerpx1,offersize1
数据准备:
localhost:homework11 apple$ scptotal.csv hadoop@hadoop0:/home/hadoop/Downloads/Data/
[hadoop@hadoop0Data]$ head total.csv
- [hadoop@hadoop0 Data]$ head total.csv
- 20130722,130042,131810,3.415,915890.0,3.42,8960.0
- 20130722,130516,131810,3.421,405180.0,3.422,14950.0
- 20130722,130752,131810,3.431,232070.0,3.435,620.0
- 20130722,131001,131810,3.453,136520.0,3.46,7400.0
- 20130722,130901,131810,3.451,138740.0,3.452,31330.0
- 20130722,130449,131810,3.42,1522850.0,3.421,22900.0
- 20130722,130254,131810,3.415,428150.0,3.416,2430.0
- 20130722,130713,131810,3.43,476160.0,3.435,2990.0
- 20130722,130540,131810,3.421,304720.0,3.422,1050.0
- 20130722,130931,131810,3.45,1742420.0,3.451,13700.0
#删除表
drop table if exists t_reverse_repurchase_tmp;
#创建临时表t_reverse_repurchase_tmp,分隔符','。
CREATE TABLE t_reverse_repurchase_tmp(tradedate STRING,tradetime STRING, securityid STRING, bidpx1 DOUBLE, bidsize1BIGINT,offerpx1 DOUBLE,offersize1 BIGINT) ROW FORMAT DELIMITED FIELDSTERMINATED BY ',';
#导入数据到t_reverse_repurchase_tmp。
LOAD DATA LOCAL INPATH'/home/hadoop/Downloads/Data/total.csv' OVERWRITE INTO TABLEt_reverse_repurchase_tmp;
#查看导入的数据:
hive> select * from t_reverse_repurchase_tmp limit 10;
- hive> select * from t_reverse_repurchase_tmp limit 10;
- OK
- 20130722 130042 131810 3.415 915890 3.42 8960
- 20130722 130516 131810 3.421 405180 3.422 14950
- 20130722 130752 131810 3.431 232070 3.435 620
- 20130722 131001 131810 3.453 136520 3.46 7400
- 20130722 130901 131810 3.451 138740 3.452 31330
- 20130722 130449 131810 3.42 1522850 3.421 22900
- 20130722 130254 131810 3.415 428150 3.416 2430
- 20130722 130713 131810 3.43 476160 3.435 2990
- 20130722 130540 131810 3.421 304720 3.422 1050
- 20130722 130931 131810 3.45 1742420 3.451 13700
- Time taken: 0.056 seconds, Fetched: 10 row(s)
- hive>
#查看表描述
hive> DESCRIBE t_reverse_repurchase_tmp;
- hive> DESCRIBE t_reverse_repurchase_tmp;
- OK
- tradedate string
- tradetime string
- securityid string
- bidpx1 double
- bidsize1 bigint
- offerpx1 double
- offersize1 bigint
- Time taken: 0.095 seconds, Fetched: 7 row(s)
staticpartition常适用于使用处理时间作为partition key的例子。
但是这里需要向分区表中插入大量数据,并且插入前不清楚数据归属的partition,此时,我们需要dynamic partition。使用动态分区需要设置hive.exec.dynamic.partition参数值为true。
hive>use mydb
#删除表
hive>drop table if exists t_reverse_repurchase;
#建表
hive>CREATE TABLE t_reverse_repurchase (tradetime STRING, securityidSTRING, bidpx1 DOUBLE, bidsize1 BIGINT,offerpx1 DOUBLE,offersize1 BIGINT)PARTITIONED BY (tradedate STRING);
#设置参数开启dynamic partition
hive> sethive.exec.dynamic.partition.mode=nonstrict;
hive> sethive.exec.dynamic.partition=true;
#将数据从t_reverse_repurchase_tmp加载到t_reverse_repurchase中
INSERT OVERWRITE TABLEt_reverse_repurchase PARTITION (tradedate) SELECT a.tradetime astradetime,a.securityid as securityid, a.bidpx1 as bidpx1, a.bidsize1 asbidsize1,a.offerpx1 as fferpx1,a.offersize1 as offersize1, a.tradedate astradedate FROM t_reverse_repurchase_tmp a;
#查看导入的数据:
hive> select * from t_reverse_repurchase wheretradedate=='20130723' and securityid='204001' limit 10;
- hive> select * from t_reverse_repurchase where tradedate=='20130723' and securityid='204001' limit 10;
- OK
- 145651 204001 2.65 104300 2.66 100 20130723
- 145011 204001 2.85 31300 2.9 8300 20130723
- 145451 204001 2.305 185800 2.31 17600 20130723
- 145616 204001 2.6 144200 2.61 200 20130723
- 145301 204001 2.22 74400 2.25 800 20130723
- 145341 204001 2.3 116000 2.38 1000 20130723
- 145211 204001 2.6 177700 2.625 4200 20130723
- 145121 204001 2.725 16000 2.73 600 20130723
- 145041 204001 2.78 153300 2.8 8700 20130723
- 145431 204001 2.31 85900 2.38 300 20130723
- Time taken: 0.079 seconds, Fetched: 10 row(s)
- hive>
#查看表的描述信息
hive> DESCRIBE t_reverse_repurchase;
- hive> DESCRIBE t_reverse_repurchase;
- OK
- tradetime string
- securityid string
- bidpx1 double
- bidsize1 bigint
- offerpx1 double
- offersize1 bigint
- tradedate string
-
- # Partition Information
- # col_name data_type comment
-
- tradedate string
- Time taken: 0.08 seconds, Fetched: 12 row(s)
#查看表的分区存储情况
hive>show partitions t_reverse_repurchase;
- hive> show partitions t_reverse_repurchase;
- OK
- tradedate=20130722
- tradedate=20130723
- tradedate=20130724
- tradedate=20130725
- tradedate=20130726
- Time taken: 0.116 seconds, Fetched: 5 row(s)
- hive>