-- Create the ODS table: define the schema and the storage format.
DROP TABLE IF EXISTS shufang.students;
CREATE TABLE IF NOT EXISTS shufang.students(
    id int,
    name string,
    create_time string
)
PARTITIONED BY (dt string)                          -- partitioned table, one partition per day
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'      -- tab-separated fields
STORED AS
    -- read LZO-compressed text; write plain text (keys dropped on output)
    INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/user/hive/warehouse/shufang.db/students'; -- HDFS storage path of the table

-- Load the data: no MapReduce job runs — the files in the source directory
-- are simply moved into the partition directory.
LOAD DATA INPATH '/origin_data/db/shufang/students/2021-01-18'
INTO TABLE shufang.students PARTITION(dt = '2021-01-18');

-- Log data arriving from Flume is only LZO-compressed and therefore not
-- splittable; after loading we must build an LZO index so the files
-- support input splits.
# Build the LZO index for the freshly loaded partition so its LZO files
# become splittable. NOTE: the original had no space before the backslash
# after "DistributedLzoIndexer", which fused the class name and the path
# into a single token when the shell joined the continued lines.
hadoop jar /opt/module/hadoop-2.7.7/share/hadoop/common/hadoop-lzo-0.4.21-SNAPSHOT.jar \
    com.hadoop.compression.lzo.DistributedLzoIndexer \
    /user/hive/warehouse/shufang.db/students/dt=2021-01-18
-- 2. From ODS to DWD (从ODS到DWD)
-- Create the DWD table: stored as Parquet, which is natively splittable.
CREATE TABLE IF NOT EXISTS student1(
    id int,
    name string,
    create_time string
)
COMMENT 'parquet store table,parquet is born to support split'
PARTITIONED BY (dt string)                            -- partition key
STORED AS parquet                                     -- Parquet picks the input/output formats under the hood
LOCATION '/user/hive/warehouse/shufang.db/student1'   -- HDFS storage path
TBLPROPERTIES('parquet.compression'='lzo');           -- table property: compression codec for Parquet

-- Move one day of data from the ODS table into the DWD table.
-- NOTE(review): unqualified "students" assumes the current database is
-- shufang (cf. shufang.students above) — confirm before running.
INSERT OVERWRITE TABLE student1 PARTITION(dt = '2021-01-18')
SELECT
    id,
    name,
    create_time
FROM students
WHERE dt = '2021-01-18';