1、创建表
创建表:youtube_ori,youtube_user_ori
创建表:youtube_orc,youtube_user_orc
youtube_ori
hive> create table youtube_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
youtube_user_ori
hive> create table youtube_user_ori(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as textfile;
然后把原始数据插入到orc表中
youtube_orc
hive> create table youtube_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
clustered by (uploader) into 8 buckets
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc;
youtube_user_orc
hive> create table youtube_user_orc(
uploader string,
videos int,
friends int)
clustered by (uploader) into 24 buckets
row format delimited
fields terminated by "\t"
stored as orc;
2、导入ETL后的数据
youtube_ori
hive> load data inpath "/youtube/output/video/2008/0222" into table youtube_ori;
youtube_user_ori
hive> load data inpath "/youtube/user/2008/0903" into table youtube_user_ori;
3、向ORC表插入数据
youtube_orc
hive> insert into table youtube_orc select * from youtube_ori;
youtube_user_orc
hive> insert into table youtube_user_orc select * from youtube_user_ori;