hive数据压缩 snappy
# Install prebuilt Snappy-enabled native libraries into Hadoop 2.5.0.
mkdir 2.5.0-native-snappy
# Unpack the native-library tarball into the new directory.
tar -zxvf 2.5.0-native-snappy.tar.gz -C 2.5.0-native-snappy
cd hadoop-2.5.0/lib
# Set aside the original native libraries as a backup.
mv native/ 250native
mkdir native
# Copy the Snappy-enabled natives into place.
cp /home/soft/2.5.0-native-snappy/* native/
# Verify which native codecs Hadoop can load.
# NOTE(review): after `cd hadoop-2.5.0/lib` the relative path `bin/hadoop`
# would not resolve from here — presumably a `cd ..` back to the hadoop
# root happened first; confirm against the original session.
bin/hadoop checknative
#结果如下
18/04/16 12:30:25 INFO bzip2.Bzip2Factory: Successfully loaded & initialized native-bzip2 library system-native
18/04/16 12:30:25 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
Native library checking:
hadoop: true /opt/modules/hadoop-2.5.0/lib/native/libhadoop.so
zlib: true /lib64/libz.so.1
snappy: true /opt/modules/hadoop-2.5.0/lib/native/libsnappy.so.1
lz4: true revision:99
bzip2: true /lib64/libbz2.so.1
运行一个mapreduce程序
# Create the input directory on HDFS.
bin/hdfs dfs -mkdir -p /user/jianxin/mapreduce/wordcount/input
# Upload the sample input file.
bin/hdfs dfs -put /home/datas/mc.input /user/jianxin/mapreduce/wordcount/input
# Run the example wordcount job (no intermediate compression).
bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount /user/jianxin/mapreduce/wordcount/input /user/jianxin/mapreduce/wordcount/output
# Inspect the reducer output.
bin/hdfs dfs -cat /user/jianxin/mapreduce/wordcount/output/part-r-00000
# Same job, but with Snappy compression of the map-side (intermediate) output,
# enabled per-job via -D properties.
bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount -D mapreduce.map.output.compress=true -D mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec /user/jianxin/mapreduce/wordcount/input /user/jianxin/mapreduce/wordcount/output22
#查看历史记录服务器
http://hadoop.jianxin.com:19888/jobhistory
hive数据表的存储
- TEXTFILE
-- Baseline table: tab-delimited plain text, one row per page view.
CREATE TABLE page_views(
  track_time  STRING,
  url         STRING,
  session_id  STRING,
  referer     STRING,
  ip          STRING,
  end_user_id STRING,
  city_id     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
hive (default)> load data local inpath '/home/datas/page_views.data' into table page_views;
#查看文件的大小
hive (default)> dfs -du -h /user/hive/warehouse/page_views
18.1 M /user/hive/warehouse/page_views/page_views.data
- orc
-- Same schema stored as ORC (columnar; defaults to zlib compression).
CREATE TABLE page_views_orc(
  track_time  STRING,
  url         STRING,
  session_id  STRING,
  referer     STRING,
  ip          STRING,
  end_user_id STRING,
  city_id     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC;
-- BUG in the original notes: LOAD DATA only moves the source file into the
-- table directory — Hive performs no format conversion — so loading a plain
-- text file into an ORC table leaves unreadable (non-ORC) data behind.
-- Populate ORC tables by rewriting the rows instead:
INSERT INTO TABLE page_views_orc SELECT * FROM page_views;
hive (default)> dfs -du -h /user/hive/warehouse/page_views_orc
2.6 M /user/hive/warehouse/page_views_orc/000000_0
- parquet
-- Same schema stored as Parquet (columnar; uncompressed unless a
-- compression codec is set at write time).
CREATE TABLE page_views_parquet(
  track_time  STRING,
  url         STRING,
  session_id  STRING,
  referer     STRING,
  ip          STRING,
  end_user_id STRING,
  city_id     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET;
-- BUG in the original notes: LOAD DATA moves the text file verbatim into the
-- Parquet table's directory with no conversion — the 18.1M size reported
-- below (identical to the raw file) confirms it. Rewrite the rows instead:
INSERT INTO TABLE page_views_parquet SELECT * FROM page_views;
dfs -du -h /user/hive/warehouse/page_views_parquet
18.1 M /user/hive/warehouse/page_views_parquet/page_views.data
- snappy
-- ORC storage with Snappy compression selected via a table property.
CREATE TABLE page_views_orc_snappy(
  track_time  STRING,
  url         STRING,
  session_id  STRING,
  referer     STRING,
  ip          STRING,
  end_user_id STRING,
  city_id     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC TBLPROPERTIES ("orc.compress"="SNAPPY");
-- BUG in the original notes: LOAD DATA copies the text file as-is, so the
-- ORC/Snappy settings never apply — the 18.1M size reported below (same as
-- the raw file) confirms no conversion happened. Rewrite the rows instead:
INSERT INTO TABLE page_views_orc_snappy SELECT * FROM page_views;
dfs -du -h /user/hive/warehouse/page_views_orc_snappy
18.1 M /user/hive/warehouse/page_views_orc_snappy/page_views.data
-- ORC storage with compression explicitly disabled, for size comparison
-- against the default (zlib) and Snappy variants.
CREATE TABLE page_views_orc_none(
  track_time  STRING,
  url         STRING,
  session_id  STRING,
  referer     STRING,
  ip          STRING,
  end_user_id STRING,
  city_id     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC TBLPROPERTIES ("orc.compress"="NONE");
-- BUG in the original notes: LOAD DATA copies the text file without
-- converting it to ORC (the 18.1M size below equals the raw file).
-- Rewrite the rows instead:
INSERT INTO TABLE page_views_orc_none SELECT * FROM page_views;
dfs -du -h /user/hive/warehouse/page_views_orc_none
hive (default)> dfs -du -h /user/hive/warehouse/page_views_orc_none;
18.1 M /user/hive/warehouse/page_views_orc_none/page_views.data
-- Parquet compression is chosen at write time through a session setting,
-- not TBLPROPERTIES; it applies to subsequent INSERT writes.
SET parquet.compression=SNAPPY;

CREATE TABLE page_views_parquet_snappy(
  track_time  STRING,
  url         STRING,
  session_id  STRING,
  referer     STRING,
  ip          STRING,
  end_user_id STRING,
  city_id     STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET;

-- INSERT ... SELECT rewrites the rows, so the new files come out as
-- Snappy-compressed Parquet.
INSERT INTO TABLE page_views_parquet_snappy SELECT * FROM page_views;
dfs -du -h /user/hive/warehouse/page_views_parquet_snappy/ ;
hive (default)> dfs -du -h /user/hive/warehouse/page_views_parquet_snappy/ ;
6.4 M /user/hive/warehouse/page_views_parquet_snappy/000000_0
总结
在实际的项目开发当中，hive 表的数据存储格式选用 orcfile 或者 parquet，数据压缩采用 snappy。