下载依赖库
yum -y install lzo-devel zlib-devel gcc autoconf automake libtool
安装LZO
#1. 下载
[hadoop@hdp01 ~]$ cd /opt/software
[hadoop@hdp01 softWare]$ wget www.oberhumer.com/opensource/lzo/download/lzo-2.06.tar.gz
#2. 解压缩
[hadoop@hdp01 softWare]$ tar -zxvf lzo-2.06.tar.gz -C /opt/module/
#3. 安装
[hadoop@hdp01 softWare]$ cd /opt/module/lzo-2.06/
[hadoop@hdp01 lzo-2.06]$ export CFLAGS=-m64
#创建文件夹,用来存放编译之后的lzo
[hadoop@hdp01 lzo-2.06]$ mkdir complied_lzo
#指定编译之后的位置
[hadoop@hdp01 lzo-2.06]$ ./configure --enable-shared --prefix=/opt/module/lzo-2.06/complied_lzo
#开始编译安装
[hadoop@hdp01 lzo-2.06]$ make && make install
#4. 查看编译是否成功 只要有如下内容 就可以了
[hadoop@hdp01 lzo-2.06]$ cd complied_lzo/
[hadoop@hdp01 complied_lzo]$ ll
total 12
drwxrwxr-x 3 hadoop hadoop 4096 Dec 6 17:08 include
drwxrwxr-x 2 hadoop hadoop 4096 Dec 6 17:08 lib
drwxrwxr-x 3 hadoop hadoop 4096 Dec 6 17:08 share
[hadoop@hdp01 complied_lzo]$
#5. 最后将文件复制到其他节点对应位置
安装hadoop-lzo
#1. 下载
[hadoop@hdp01 softWare]$ wget https://github.com/twitter/hadoop-lzo/archive/master.zip
报错
解决方法
在 wget 命令中添加 "--no-check-certificate" 参数,如下所示:
wget --no-check-certificate 你要下载的SSL网址
wget https://github.com/twitter/hadoop-lzo/archive/master.zip --no-check-certificate
#2. 解压
[hadoop@hdp01 softWare]$ unzip master
#解压后的文件夹名:hadoop-lzo-master
[hadoop@hdp01 softWare]$ mv hadoop-lzo-master /opt/module/
[hadoop@hdp01 softWare]$ cd /opt/module/hadoop-lzo-master
#3. 修改配置,导入环境
[hadoop@hdp01 hadoop-lzo-master]$ vim pom.xml
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.current.version>2.7.2</hadoop.current.version> #这里修改成对应的hadoop版本号
<hadoop.old.version>1.0.4</hadoop.old.version>
[hadoop@hdp01 hadoop-lzo-master]$ export CFLAGS=-m64
[hadoop@hdp01 hadoop-lzo-master]$ export CXXFLAGS=-m64
[hadoop@hdp01 hadoop-lzo-master]$ export C_INCLUDE_PATH=/opt/module/lzo-2.06/complied_lzo/include/ # 这里需要提供编译好的lzo的include文件
[hadoop@hdp01 hadoop-lzo-master]$ export LIBRARY_PATH=/opt/module/lzo-2.06/complied_lzo/lib/
#4. 编译
[hadoop@hdp01 hadoop-lzo-master]$ mvn clean package -Dmaven.test.skip=true
结果出现 BUILD SUCCESS 的时候 说明成功!
#5. 查看文件并复制到对应位置
[hadoop@hdp01 hadoop-lzo-master]$ ll
total 80
-rw-rw-r-- 1 hadoop hadoop 35147 Oct 13 2017 COPYING
-rw-rw-r-- 1 hadoop hadoop 19753 Dec 6 17:18 pom.xml
-rw-rw-r-- 1 hadoop hadoop 10170 Oct 13 2017 README.md
drwxrwxr-x 2 hadoop hadoop 4096 Oct 13 2017 scripts
drwxrwxr-x 4 hadoop hadoop 4096 Oct 13 2017 src
drwxrwxr-x 10 hadoop hadoop 4096 Dec 6 17:21 target
#进入target/native/Linux-amd64-64 目录下执行如下命令
[hadoop@hdp01 hadoop-lzo-master]$ cd target/native/Linux-amd64-64
[hadoop@hdp01 Linux-amd64-64]$ tar -cBf - -C lib . | tar -xBvf - -C ~
./
./libgplcompression.so
./libgplcompression.so.0
./libgplcompression.la
./libgplcompression.a
./libgplcompression.so.0.0.0
[hadoop@hdp01 Linux-amd64-64]$ cp ~/libgplcompression* $HADOOP_HOME/lib/native/
#这里很重要,需要把hadoop-lzo-0.4.21-SNAPSHOT.jar 复制到hadoop中
[hadoop@hdp01 hadoop-lzo-master]$ cp target/hadoop-lzo-0.4.21-SNAPSHOT.jar $HADOOP_HOME/share/hadoop/common/
[hadoop@hdp01 hadoop-lzo-master]$ cp target/hadoop-lzo-0.4.21-SNAPSHOT.jar $HADOOP_HOME/share/hadoop/mapreduce/lib
#6. 最后别忘记将下面三个地址的文件复制到其他节点
$HADOOP_HOME/lib/native/libgplcompression*
$HADOOP_HOME/share/hadoop/mapreduce/lib/hadoop-lzo-0.4.21-SNAPSHOT.jar
$HADOOP_HOME/share/hadoop/common/hadoop-lzo-0.4.21-SNAPSHOT.jar
修改hadoop配置文件
#1. vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
增加 如下配置
export LD_LIBRARY_PATH=/opt/module/lzo-2.06/complied_lzo/lib
export HADOOP_CLASSPATH="<extra_entries>:$HADOOP_CLASSPATH:${HADOOP_HOME}/share/hadoop/common"
export JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native
#2. vim $HADOOP_HOME/etc/hadoop/core-site.xml
增加 如下配置
<property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec</value>
</property>
<property>
    <name>io.compression.codec.lzo.class</name>
    <value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
#3. 修改 vim $HADOOP_HOME/etc/hadoop/mapred-site.xml
增加 如下配置
<property>
    <name>mapred.compress.map.output</name>
    <value>true</value>
</property>
#4. 最后将以上配置文件复制到其他节点
重启hadoop集群后,使用hadoop-lzo
使用lzo压缩hive表文件
yum -y install lzop
# 1. 查看默认的压缩方式
set hive.exec.compress.output;
set mapreduce.output.fileoutputformat.compress.codec;
# 2. 自定义压缩方式
# 任务中间压缩
set hive.exec.compress.intermediate=true;
set hive.intermediate.compression.codec=com.hadoop.compression.lzo.LzoCodec;
set hive.intermediate.compression.type=BLOCK;
# map/reduce 输出压缩
set hive.exec.compress.output=true;
set mapred.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
set mapred.output.compression.type=BLOCK;
#3. 创建新表导入压缩数据
create table page_views_lzo as select * from page_views;
#4. 查看HDFS上 page_views_lzo 表数据
hdfs dfs -du -h /user/hive/warehouse/hivetest.db/page_views_lzo
hdfs dfs -ls /user/hive/warehouse/hivetest.db/page_views_lzo
使用lzo压缩普通文件并进行wc计算
5.2.1 lzo压缩文件并wc
#上传文件 1234.dat 到 Linux
[hadoop@hdp01 data]$ lzop 1234.dat # 生成新文件1234.dat.lzo
[hadoop@hdp01 data]$ hdfs dfs -put 1234.dat.lzo /testdata/
#wc计算
[hadoop@hdp01 data]$ cd /home/hadoop/app/hadoop/share/hadoop/mapreduce
[hadoop@hdp01 mapreduce]$ hadoop jar \
hadoop-mapreduce-examples-2.6.0-cdh5.7.0.jar wordcount \
/testdata/1234.dat.lzo \
/testdata/out01
这个过程中出现 **number of splits:1** ,说明 hadoop并没有给我的lzo文件切片
5.2.2 给lzo文件建立索引并wc
注意,这里要使用的lzo文件需要大于hdfs-site.xml文件中设置的默认块大小,我的默认块大小为256M,文件大小为2.2G,wc计算中split: 9
#创建索引文件, HDFS同目录下会生成文件 1234.dat.lzo.index
[hadoop@hdp01 mapreduce]$ hadoop jar \
$HADOOP_HOME/share/hadoop/mapreduce/lib/hadoop-lzo-0.4.21-SNAPSHOT.jar \
com.hadoop.compression.lzo.DistributedLzoIndexer \
/testdata/1234.dat.lzo
# wc计算
[hadoop@hdp01 mapreduce]$ hadoop jar \
hadoop-mapreduce-examples-2.6.0-cdh5.7.0.jar wordcount \
-Dmapreduce.job.inputformat.class=com.hadoop.mapreduce.LzoTextInputFormat \
/testdata/1234.dat.lzo \
/testdata/out02
这个过程中出现 **number of splits:9** ,说明 hadoop已经对我的lzo文件切片了。