Preparation: build and install lzo.
yum -y install lzo-devel zlib-devel gcc autoconf automake libtool
wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.06.tar.gz
tar -zxvf lzo-2.06.tar.gz
cd /root/lzo-2.06
./configure --enable-shared --prefix=/usr/local/hadoop/lzo/
make && make test && make install
Once the install finishes, copy /usr/local/hadoop/lzo/lib/* into /usr/lib/ and /usr/lib64/:
cp /usr/local/hadoop/lzo/lib/* /usr/lib/
cp /usr/local/hadoop/lzo/lib/* /usr/lib64/
Configure the environment variable (vi /etc/profile): export PATH=/usr/local/hadoop/lzo/:$PATH
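Before moving on, it is worth a quick check that the library and headers actually landed where expected. A minimal sketch, assuming the paths configured above; running ldconfig refreshes the loader cache so the copies placed in /usr/lib and /usr/lib64 are picked up:

# headers and shared objects produced by the lzo build
ls /usr/local/hadoop/lzo/include/lzo/lzo1x.h /usr/local/hadoop/lzo/lib/liblzo2.so*
# refresh the dynamic linker cache and confirm liblzo2 is visible
ldconfig
ldconfig -p | grep lzo2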
Next, download and unpack lzop:
wget http://www.lzop.org/download/lzop-1.03.tar.gz
tar -zxvf lzop-1.03.tar.gz
export C_INCLUDE_PATH=/usr/local/hadoop/lzo/include/
PS: if this variable is not set, configure will fail with:
configure: error: LZO header files not found. Please check your installation or set the environment variable `CPPFLAGS'.
Then build and install lzop (the install prefix below matches the /usr/local/hadoop/lzop path used in step (3)):
cd /root/lzop-1.03
./configure --prefix=/usr/local/hadoop/lzop
make && make install
(3) Link lzop into /usr/bin/:
ln -s /usr/local/hadoop/lzop/bin/lzop /usr/bin/lzop
(4) Test lzop:
lzop /root/data/access_20131219.log
If running lzop reports the error:
报错:lzop: error while loading shared libraries: liblzo2.so.2: cannot open shared object file: No such file or directory
Fix: add the environment variable export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64
A compressed file with the .lzo suffix will be generated: /root/data/access_20131219.log.lzo. If it appears, the preceding steps were done correctly.
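If you want to double-check lzop itself, a quick round trip over the same test file works well; this is only a sanity-check sketch, and the .check output name is arbitrary:

# verify integrity of the compressed file
lzop -t /root/data/access_20131219.log.lzo
# list contents and compression ratio
lzop -l /root/data/access_20131219.log.lzo
# decompress to stdout into a separate copy (lzop keeps the .lzo file)
lzop -dc /root/data/access_20131219.log.lzo > /root/data/access_20131219.log.check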
One more prerequisite, of course: Maven plus SVN or Git must already be set up (I use SVN). I won't cover that here; if you can't get these working, there is no point in continuing.
(5) Build hadoop-lzo. Here I use https://github.com/twitter/hadoop-lzo.
Check out the source, change into the project directory, and run the following in order:
mvn clean package -Dmaven.test.skip=true
tar -cBf - -C target/native/Linux-amd64-64/lib . | tar -xBvf - -C /usr/local/hadoop/lib/native/
cp target/hadoop-lzo-0.4.20-SNAPSHOT.jar /usr/local/hadoop/share/hadoop/common/
Next, sync /usr/local/hadoop/share/hadoop/common/hadoop-lzo-0.4.20-SNAPSHOT.jar and the /usr/local/hadoop/lib/native/ directory to every other Hadoop node. Note: make sure the user that runs Hadoop has execute permission on the libraries under /usr/local/hadoop/lib/native/.
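A minimal distribution sketch, assuming passwordless SSH and that hadoop02/hadoop03 are replaced with your own node names:

for host in hadoop02 hadoop03; do
  # jar containing the LZO codec classes
  rsync -av /usr/local/hadoop/share/hadoop/common/hadoop-lzo-0.4.20-SNAPSHOT.jar ${host}:/usr/local/hadoop/share/hadoop/common/
  # native libgplcompression libraries produced by the build
  rsync -av /usr/local/hadoop/lib/native/ ${host}:/usr/local/hadoop/lib/native/
done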
(6) Configure Hadoop
In core-site.xml:
<property>
<name>io.compression.codecs</name>
<value>
org.apache.hadoop.io.compress.GzipCodec,
org.apache.hadoop.io.compress.DefaultCodec,
org.apache.hadoop.io.compress.BZip2Codec,
org.apache.hadoop.io.compress.SnappyCodec,
com.hadoop.compression.lzo.LzoCodec,
com.hadoop.compression.lzo.LzopCodec
</value>
</property>
<property>
<name>io.compression.codec.lzo.class</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
In mapred-site.xml:
<property>
<name>mapreduce.map.output.compress.codec</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
<property>
<name>mapred.child.env</name>
<value>LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib</value>
</property>
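Note that mapreduce.map.output.compress.codec only selects the codec; intermediate map output is actually compressed only when mapreduce.map.output.compress is also true. If you prefer not to force that cluster-wide, it can be enabled per job via generic options, as in this sketch (the examples jar path and the input/output paths are placeholders):

hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar wordcount \
  -Dmapreduce.map.output.compress=true \
  -Dmapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzoCodec \
  /tmp/wc_in /tmp/wc_out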
In hadoop-env.sh:
export LD_LIBRARY_PATH=/usr/local/hadoop/lzo/lib
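After restarting the cluster with the new configuration, a simple way to confirm that the codec and native library are wired up is to put an .lzo file on HDFS and read it back with hadoop fs -text, which decompresses through the codecs registered in io.compression.codecs. A sketch using the test file from step (4); the HDFS paths are placeholders:

hadoop fs -mkdir -p /tmp/lzo_test
hadoop fs -put /root/data/access_20131219.log.lzo /tmp/lzo_test/
# readable plain text here means the LzopCodec and native library are being found
hadoop fs -text /tmp/lzo_test/access_20131219.log.lzo | head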
(7) Configure compression in Hive
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
SET mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec;
create table page_views_parquet_lzo ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
STORED AS PARQUET TBLPROPERTIES("parquet.compression"="lzo")
as select * from page_views;
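To sanity-check the new table, you can compare row counts and on-disk size against the source table from the shell; the warehouse path below is the Hive default and is only an assumption, adjust it to your own layout:

# row count should match page_views
hive -e "select count(1) from page_views_parquet_lzo;"
# compare storage footprint (assumed default warehouse location)
hadoop fs -du -h /user/hive/warehouse/page_views_parquet_lzo
hadoop fs -du -h /user/hive/warehouse/page_views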
(8) Use the LZO index
SET hive.exec.compress.output=true;
create table abc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS
INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
insert into table abc select * from page_views_lzo;
hadoop jar /usr/local/hadoop-2.6.0-cdh5.7.1/share/hadoop/common/hadoop-lzo-0.4.21-SNAPSHOT.jar com.hadoop.compression.lzo.LzoIndexer page_views_lzo/000000_0.lzo
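LzoIndexer indexes one file at a time; hadoop-lzo also ships com.hadoop.compression.lzo.DistributedLzoIndexer, which launches a MapReduce job to index every .lzo file under a directory. A sketch over the whole table directory (the warehouse path is an assumption, use your own):

hadoop jar /usr/local/hadoop-2.6.0-cdh5.7.1/share/hadoop/common/hadoop-lzo-0.4.21-SNAPSHOT.jar com.hadoop.compression.lzo.DistributedLzoIndexer /user/hive/warehouse/page_views_lzo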
The page_views_lzo table referenced above is created with the output codec switched to com.hadoop.compression.lzo.LzopCodec:
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
SET mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec;
create table page_views_lzo
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t"
STORED AS
INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
as select * from page_views;
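Once the table has data and has been indexed, each .lzo file should have a sibling .index file, and DeprecatedLzoTextInputFormat can split a large .lzo file across multiple map tasks. A quick check (warehouse path assumed, as above):

hadoop fs -ls /user/hive/warehouse/page_views_lzo
# with a large indexed file, this job should launch more than one map task
hive -e "select count(1) from page_views_lzo;"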
Reference blog: https://www.cnblogs.com/luxiaorui/p/3931024.html
[From @若泽大数据]