[hadoop@hadoop004 hive-1.1.0-cdh5.7.0]$ which lzop
/bin/lzop
[hadoop@hadoop004 data]$ lzop -v page_views_big.dat
[hadoop@hadoop004 data]$ ls -lah
total 1.4G
drwxrwxr-x 2 hadoop hadoop 4.0K Apr 21 18:29 .
drwx------ 12 hadoop hadoop 4.0K Apr 22 01:14 ..
-rw-rw-r-- 1 hadoop hadoop 304 Apr 21 18:29 live.txt
-rw-r--r-- 1 root root 455M Apr 19 12:08 login.log
-rw-rw-r-- 1 hadoop hadoop 599M Apr 19 18:08 page_views_big.dat
-rw-rw-r-- 1 hadoop hadoop 285M Apr 19 18:08 page_views_big.dat.lzo
-rw-r--r-- 1 root root 19M Apr 18 20:47 page_views.dat
-rw-rw-r-- 1 hadoop hadoop 44 Apr 18 19:55 wc.txt
[hadoop@hadoop004 maven_repo]$ cd ~/software/
[hadoop@hadoop004 software]$ cd hadoop-lzo/
[hadoop@hadoop004 hadoop-lzo]$ mvn clean package -Dmaven.test.skip=true
[hadoop@hadoop004 target]$ ll
total 456
drwxrwxr-x 2 hadoop hadoop 4096 Apr 19 18:43 antrun
drwxrwxr-x 5 hadoop hadoop 4096 Apr 19 18:43 apidocs
drwxrwxr-x 5 hadoop hadoop 4096 Apr 19 18:43 classes
drwxrwxr-x 3 hadoop hadoop 4096 Apr 19 18:43 generated-sources
-rw-rw-r-- 1 hadoop hadoop 188970 Apr 19 18:43 hadoop-lzo-0.4.21-SNAPSHOT.jar
-rw-rw-r-- 1 hadoop hadoop 184565 Apr 19 18:43 hadoop-lzo-0.4.21-SNAPSHOT-javadoc.jar
-rw-rw-r-- 1 hadoop hadoop 52024 Apr 19 18:43 hadoop-lzo-0.4.21-SNAPSHOT-sources.jar
drwxrwxr-x 2 hadoop hadoop 4096 Apr 19 18:43 javadoc-bundle-options
drwxrwxr-x 2 hadoop hadoop 4096 Apr 19 18:43 maven-archiver
drwxrwxr-x 3 hadoop hadoop 4096 Apr 19 18:43 native
drwxrwxr-x 3 hadoop hadoop 4096 Apr 19 18:43 test-classes
[hadoop@hadoop004 target]$ cp hadoop-lzo-0.4.21-SNAPSHOT.jar ~/app/hadoop-2.6.0-cdh5.7.0/share/hadoop/common/
[hadoop@hadoop004 common]$ ll
total 5548
-rw-r--r-- 1 hadoop hadoop 3411839 Apr 10 01:41 hadoop-common-2.6.0-cdh5.7.0.jar
-rw-r--r-- 1 hadoop hadoop 1892451 Apr 10 01:41 hadoop-common-2.6.0-cdh5.7.0-tests.jar
-rw-rw-r-- 1 hadoop hadoop 188970 Apr 19 18:47 hadoop-lzo-0.4.21-SNAPSHOT.jar
-rw-r--r-- 1 hadoop hadoop 161018 Apr 10 01:41 hadoop-nfs-2.6.0-cdh5.7.0.jar
drwxr-xr-x 2 hadoop hadoop 4096 Apr 10 01:41 jdiff
drwxr-xr-x 2 hadoop hadoop 4096 Apr 10 01:41 lib
drwxr-xr-x 2 hadoop hadoop 4096 Apr 10 01:41 sources
drwxr-xr-x 2 hadoop hadoop 4096 Apr 10 01:41 templates
[hadoop@hadoop004 hadoop]$ vim core-site.xml
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,
org.apache.hadoop.io.compress.DefaultCodec,
org.apache.hadoop.io.compress.BZip2Codec,
org.apache.hadoop.io.compress.SnappyCodec,
com.hadoop.compression.lzo.LzoCodec,
com.hadoop.compression.lzo.LzopCodec
</value>
</property>
<property>
<name>io.compression.codec.lzo.class</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
hive> create table page_views_lzo(
> track_times string,
> url string,
> session_id string,
> referer string,
> ip string,
> end_user_id string,
> city_id string
> ) row format delimited fields terminated by '\t'
> STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
> OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
OK
Time taken: 0.199 seconds
hive> load data local inpath '/home/hadoop/data/page_views_big.dat.lzo' overwrite into table page_views_lzo;
Loading data to table default.page_views_lzo
Table default.page_views_lzo stats: [numFiles=1, numRows=0, totalSize=298200895, rawDataSize=0]
OK
Time taken: 4.064 seconds
[hadoop@hadoop004 data]$ hdfs dfs -ls /user/hive/warehouse/page_views_lzo
Found 1 items
-rwxr-xr-x 1 hadoop supergroup 298200895 2019-04-23 14:28 /user/hive/warehouse/page_views_lzo/page_views_big.dat.lzo
[hadoop@hadoop004 data]$ hdfs dfs -du -s -h /user/hive/warehouse/page_views_lzo
284.4 M 284.4 M /user/hive/warehouse/page_views_lzo
hive> select count(1) from page_views_lzo;
Query ID = hadoop_20190423142626_386a65de-1dad-4000-b223-15239ce16743
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1556000359234_0001, Tracking URL = http://hadoop004:8088/proxy/application_1556000359234_0001/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1556000359234_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2019-04-23 14:33:15,184 Stage-1 map = 0%, reduce = 0%
2019-04-23 14:33:26,643 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 7.06 sec
2019-04-23 14:33:32,982 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 8.51 sec
MapReduce Total cumulative CPU time: 8 seconds 510 msec
Ended Job = job_1556000359234_0001
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 8.51 sec HDFS Read: 298207931 HDFS Write: 8 SUCCESS
Total MapReduce CPU Time Spent: 8 seconds 510 msec
OK
3300000
Time taken: 28.124 seconds, Fetched: 1 row(s)
由倒数几行可以看出
Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 8.51 sec HDFS Read: 298207931 HDFS Write: 8 SUCCESS
这条SQL语句只有一个Map作业,但是page_views_big.dat.lzo这个文件是285M,按HDFS默认块大小128M计算至少占3个block,按理来说应该有3个split,因此这里说明了不添加索引的lzo文件默认不支持分片。
下面通过为lzo文件构建索引,使其支持分片
hive> SET hive.exec.compress.output;
hive.exec.compress.output=false
hive> SET hive.exec.compress.output=true;
hive> SET hive.exec.compress.output;
hive.exec.compress.output=true
hive> SET mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
hive> SET mapreduce.output.fileoutputformat.compress.codec;
mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec
hive> create table page_views_lzo_split
> STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
> OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
> as select * from page_views_lzo;
Query ID = hadoop_20190423142626_386a65de-1dad-4000-b223-15239ce16743
Total jobs = 3
Launching Job 1 out of 3
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_1556000359234_0002, Tracking URL = http://hadoop004:8088/proxy/application_1556000359234_0002/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1556000359234_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2019-04-23 14:42:08,062 Stage-1 map = 0%, reduce = 0%
2019-04-23 14:42:18,703 Stage-1 map = 36%, reduce = 0%, Cumulative CPU 7.42 sec
2019-04-23 14:42:21,813 Stage-1 map = 59%, reduce = 0%, Cumulative CPU 10.92 sec
2019-04-23 14:42:24,211 Stage-1 map = 81%, reduce = 0%, Cumulative CPU 14.05 sec
2019-04-23 14:42:26,738 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 16.69 sec
MapReduce Total cumulative CPU time: 16 seconds 690 msec
Ended Job = job_1556000359234_0002
Stage-4 is selected by condition resolver.
Stage-3 is filtered out by condition resolver.
Stage-5 is filtered out by condition resolver.
Moving data to: hdfs://hadoop004:9000/user/hive/warehouse/.hive-staging_hive_2019-04-23_14-42-01_301_8465660280055053580-1/-ext-10001
Moving data to: hdfs://hadoop004:9000/user/hive/warehouse/page_views_lzo_split
Table default.page_views_lzo_split stats: [numFiles=1, numRows=3300000, totalSize=296148323, rawDataSize=624194769]
MapReduce Jobs Launched:
Stage-Stage-1: Map: 1 Cumulative CPU: 16.69 sec HDFS Read: 298204253 HDFS Write: 296148419 SUCCESS
Total MapReduce CPU Time Spent: 16 seconds 690 msec
OK
Time taken: 27.738 seconds
[hadoop@hadoop004 data]$ hdfs dfs -du -s -h /user/hive/warehouse/page_views_lzo_split
282.4 M 282.4 M /user/hive/warehouse/page_views_lzo_split
构建LZO文件索引
[hadoop@hadoop004 data]$ hadoop jar ~/app/hadoop-2.6.0-cdh5.7.0/share/hadoop/common/hadoop-lzo-0.4.21-SNAPSHOT.jar com.hadoop.compression.lzo.LzoIndexer /user/hive/warehouse/page_views_lzo_split
19/04/23 14:47:58 INFO lzo.GPLNativeCodeLoader: Loaded native gpl library from the embedded binaries
19/04/23 14:47:58 INFO lzo.LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev f1deea9a313f4017dd5323cb8bbb3732c1aaccc5]
19/04/23 14:47:59 INFO lzo.LzoIndexer: LZO Indexing directory /user/hive/warehouse/page_views_lzo_split...
19/04/23 14:47:59 INFO lzo.LzoIndexer: [INDEX] LZO Indexing file hdfs://hadoop004:9000/user/hive/warehouse/page_views_lzo_split/000000_0.lzo, size 0.28 GB...
19/04/23 14:47:59 INFO Configuration.deprecation: hadoop.native.lib is deprecated. Instead, use io.native.lib.available
19/04/23 14:48:00 INFO lzo.LzoIndexer: Completed LZO Indexing in 0.72 seconds (393.90 MB/s). Index size is 19.97 KB.
[hadoop@hadoop004 data]$ hdfs dfs -ls /user/hive/warehouse/page_views_lzo_split
Found 2 items
-rwxr-xr-x 1 hadoop supergroup 296148323 2019-04-23 14:42 /user/hive/warehouse/page_views_lzo_split/000000_0.lzo
-rw-r--r-- 1 hadoop supergroup 20448 2019-04-23 14:48 /user/hive/warehouse/page_views_lzo_split/000000_0.lzo.index
hive> select count(1) from page_views_lzo_split;
Query ID = hadoop_20190423142626_386a65de-1dad-4000-b223-15239ce16743
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1556000359234_0003, Tracking URL = http://hadoop004:8088/proxy/application_1556000359234_0003/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1556000359234_0003
Hadoop job information for Stage-1: number of mappers: 3; number of reducers: 1
2019-04-23 14:49:57,100 Stage-1 map = 0%, reduce = 0%
2019-04-23 14:50:11,166 Stage-1 map = 33%, reduce = 0%, Cumulative CPU 2.27 sec
2019-04-23 14:50:12,201 Stage-1 map = 67%, reduce = 0%, Cumulative CPU 6.27 sec
2019-04-23 14:50:14,285 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 10.41 sec
2019-04-23 14:50:19,470 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 12.18 sec
MapReduce Total cumulative CPU time: 12 seconds 180 msec
Ended Job = job_1556000359234_0003
MapReduce Jobs Launched:
Stage-Stage-1: Map: 3 Reduce: 1 Cumulative CPU: 12.18 sec HDFS Read: 296399059 HDFS Write: 58 SUCCESS
Total MapReduce CPU Time Spent: 12 seconds 180 msec
OK
3300000
Time taken: 29.314 seconds, Fetched: 1 row(s)
由上面结果可以看到Map数为3,证明添加了索引的lzo文件支持分片