Storing a large number of small files on HDFS is very expensive in terms of NameNode memory: the NameNode keeps a metadata entry for every file in memory and must load all of this metadata when it starts up, so the more files there are, the larger the NameNode's overhead becomes. (A commonly cited rule of thumb is that each file or block object costs on the order of 150 bytes of NameNode heap, so tens of millions of tiny files add up quickly.)
One way around this is to compress the small files before uploading them to HDFS: a single compressed file needs only one metadata entry, which greatly reduces the NameNode's memory footprint. For MapReduce, Hadoop ships with several built-in compression formats, such as DEFLATE (DefaultCodec), gzip (GzipCodec) and bzip2 (BZip2Codec); Snappy and LZO can also be used when the corresponding native libraries are installed.
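As a side note, Hadoop picks the codec for an input file from its filename extension, which is why a .gz input is decompressed transparently by the framework. A minimal sketch using CompressionCodecFactory (the path is hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookup {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // hypothetical path; ".gz" resolves to GzipCodec, ".bz2" to BZip2Codec, etc.
        CompressionCodec codec = factory.getCodec(new Path("/data/sample/data_10m.gz"));
        System.out.println(codec == null ? "no codec (plain text)" : codec.getClass().getName());
    }
}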
Running MapReduce over compressed files adds the cost of decompression, which is worth weighing in particular application scenarios. For the massive-small-files scenario, however, what we buy by compressing the small files is data locality.
Suppose hundreds or thousands of small files compress down to a single block: that block necessarily lives on one DataNode, and because gzip is not splittable the whole file is fed to the job as a single InputSplit, so the map task runs on local data with no cross-network transfer of input. If the small files were instead uploaded to HDFS directly, hundreds or thousands of tiny blocks would be scattered across different DataNodes, and the job might have to "move data" before it could compute. With only a few files the network cost is hardly noticeable next to the NameNode memory overhead, but once the number of small files reaches a certain scale it becomes very obvious.
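To make the locality argument concrete, one can ask HDFS where a file's blocks actually live. A minimal sketch using the standard FileSystem API (the path matches one of the files uploaded later in this section):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ShowBlockLocations {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FileStatus status = fs.getFileStatus(
                new Path("/user/xiaoxiang/datasets/gzipfiles/data_50000_1.gz"));
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation b : blocks) {
            // a small compressed file usually means one block, replicated on a few DataNodes
            System.out.println(b.getOffset() + " len=" + b.getLength()
                    + " hosts=" + java.util.Arrays.toString(b.getHosts()));
        }
    }
}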
Below we compress the small files with gzip, upload them to HDFS, and run a MapReduce program over them.
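One possible way to do the merge-and-compress step is to stream the local small files into a single gzip-compressed HDFS file. The following is a hedged sketch assuming newline-terminated text files; the local directory name and target path are illustrative, not the exact procedure used to produce the dataset below:

import java.io.File;
import java.io.FileInputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class MergeSmallFilesToGzip {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // write all local small files into one gzip-compressed HDFS file
        Path target = new Path("/user/xiaoxiang/datasets/gzipfiles/merged.gz"); // illustrative target
        OutputStream out = codec.createOutputStream(fs.create(target));
        try {
            for (File f : new File("local-small-files").listFiles()) { // illustrative local directory
                FileInputStream in = new FileInputStream(f);
                try {
                    IOUtils.copyBytes(in, out, conf, false); // false: keep the target stream open
                } finally {
                    in.close();
                }
            }
        } finally {
            out.close();
        }
    }
}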
A single class implements both the map and reduce tasks, as shown below:
package org.shirdrn.kodz.inaction.hadoop.smallfiles.compression;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class GzipFilesMaxCostComputation {

    public static class GzipFilesMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

        private final static LongWritable costValue = new LongWritable(0);
        private Text code = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // each input line holds 4 whitespace-separated fields: the country code is
            // the first field and the cost value is the last one
            String line = value.toString();
            String[] array = line.split("\\s");
            if (array.length == 4) {
                String countryCode = array[0];
                String strCost = array[3];
                long cost = 0L;
                try {
                    cost = Long.parseLong(strCost);
                } catch (NumberFormatException e) {
                    cost = 0L;
                }
                if (cost != 0) {
                    code.set(countryCode);
                    costValue.set(cost);
                    context.write(code, costValue);
                }
            }
        }
    }

    public static class GzipFilesReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // emit the maximum cost seen for this key
            long max = 0L;
            Iterator<LongWritable> iter = values.iterator();
            while (iter.hasNext()) {
                LongWritable current = iter.next();
                if (current.get() > max) {
                    max = current.get();
                }
            }
            context.write(key, new LongWritable(max));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: gzipmaxcost <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "gzip maxcost");

        // compress the final job output with gzip
        job.getConfiguration().setBoolean("mapred.output.compress", true);
        job.getConfiguration().setClass("mapred.output.compression.codec", GzipCodec.class, CompressionCodec.class);

        job.setJarByClass(GzipFilesMaxCostComputation.class);
        job.setMapperClass(GzipFilesMapper.class);
        job.setCombinerClass(GzipFilesReducer.class);
        job.setReducerClass(GzipFilesReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        int exitFlag = job.waitForCompletion(true) ? 0 : 1;
        System.exit(exitFlag);
    }
}
The program above just computes a maximum per key, so the implementation is simple; the only difference is that the input files are gzip-compressed. Note also that the Job configuration in the code enables gzip compression of the job output; if a large amount of map output has to be copied to the reducers, compressing that intermediate data is worth considering as well.
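The properties set in the code above compress the final job output. Compressing the intermediate map output shuffled to the reducers is controlled separately; a minimal sketch using the Hadoop 1.x property names (verify them against your Hadoop version):

// compress intermediate map output (the data shuffled to reducers); Hadoop 1.x property names
job.getConfiguration().setBoolean("mapred.compress.map.output", true);
job.getConfiguration().setClass("mapred.map.output.compression.codec",
        GzipCodec.class, CompressionCodec.class);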
The run of the program looked as follows:
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ du -sh ../dataset/gzipfiles/*
147M    ../dataset/gzipfiles/data_10m.gz
43M     ../dataset/gzipfiles/data_50000_1.gz
16M     ../dataset/gzipfiles/data_50000_2.gz
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ bin/hadoop fs -mkdir /user/xiaoxiang/datasets/gzipfiles
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ bin/hadoop fs -copyFromLocal ../dataset/gzipfiles/* /user/xiaoxiang/datasets/gzipfiles
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ bin/hadoop fs -ls /user/xiaoxiang/datasets/gzipfiles
Found 3 items
-rw-r--r--   3 xiaoxiang supergroup  153719349 2013-03-24 12:56 /user/xiaoxiang/datasets/gzipfiles/data_10m.gz
-rw-r--r--   3 xiaoxiang supergroup   44476101 2013-03-24 12:56 /user/xiaoxiang/datasets/gzipfiles/data_50000_1.gz
-rw-r--r--   3 xiaoxiang supergroup   15935178 2013-03-24 12:56 /user/xiaoxiang/datasets/gzipfiles/data_50000_2.gz
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ bin/hadoop jar gzip-compression.jar org.shirdrn.kodz.inaction.hadoop.smallfiles.compression.GzipFilesMaxCostComputation /user/xiaoxiang/datasets/gzipfiles /user/xiaoxiang/output/smallfiles/gzip
13/03/24 13:06:28 INFO input.FileInputFormat: Total input paths to process : 3
13/03/24 13:06:28 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/03/24 13:06:28 WARN snappy.LoadSnappy: Snappy native library not loaded
13/03/24 13:06:28 INFO mapred.JobClient: Running job: job_201303111631_0039
13/03/24 13:06:29 INFO mapred.JobClient:  map 0% reduce 0%
13/03/24 13:06:55 INFO mapred.JobClient:  map 33% reduce 0%
13/03/24 13:07:04 INFO mapred.JobClient:  map 66% reduce 11%
13/03/24 13:07:13 INFO mapred.JobClient:  map 66% reduce 22%
13/03/24 13:07:25 INFO mapred.JobClient:  map 100% reduce 22%
13/03/24 13:07:31 INFO mapred.JobClient:  map 100% reduce 100%
13/03/24 13:07:36 INFO mapred.JobClient: Job complete: job_201303111631_0039
13/03/24 13:07:36 INFO mapred.JobClient: Counters: 29
13/03/24 13:07:36 INFO mapred.JobClient:   Job Counters
13/03/24 13:07:36 INFO mapred.JobClient:     Launched reduce tasks=1
13/03/24 13:07:36 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=78231
13/03/24 13:07:36 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/03/24 13:07:36 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/03/24 13:07:36 INFO mapred.JobClient:     Launched map tasks=3
13/03/24 13:07:36 INFO mapred.JobClient:     Data-local map tasks=3
13/03/24 13:07:36 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=34413
13/03/24 13:07:36 INFO mapred.JobClient:   File Output Format Counters
13/03/24 13:07:36 INFO mapred.JobClient:     Bytes Written=1337
13/03/24 13:07:36 INFO mapred.JobClient:   FileSystemCounters
13/03/24 13:07:36 INFO mapred.JobClient:     FILE_BYTES_READ=288127
13/03/24 13:07:36 INFO mapred.JobClient:     HDFS_BYTES_READ=214131026
13/03/24 13:07:36 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=385721
13/03/24 13:07:36 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=1337
13/03/24 13:07:36 INFO mapred.JobClient:   File Input Format Counters
13/03/24 13:07:36 INFO mapred.JobClient:     Bytes Read=214130628
13/03/24 13:07:36 INFO mapred.JobClient:   Map-Reduce Framework
13/03/24 13:07:36 INFO mapred.JobClient:     Map output materialized bytes=9105
13/03/24 13:07:36 INFO mapred.JobClient:     Map input records=14080003
13/03/24 13:07:36 INFO mapred.JobClient:     Reduce shuffle bytes=6070
13/03/24 13:07:36 INFO mapred.JobClient:     Spilled Records=22834
13/03/24 13:07:36 INFO mapred.JobClient:     Map output bytes=154878493
13/03/24 13:07:36 INFO mapred.JobClient:     CPU time spent (ms)=90200
13/03/24 13:07:36 INFO mapred.JobClient:     Total committed heap usage (bytes)=688193536
13/03/24 13:07:36 INFO mapred.JobClient:     Combine input records=14092911
13/03/24 13:07:36 INFO mapred.JobClient:     SPLIT_RAW_BYTES=398
13/03/24 13:07:36 INFO mapred.JobClient:     Reduce input records=699
13/03/24 13:07:36 INFO mapred.JobClient:     Reduce input groups=233
13/03/24 13:07:36 INFO mapred.JobClient:     Combine output records=13747
13/03/24 13:07:36 INFO mapred.JobClient:     Physical memory (bytes) snapshot=765448192
13/03/24 13:07:36 INFO mapred.JobClient:     Reduce output records=233
13/03/24 13:07:36 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=2211237888
13/03/24 13:07:36 INFO mapred.JobClient:     Map output records=14079863
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ bin/hadoop fs -ls /user/xiaoxiang/output/smallfiles/gzip
Found 3 items
-rw-r--r--   3 xiaoxiang supergroup          0 2013-03-24 13:07 /user/xiaoxiang/output/smallfiles/gzip/_SUCCESS
drwxr-xr-x   - xiaoxiang supergroup          0 2013-03-24 13:06 /user/xiaoxiang/output/smallfiles/gzip/_logs
-rw-r--r--   3 xiaoxiang supergroup       1337 2013-03-24 13:07 /user/xiaoxiang/output/smallfiles/gzip/part-r-00000.gz
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ bin/hadoop fs -copyToLocal /user/xiaoxiang/output/smallfiles/gzip/part-r-00000.gz ./
xiaoxiang@ubuntu3:/opt/stone/cloud/hadoop-1.0.3$ gunzip -c ./part-r-00000.gz