输入数据:
代码:
package inputformat;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
/*
* 处理的数据源是多个小文件
* 会把多个小文件合并处理,合并的大小如果小于128M,就当成一个InputSplit处理。
* 与SequenceFileInputFormat不同的是,SequenceFileInputFormat处理的数据源是合并好的SequencceFile类型的数据。
*/
public class CombineTextInputFormatTest {
public static class MyMapper extends
Mapper<LongWritable, Text, Text, LongWritable> {
final Text k2 = new Text();
final LongWritable v2 = new LongWritable();
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, LongWritable>.Context context)
throws InterruptedException, IOException {
final String line = value.toString();
final String[] splited = line.split("\\s");
for (String word : splited) {
k2.set(word);
v2.set(1);
context.write(k2, v2);
}
}
}
public static class MyReducer extends
Reducer<Text, LongWritable, Text, LongWritable> {
LongWritable v3 = new LongWritable();
protected void reduce(Text k2, Iterable<LongWritable> v2s,
Reducer<Text, LongWritable, Text, LongWritable>.Context context)
throws IOException, InterruptedException {
long count = 0L;
for (LongWritable v2 : v2s) {
count += v2.get();
}
v3.set(count);
context.write(k2, v3);
}
}
public static void main(String[] args) throws Exception {
final Configuration conf = new Configuration();
final Job job = Job.getInstance(conf, CombineTextInputFormatTest.class.getSimpleName());
// 1.1
FileInputFormat.setInputPaths(job,
"hdfs://192.168.1.10:9000/input");
//这里改了一下
job.setInputFormatClass(CombineTextInputFormat.class);
// 1.2
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
// 1.3 默认只有一个分区
job.setPartitionerClass(HashPartitioner.class);
job.setNumReduceTasks(1);
// 1.4省略不写
// 1.5省略不写
// 2.2
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
// 2.3
FileOutputFormat.setOutputPath(job, new Path(
"hdfs://192.168.1.10:9000/out2"));
job.setOutputFormatClass(TextOutputFormat.class);
// 执行打成jar包的程序时,必须调用下面的方法
job.setJarByClass(CombineTextInputFormatTest.class);
job.waitForCompletion(true);
}
}
console输出:
[root@i-love-you hadoop]# bin/hadoop jar data/ConbineText.jar
15/04/16 15:27:02 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
15/04/16 15:27:06 WARN mapreduce.JobSubmitter: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
15/04/16 15:27:07 INFO input.FileInputFormat: Total input paths to process : 2
15/04/16 15:27:07 INFO input.CombineFileInputFormat: DEBUG: Terminated node allocation with : CompletedNodes: 1, size left: 79
15/04/16 15:27:07 INFO mapreduce.JobSubmitter: number of splits:1
15/04/16 15:27:08 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1429167587909_0003
15/04/16 15:27:08 INFO impl.YarnClientImpl: Submitted application application_1429167587909_0003
15/04/16 15:27:08 INFO mapreduce.Job: The url to track the job: http://i-love-you:8088/proxy/application_1429167587909_0003/
15/04/16 15:27:08 INFO mapreduce.Job: Running job: job_1429167587909_0003
15/04/16 15:27:23 INFO mapreduce.Job: Job job_1429167587909_0003 running in uber mode : false
15/04/16 15:27:23 INFO mapreduce.Job: map 0% reduce 0%
15/04/16 15:27:39 INFO mapreduce.Job: map 100% reduce 0%
15/04/16 15:28:07 INFO mapreduce.Job: map 100% reduce 100%
15/04/16 15:28:17 INFO mapreduce.Job: Job job_1429167587909_0003 completed successfully
15/04/16 15:28:18 INFO mapreduce.Job: Counters: 49
File System Counters
FILE: Number of bytes read=215
FILE: Number of bytes written=212395
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=259
HDFS: Number of bytes written=38
HDFS: Number of read operations=7
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Launched reduce tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=14359
Total time spent by all reduces in occupied slots (ms)=22113
Total time spent by all map tasks (ms)=14359
Total time spent by all reduce tasks (ms)=22113
Total vcore-seconds taken by all map tasks=14359
Total vcore-seconds taken by all reduce tasks=22113
Total megabyte-seconds taken by all map tasks=14703616
Total megabyte-seconds taken by all reduce tasks=22643712
Map-Reduce Framework
Map input records=4
Map output records=13
Map output bytes=183
Map output materialized bytes=215
Input split bytes=180
Combine input records=0
Combine output records=0
Reduce input groups=5
Reduce shuffle bytes=215
Reduce input records=13
Reduce output records=5
Spilled Records=26
Shuffled Maps =1
Failed Shuffles=0
Merged Map outputs=1
GC time elapsed (ms)=209
CPU time spent (ms)=2860
Physical memory (bytes) snapshot=313401344
Virtual memory (bytes) snapshot=1687605248
Total committed heap usage (bytes)=136450048
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=38
计算结果:
[root@i-love-you hadoop]# bin/hdfs dfs -text /out2/part-*
hadoop 6
hello 2
java 3
me 1
struts 1
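可见两个小文件的数据被合并在一起处理了:日志中 Total input paths to process : 2 表示输入有两个文件,而 number of splits:1、Launched map tasks=1 表示它们被合并成了一个InputSplit,只启动了一个map任务。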