SequenceFileInputFormat can only process files of the SequenceFile type.
Code:
```java
package inputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

// Uses the SequenceFile generated earlier (by the for-loop program below) as input data;
// that file was written with the key/value types <LongWritable, Text>.
// SequenceFileInputFormat can only process SequenceFile data.
public class SequenceFileInputFormatTest {

    public static class MyMapper extends
            Mapper<LongWritable, Text, Text, LongWritable> {
        final Text k2 = new Text();
        final LongWritable v2 = new LongWritable();

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            final String line = value.toString();
            final String[] splited = line.split("\\s");
            for (String word : splited) {
                k2.set(word);
                v2.set(1);
                context.write(k2, v2);
            }
        }
    }

    public static class MyReducer extends
            Reducer<Text, LongWritable, Text, LongWritable> {
        final LongWritable v3 = new LongWritable();

        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable v2 : v2s) {
                count += v2.get();
            }
            v3.set(count);
            context.write(k2, v3);
        }
    }

    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Job job = Job.getInstance(conf, SequenceFileInputFormatTest.class.getSimpleName());
        // 1.1 input path
        FileInputFormat.setInputPaths(job, "hdfs://192.168.1.10:9000/sf1");
        // Changed here: TextInputFormat replaced with SequenceFileInputFormat.
        job.setInputFormatClass(SequenceFileInputFormat.class);
        // 1.2 map
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 1.3 partitioning: only one partition by default
        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(1);
        // 1.4 omitted
        // 1.5 omitted
        // 2.2 reduce
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 2.3 output path
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.10:9000/out1"));
        job.setOutputFormatClass(TextOutputFormat.class);
        // Must be called when the program is run from a packaged jar.
        job.setJarByClass(SequenceFileInputFormatTest.class);
        job.waitForCompletion(true);
    }
}
```
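Note that the Mapper's input types have to match the key/value classes the SequenceFile was written with (<LongWritable, Text> here), because SequenceFileInputFormat hands the deserialized Writables straight to the map function. If you would rather receive both key and value as Text regardless of how the file was written, Hadoop also ships SequenceFileAsTextInputFormat, which converts each record with toString(). A minimal sketch of only the parts that differ from the driver above (the class and method names here are illustrative, not from the original post):

```java
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;

public class SequenceFileAsTextDriverSketch {

    // With SequenceFileAsTextInputFormat, both key and value arrive as Text.
    public static class TextPairMapper extends Mapper<Text, Text, Text, Text> {
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // key and value are the toString() forms of the original Writables.
            context.write(key, value);
        }
    }

    // Only the driver lines that change relative to SequenceFileInputFormatTest.
    static void configure(Job job) {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapperClass(TextPairMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
    }
}
```

The conversion costs a toString() call per record, so when the key really is numeric, the typed <LongWritable, Text> form used above is the more direct choice.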
Generating a SequenceFile to serve as the input data for the SequenceFileInputFormat job above:
```java
package sequenceFile;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Demonstrates writing and then reading a SequenceFile in a for loop.
public class Forduxie {
    public static void main(String[] args) throws Exception {
        final Path path = new Path("/sf1");
        final Configuration conf = new Configuration();
        final FileSystem fs = FileSystem.get(new URI("hdfs://192.168.1.10:9000/"), conf);

        // Write ten <LongWritable, Text> records.
        @SuppressWarnings("deprecation")
        final SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                LongWritable.class, Text.class);
        for (int i = 0; i < 10; i++) {
            writer.append(new LongWritable(i), new Text(i + "=_="));
        }
        IOUtils.closeStream(writer);

        // Read the records back and print them.
        @SuppressWarnings("deprecation")
        final SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        final LongWritable key = new LongWritable();
        final Text val = new Text();
        while (reader.next(key, val)) {
            System.out.println(key.get() + "\t" + val.toString());
        }
        IOUtils.closeStream(reader);
    }
}
```
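The SequenceFile.Writer and SequenceFile.Reader constructors used above are deprecated in Hadoop 2.x in favour of an option-based API. A minimal sketch of the same write/read cycle with that API, assuming a Hadoop 2.x client on the classpath (the class name is hypothetical; path and record contents are kept from the program above):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ForduxieOptionApi {
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        // Assumption: the NameNode address from the post; adjust to your cluster.
        conf.set("fs.defaultFS", "hdfs://192.168.1.10:9000");
        final Path path = new Path("/sf1");

        // Write: key/value classes are passed as Writer options
        // instead of constructor arguments.
        SequenceFile.Writer writer = null;
        try {
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(LongWritable.class),
                    SequenceFile.Writer.valueClass(Text.class));
            for (int i = 0; i < 10; i++) {
                writer.append(new LongWritable(i), new Text(i + "=_="));
            }
        } finally {
            IOUtils.closeStream(writer);
        }

        // Read the records back and print them.
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
            final LongWritable key = new LongWritable();
            final Text val = new Text();
            while (reader.next(key, val)) {
                System.out.println(key.get() + "\t" + val);
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}
```

The generated file can also be inspected from the shell with `hadoop fs -text /sf1`, which decodes SequenceFiles into readable key/value lines.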
If the project is a Maven project, add the following to the pom.xml:
```xml
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
```
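The Hadoop classes used in the code above must also be on the compile classpath. A sketch of that dependency, assuming the Hadoop 2.x line (the version shown is a placeholder and should match your cluster):

```xml
<!-- Assumption: version is a placeholder; use the version of your cluster. -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
</dependency>
```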