SequenceFile使用随笔

最新推荐文章于 2021-08-10 10:03:22 发布

陈先生-HDU

最新推荐文章于 2021-08-10 10:03:22 发布

阅读量557

点赞数

分类专栏： hadoop 文章标签： hadoop

本文链接：https://blog.csdn.net/qq_20049243/article/details/49384913

版权

hadoop 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

hadoop不适合小文件，所以需要对小文件进行额外处理，常使用SequenceFile，下面是刚刚使用SequenceFile的感悟。

1.创建sequenceFile

/**
 * Packs every file matched by a glob into one SequenceFile (key = file name,
 * value = file content), then reads the SequenceFile back and prints each
 * record as {@code name<TAB>content}.
 *
 * @param args {@code args[0]} file-system URI, {@code args[1]} input path or
 *             glob of the small files, {@code args[2]} output SequenceFile path
 * @throws Exception if the file system cannot be reached or any read/write fails
 */
public static void main(String[] args) throws Exception {
	if (args.length < 3) {
		System.err.println("Usage: <fs-uri> <input-glob> <output-sequence-file>");
		System.exit(2);
	}
	final Configuration conf = new Configuration();
	final FileSystem fs = FileSystem.get(new URI(args[0]), conf);
	// listStatus() would also work here, but globStatus() accepts patterns.
	FileStatus[] fss = fs.globStatus(new Path(args[1]));
	if (fss == null) {
		// No match for a non-glob path: treat as "nothing to pack".
		fss = new FileStatus[0];
	}
	final Path target = new Path(args[2]);
	final Text key = new Text();
	final Text value = new Text();

	@SuppressWarnings("deprecation")
	final Writer writer = new SequenceFile.Writer(fs, conf, target,
			Text.class, Text.class);
	try {
		for (FileStatus fileStatus : fss) {
			if (fileStatus.isDirectory()) {
				continue; // only regular files can be opened and packed
			}
			final long len = fileStatus.getLen();
			if (len > Integer.MAX_VALUE) {
				throw new IllegalArgumentException("File too large to buffer: "
						+ fileStatus.getPath() + " (" + len + " bytes)");
			}
			final byte[] b = new byte[(int) len];
			final FSDataInputStream in = fs.open(fileStatus.getPath());
			try {
				in.readFully(b);
			} finally {
				// Close each input stream; the original leaked one per file.
				IOUtils.closeStream(in);
			}
			key.set(fileStatus.getPath().getName());
			// Store the raw bytes directly instead of decoding them with the
			// platform default charset and re-encoding.
			value.set(b);
			writer.append(key, value);
		}
	} finally {
		IOUtils.closeStream(writer);
	}

	@SuppressWarnings("deprecation")
	final SequenceFile.Reader reader = new SequenceFile.Reader(fs, target, conf);
	try {
		while (reader.next(key, value)) {
			System.out.println(key.toString() + "\t" + value.toString());
		}
	} finally {
		IOUtils.closeStream(reader);
	}
}

创建sf(SequenceFile)没什么好说的，上面实现了传入一个文件夹路径，然后遍历文件夹中的所有小文件。实现结果如下图

hdfs中有个smallFiles文件夹专门存放小文件。运行之前的代码，创建一个sf文件，并且读取内容。

2.使用sf文件作为MR的输入

原始的wordCount代码：

package mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Word-count MapReduce job: the mapper emits {@code (word, 1)} for every
 * whitespace-separated token of each input value, and the reducer sums the
 * counts per word.
 *
 * <p>No input format is set, so the job uses the framework default; the mapper
 * therefore declares {@code LongWritable} keys and {@code Text} values.
 */
public class WordCountApp {
	/**
	 * Configures and runs the job, exiting with 0 on success and 1 on failure.
	 *
	 * @param args {@code args[0]} input path(s), {@code args[1]} output directory
	 * @throws Exception if job setup or submission fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: WordCountApp <input-path> <output-path>");
			System.exit(2);
		}
		Job job = Job.getInstance(new Configuration(),
				WordCountApp.class.getSimpleName());

		job.setJarByClass(WordCountApp.class);

		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, args[0]);
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job failure to the caller instead of always exiting 0.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/** Emits {@code (word, 1)} for each non-empty token of the input value. */
	private static class MyMapper extends
			Mapper<LongWritable, Text, Text, IntWritable> {

		// Reused across calls to avoid allocating one object per token.
		private final Text k2 = new Text();
		private final IntWritable v2 = new IntWritable(1);

		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			// Split on any whitespace run so repeated spaces or tabs do not
			// produce empty "words".
			for (String word : value.toString().split("\\s+")) {
				if (word.isEmpty()) {
					continue; // leading whitespace or a blank line
				}
				k2.set(word);
				context.write(k2, v2);
			}
		}
	}

	/** Sums all counts received for one word and writes {@code (word, total)}. */
	private static class MyReducer extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		private final IntWritable v3 = new IntWritable();

		@Override
		protected void reduce(Text k2, Iterable<IntWritable> v2s,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable v2 : v2s) {
				sum += v2.get();
			}
			v3.set(sum);
			context.write(k2, v3);
		}
	}

}

默认的输入格式是TextInputFormat，但是这里使用sf文件的话，就需要指定输入格式为SequenceFileInputFormat，直接运行wordCount代码会有问题，如下图：

这里说Text不能转化成LongWritable，整个代码唯一出现LongWritable类型是在k1位置，说明SequenceFileInputFormat的使用没有想象中的简单，不是直接修改输入格式就OK了，到底如何使用SequenceFileInputFormat，就必须进入源码学习了。研究过MR源码的同学就知道，肯定是解析阶段出现问题，也就是createRecordReader()方法的问题，进入SequenceFileRecordReader类中的nextKeyValue()方法。

/**
 * Advances to the next record and loads its key and value into the
 * {@code key} / {@code value} fields.
 *
 * <p>Returns {@code false} immediately once {@code more} has been cleared.
 * Otherwise the reader position is captured before the read, and the record
 * is rejected (fields nulled, {@code more} cleared) when no key was returned
 * or when that pre-read position is at or beyond {@code end} and
 * {@code in.syncSeen()} reports true.
 *
 * @return {@code true} if a record was loaded, {@code false} if there are no
 *         more records
 * @throws IOException declared by the signature (presumably raised by the underlying reader)
 * @throws InterruptedException declared by the signature; not thrown by any
 *         statement visible here
 */
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!more) {
      return false;
    }
    // Position BEFORE reading, used below for the end-of-range check.
    long pos = in.getPosition();
    key = (K) in.next(key);// key point: the key is whatever the underlying reader yields for this record
    // NOTE(review): syncSeen() presumably means a sync marker was crossed by
    // the read above — confirm against the reader's documentation.
    if (key == null || (pos >= end && in.syncSeen())) {
      more = false;
      key = null;
      value = null;
    } else {
      value = (V) in.getCurrentValue(value);
    }
    return more;
  }

着重关注key的变化，刚看到的时候我表示震惊了！（固定的思维害死人）完全与之前学习的简单的MR南辕北辙，之前说过简单的MR中k1的类型默认是LongWritable，是每行文本的偏移量，v1类型是Text，是每行的文本内容。而这里完全不一样，这里的key就是sf文件的key，也就是文件名，而value是sf文件的value，也就是整个小文件的文件内容。这怎么办，完全混乱了，先一步步改吧，首先把之前的异常解决掉，定义MyMapper时，把k1设置为Text类型，然后运行看看。

虽然可以执行，但是结果完全不对，问题应该出现在map函数上面。再回去看源码，之前写的代码完全是参照v1是每行的文本内容去写的map函数，但是这里v1变成了整个小文件的文件内容，那我在map中添加一步把v1按照行分割的操作，问题不就解决了吗？试试看，完整的代码如下：

package mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Word-count MapReduce job that reads its input through
 * {@code SequenceFileInputFormat}. The mapper is declared with {@code Text}
 * keys and {@code Text} values; each value is split into lines and then into
 * words, and the reducer sums the counts per word.
 */
public class WordCountApp2 {
	/**
	 * Configures and runs the job, exiting with 0 on success and 1 on failure.
	 *
	 * @param args {@code args[0]} input SequenceFile path(s), {@code args[1]}
	 *             output directory
	 * @throws Exception if job setup or submission fails
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			System.err.println("Usage: WordCountApp2 <input-path> <output-path>");
			System.exit(2);
		}
		Job job = Job.getInstance(new Configuration(),
				WordCountApp2.class.getSimpleName());
		job.setInputFormatClass(SequenceFileInputFormat.class);
		job.setJarByClass(WordCountApp2.class);

		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);

		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		FileInputFormat.setInputPaths(job, args[0]);
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Propagate job failure to the caller instead of always exiting 0.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

	/**
	 * Emits {@code (word, 1)} for each non-empty token found in the value,
	 * which may span multiple lines.
	 */
	private static class MyMapper extends
			Mapper<Text, Text, Text, IntWritable> {

		// Reused across calls to avoid allocating one object per token.
		private final Text k2 = new Text();
		private final IntWritable v2 = new IntWritable(1);

		@Override
		protected void map(Text key, Text value,
				Mapper<Text, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			// The changed part: split the value into lines first. "\r?\n"
			// also strips the carriage return of CRLF-terminated files.
			String[] lines = value.toString().split("\\r?\\n");
			for (String line : lines) {
				// Split on any whitespace run so repeated spaces or tabs do
				// not produce empty "words".
				for (String word : line.split("\\s+")) {
					if (word.isEmpty()) {
						continue; // blank line or leading whitespace
					}
					k2.set(word);
					context.write(k2, v2);
				}
			}
		}
	}

	/** Sums all counts received for one word and writes {@code (word, total)}. */
	private static class MyReducer extends
			Reducer<Text, IntWritable, Text, IntWritable> {
		private final IntWritable v3 = new IntWritable();

		@Override
		protected void reduce(Text k2, Iterable<IntWritable> v2s,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable v2 : v2s) {
				sum += v2.get();
			}
			v3.set(sum);
			context.write(k2, v3);
		}
	}

}

结果如下：

OK，搞定。

通过这次小问题，发现了一些hadoop学习与以前java学习的区别。之前弄java的时候，教材参考书，网上的demo特别多，也就习惯了碰见问题去百度，从来没有看过源码，但是hadoop不一样，比较新，网上内容很少，一个简单的功能都需要自己去理解去看源码，感觉很好玩，很有挑战，希望后面越来越好。

陈先生-HDU

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
SequenceFile使用随笔

hadoop不适合小文件，所以需要对小文件进行额外处理，常使用SequenceFile，下面是刚刚使用SequenceFile的感悟。1.创建sequenceFilepublic static void main(String[] args) throws Exception { final Configuration conf = new Configuration(); fin
复制链接

扫一扫