需求:将原始数据按近似比例采样,将数据分为训练集和测试集。训练集存放于指定输出目录的train目录下,测试集存放于指定输出目录的test目录下。
/**
 * Mapper that randomly splits input records between two base output paths,
 * {@code "train/"} and {@code "test/"}, using {@link MultipleOutputs}.
 *
 * <p>The split fraction is read in {@link #setup} from the job configuration
 * key {@code "ratio"}. Each record is routed independently, so the resulting
 * set sizes only approximate the requested ratio.
 */
class SampleMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    /** Configuration key holding the fraction of records routed to the train output. */
    private static final String RATIO_KEY = "ratio";
    /** Base output path handed to MultipleOutputs for records selected for training. */
    private static final String TRAIN_BASE_PATH = "train/";
    /** Base output path handed to MultipleOutputs for all remaining records. */
    private static final String TEST_BASE_PATH = "test/";

    private double ratio;
    private final Random random = new Random();
    MultipleOutputs<NullWritable, Text> multipleOutputs;

    /**
     * Reads and validates the sampling ratio and creates the MultipleOutputs writer.
     *
     * @param context task context supplying the configuration
     * @throws IllegalArgumentException if {@code "ratio"} is missing, is not a
     *         number, or lies outside the range [0, 1]
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String rawRatio = context.getConfiguration().get(RATIO_KEY);
        if (rawRatio == null) {
            // Fail with a descriptive message instead of a bare NullPointerException.
            throw new IllegalArgumentException(
                    "Missing required configuration value '" + RATIO_KEY + "'");
        }
        double parsedRatio = Double.parseDouble(rawRatio);
        // Negated form so that NaN is rejected as well.
        if (!(parsedRatio >= 0.0 && parsedRatio <= 1.0)) {
            throw new IllegalArgumentException(
                    "Configuration value '" + RATIO_KEY + "' must be within [0, 1] but was " + rawRatio);
        }
        ratio = parsedRatio;
        multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
    }

    /**
     * Writes the record unchanged (with a null key) to the train base path with
     * probability {@code ratio}, otherwise to the test base path.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // nextDouble() is uniform over [0.0, 1.0), so a strict '<' selects with
        // probability exactly 'ratio': 0 never selects, 1 always selects.
        if (random.nextDouble() < ratio) {
            multipleOutputs.write(NullWritable.get(), value, TRAIN_BASE_PATH);
        } else {
            multipleOutputs.write(NullWritable.get(), value, TEST_BASE_PATH);
        }
    }

    /** Closes the MultipleOutputs writer once the task has processed its input. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        multipleOutputs.close();
    }
}
/**
 * Configures and runs the map-only "Random Sample" job and waits for it to finish.
 *
 * <p>The ratio is stored under the configuration key {@code "ratio"} on
 * {@code config} before the {@link Job} is created; {@code SampleMapper} reads
 * it from there.
 *
 * @param config     configuration the job is built from; {@code "ratio"} is set on it
 * @param inputPath  input path of the raw data
 * @param outputPath output path of the job
 * @param ratio      sampling ratio, passed through as a string
 * @throws IOException if the job cannot be set up, a required class cannot be
 *         loaded, the wait is interrupted, or the job does not complete successfully
 */
public static void job(Configuration config, Path inputPath, Path outputPath, String ratio) throws IOException {
    config.set("ratio", ratio);
    Job job = Job.getInstance(config);
    job.setJobName("Random Sample");
    job.setJarByClass(Sampler.class);
    job.setMapperClass(SampleMapper.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    // Map-only job: no reduce phase.
    job.setNumReduceTasks(0);
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    MultipleOutputs.addNamedOutput(job, "train", TextOutputFormat.class, NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, "test", TextOutputFormat.class, NullWritable.class, Text.class);

    boolean succeeded;
    try {
        succeeded = job.waitForCompletion(true);
    } catch (ClassNotFoundException e) {
        // Surface the failure to the caller instead of only printing a stack trace.
        throw new IOException("Random Sample job could not load a required class", e);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up can observe it.
        Thread.currentThread().interrupt();
        throw new IOException("Interrupted while waiting for the Random Sample job to complete", e);
    }
    if (!succeeded) {
        // waitForCompletion reports failure through its return value, not an exception.
        throw new IOException("Random Sample job failed (input: " + inputPath + ", output: " + outputPath + ")");
    }
}
关键代码:
multipleOutputs.write(NullWritable.get(), value,"train/");
multipleOutputs.write(NullWritable.get(), value,"test/");
FileOutputFormat.setOutputPath(job, outputPath);
MultipleOutputs.addNamedOutput(job, "train", TextOutputFormat.class, NullWritable.class, Text.class);
MultipleOutputs.addNamedOutput(job, "test", TextOutputFormat.class, NullWritable.class, Text.class);
指定采样比例(ratio,即每条记录被划入训练集的概率,其余记录进入测试集)、输入路径和输出路径为:
hadoop.sampler.ratio = 0.2
hadoop.sampler.datainputpath = /lgh/data/input
hadoop.sampler.dataoutputpath = /lgh/sampleoutput
输出目录:
/lgh/sampleoutput/train
/lgh/sampleoutput/test