Word Count
Contents of input_file:
hadoop yarn
mapreduce hbase
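For reference, running the finished WordCount job below over this file should produce one line per distinct word. TextOutputFormat (the default output format) writes the key and value separated by a tab, and with a single reducer the keys come out sorted:
hadoop	1
hbase	1
mapreduce	1
yarn	1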
A template for writing MapReduce jobs
The skeleton below fixes everything the jobs in these notes share: a Mapper, a Reducer, and a Tool-based driver. The TODO markers are the only places that change from job to job.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * A reusable MapReduce job template: fill in the map() and reduce() bodies.
 */
public class ModuleMapReduce extends Configured implements Tool {
// 1: map class
/**
* public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
*/
// TODO
public static class ModuleMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// TODO
}
}
// 2: reduce class
// TODO
public static class ModuleReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
// TODO
}
}
// 3: driver class
public int run(String[] args) throws Exception {
// 1. get configuration
Configuration configuration = getConf();
// 2. create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
// locate the jar containing this class so the cluster can run it
job.setJarByClass(this.getClass());
// 3: set job
/**
* input -> map -> reduce -> output
*/
// 3.1 input
Path inPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inPath);
// 3.2 map
job.setMapperClass(ModuleMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 3.3 reduce
job.setReducerClass(ModuleReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 3.4 output
Path outPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outPath);
// 4. submit job
boolean isSuccess = job.waitForCompletion(true);
return isSuccess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
// new configuration
Configuration configuration = new Configuration();
int status = ToolRunner.run(configuration, new ModuleMapReduce(), args);
System.exit(status);
}
}
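A note on the driver: going through ToolRunner instead of calling run() directly means GenericOptionsParser strips Hadoop's standard switches before run() sees the arguments, so configuration can be overridden at submit time without recompiling, e.g. (assuming the class is packaged into module.jar):
hadoop jar module.jar ModuleMapReduce -D mapreduce.job.reduces=2 <input> <output>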
Example 1: WordCount built from the template
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * WordCount: counts how many times each word occurs in the input files.
 */
public class WordCount extends Configured implements Tool {
// 1: map class
/**
* public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
*/
public static class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
private Text mapOutputKey = new Text();
private final static IntWritable mapOutputValue = new IntWritable(1);
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// line value
String lineValue = value.toString();
// split
StringTokenizer stringTokenizer = new StringTokenizer(lineValue);
// iterator
while (stringTokenizer.hasMoreTokens()) {
// get word value
String wordval = stringTokenizer.nextToken();
// set value
mapOutputKey.set(wordval);
// output
context.write(mapOutputKey, mapOutputValue);
}
}
}
// 2: reduce class
public static class WordCountReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable outputValue = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
// sum tmp
int sum = 0;
// iterator
for(IntWritable value: values){
// total
sum += value.get();
}
// set value
outputValue.set(sum);
// output
context.write(key, outputValue);
}
}
// 3: driver class
public int run(String[] args) throws Exception {
// 1. get configuration
Configuration configuration = getConf();
// 2. create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
// locate the jar containing this class so the cluster can run it
job.setJarByClass(this.getClass());
// 3: set job
/**
* input -> map -> reduce -> output
*/
// 3.1 input
Path inPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inPath);
// 3.2 map
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 3.3 reduce
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 3.4 output
Path outPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outPath);
// 4. submit job
boolean isSuccess = job.waitForCompletion(true);
return isSuccess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
// new configuration
Configuration configuration = new Configuration();
//int status = new WordCount().run(args);
int status = ToolRunner.run(configuration, new WordCount(), args);
System.exit(status);
}
}
Example 2: a plain WordCount that does not use the template
Two differences from Example 1: run() builds its own Configuration instead of taking it from getConf(), and main() calls run() directly rather than going through ToolRunner, so Hadoop's generic command-line options (-D, -files, and so on) are not parsed.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
/**
 * WordCount again, written without the template.
 */
public class WordCount extends Configured implements Tool {
// 1: map class
/**
* public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
*/
public static class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
private Text mapOutputKey = new Text();
private final static IntWritable mapOutputValue = new IntWritable(1);
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// line value
String lineValue = value.toString();
// split
StringTokenizer stringTokenizer = new StringTokenizer(lineValue);
// iterator
while (stringTokenizer.hasMoreTokens()) {
// get word value
String wordval = stringTokenizer.nextToken();
// set value
mapOutputKey.set(wordval);
// output
context.write(mapOutputKey, mapOutputValue);
}
}
}
// 2: reduce class
public static class WordCountReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable outputValue = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
// sum tmp
int sum = 0;
// iterator
for(IntWritable value: values){
// total
sum += value.get();
}
// set value
outputValue.set(sum);
// output
context.write(key, outputValue);
}
}
// 3: driver class
public int run(String[] args) throws Exception {
// 1. get configuration
Configuration configuration = new Configuration();
// 2. create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
// locate the jar containing this class so the cluster can run it
job.setJarByClass(this.getClass());
// 3: set job
/**
* input -> map -> reduce -> output
*/
// 3.1 input
Path inPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inPath);
// 3.2 map
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 3.3 reduce
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 3.4 output
Path outPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outPath);
// 4. submit job
boolean isSuccess = job.waitForCompletion(true);
return isSuccess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
int status = new WordCount().run(args);
System.exit(status);
}
}
Template optimization: adding setup() and cleanup() hooks
setup() runs once per task before the first call to map()/reduce(), and cleanup() runs once after the last, which makes them the natural place for one-time initialization and teardown; a sketch of a typical use follows the listing.
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job template, extended with setup()/cleanup() hooks.
 */
public class ModuleMapReduce extends Configured implements Tool {
// 1: map class
/**
* public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
*/
// TODO
public static class ModuleMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void setup(Context context) throws IOException,
InterruptedException {
// Nothing
}
@Override
public void cleanup(Context context) throws IOException,
InterruptedException {
// Nothing
}
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// TODO
}
}
// 2: reduce class
// TODO
public static class ModuleReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
// Nothing
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
// Nothing
}
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
// TODO
}
}
// 3: driver class
public int run(String[] args) throws Exception {
// 1. get configuration
Configuration configuration = getConf();
// 2. create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
// locate the jar containing this class so the cluster can run it
job.setJarByClass(this.getClass());
// 3: set job
/**
* input -> map -> reduce -> output
*/
// 3.1 input
Path inPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inPath);
// 3.2 map
job.setMapperClass(ModuleMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// 3.3 reduce
job.setReducerClass(ModuleReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 3.4 output
Path outPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outPath);
// 4. submit job
boolean isSuccess = job.waitForCompletion(true);
return isSuccess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
// new configuration
Configuration configuration = new Configuration();
int status = ToolRunner.run(configuration, new ModuleMapReduce(), args);
System.exit(status);
}
}
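As an illustration of what the hooks buy you, here is a hypothetical mapper (not part of the original template; it reuses the imports above) that could stand in for ModuleMapper. It reads a case-folding switch from the job configuration once in setup() instead of on every map() call; the property name wordcount.case.insensitive is made up for this sketch.
public static class CaseFoldingMapper extends
        Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private Text word = new Text();
    private boolean caseInsensitive;
    @Override
    public void setup(Context context) {
        // read the switch once per task attempt;
        // "wordcount.case.insensitive" is a hypothetical property name
        caseInsensitive = context.getConfiguration()
                .getBoolean("wordcount.case.insensitive", false);
    }
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            word.set(caseInsensitive ? token.toLowerCase() : token);
            context.write(word, ONE);
        }
    }
}
Because the driver goes through ToolRunner, the switch can be flipped at submit time with -D wordcount.case.insensitive=true and picked up via getConf().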
Adding shuffle tuning to the template
Between map and reduce, the shuffle can be tuned at four points: how map output is partitioned across reducers, how keys are sorted, whether a combiner pre-aggregates on the map side, and how values are grouped for each reduce() call. On top of that, the driver below sets the number of reduce tasks, and compression of map output can be switched on in main().
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * MapReduce job template, extended with shuffle tuning hooks.
 */
public class ModuleMapReduce extends Configured implements Tool {
// 1: map class
/**
* public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
*/
// TODO
public static class ModuleMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void setup(Context context) throws IOException,
InterruptedException {
// Nothing
}
@Override
public void cleanup(Context context) throws IOException,
InterruptedException {
// Nothing
}
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// TODO
}
}
// 2: reduce class
// TODO
public static class ModuleReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
// Nothing
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
// Nothing
}
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
// TODO
}
}
// 3: driver class
public int run(String[] args) throws Exception {
// 1. get configuration
Configuration configuration = getConf();
// 2. create job
Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
// locate the jar containing this class so the cluster can run it
job.setJarByClass(this.getClass());
// 3: set job
/**
* input -> map -> reduce -> output
*/
// 3.1 input
Path inPath = new Path(args[0]);
FileInputFormat.addInputPath(job, inPath);
// 3.2 map
job.setMapperClass(ModuleMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// -----------------------------------------------------------------
// shuffle settings (all optional; a concrete example follows this listing)
// 1. partition: decides which reducer gets each key (default HashPartitioner)
// job.setPartitionerClass(cls);
// 2. sort: comparator used to order the map output keys
// job.setSortComparatorClass(cls);
// 3. combine: a map-side "mini reduce" that shrinks shuffle traffic
// job.setCombinerClass(cls);
// 4. group: comparator deciding which keys share one reduce() call
// job.setGroupingComparatorClass(cls);
// -----------------------------------------------------------------
// 3.3 reduce
job.setReducerClass(ModuleReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// set the number of reduce tasks (the default is 1)
job.setNumReduceTasks(2);
// 3.4 output
Path outPath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outPath);
// 4. submit job
boolean isSuccess = job.waitForCompletion(true);
return isSuccess ? 0 : 1;
}
public static void main(String[] args) throws Exception {
// new configuration
Configuration configuration = new Configuration();
// enable map-output compression (note the property names)
// configuration.set("mapreduce.map.output.compress", "true");
// configuration.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
int status = ToolRunner.run(configuration, new ModuleMapReduce(), args);
System.exit(status);
}
}
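To make the commented shuffle hooks concrete, a minimal sketch under two assumptions: the WordCountReducer from Example 1 is available (word counts are associative and commutative, so the reducer can double as the combiner), and FirstCharPartitioner is a made-up class for this illustration.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// hypothetical partitioner: routes each word by its first character,
// so each reducer receives a deterministic slice of the key space
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        if (key.getLength() == 0) {
            return 0;
        }
        // Text.charAt() returns the Unicode code point at that position
        return (key.charAt(0) & Integer.MAX_VALUE) % numReduceTasks;
    }
}
Wired up in run(), in place of the commented lines:
job.setPartitionerClass(FirstCharPartitioner.class);
job.setCombinerClass(WordCountReducer.class);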