Two small examples of the map and reduce function operations, developed in Eclipse and run on Hadoop. Input file requirement: each line may contain multiple fields, and the fields must be separated by single spaces.
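For example, a hypothetical four-field input file (name, gender, id, school; the values are invented purely for illustration) could look like the following. Note that Test_1 below reads fields 0, 2, and 3 and skips field 1:

Zhang male 20120901 Tsinghua
Li female 20120902 PKU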
Test_1:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Test_1 extends Configured implements Tool{
    /**
     * @Author XD 2014-8-15
     */
    enum Counter{
        LINESKIP,   // counts input lines that could not be parsed
    }

    public static class Map extends Mapper<LongWritable,Text,NullWritable,Text>{
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
            // read the source file; line holds one line of input
            String line = value.toString();
            try{
                // split the line into fields
                String[] lineSplit = line.split(" ");
                String name = lineSplit[0];
                String id = lineSplit[2];
                String school = lineSplit[3];
                Text out = new Text(name+' '+id+' '+school+' ');
                // the types written here must match the Mapper's declared output types above
                context.write(NullWritable.get(), out);
            }catch(java.lang.ArrayIndexOutOfBoundsException e){
                // the line had too few fields; count it and skip
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
        }
    }

    public int run(String[] args) throws Exception{
        // configure and initialize the job
        Configuration conf = getConf();
        Job job = new Job(conf,"Test_1");
        job.setJarByClass(Test_1.class);
        // set the input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // set the class that handles the map phase
        job.setMapperClass(Map.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // set the map output types
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        job.waitForCompletion(true);
        System.out.println("Job name: "+job.getJobName());
        System.out.println("Job successful: "+(job.isSuccessful()?"Yes":"No"));
        System.out.println("Lines skipped: "+job.getCounters().findCounter(Counter.LINESKIP).getValue());
        return job.isSuccessful()? 0:1;
    }

    public static void main(String[] args) {
        try {
            int result = ToolRunner.run(new Configuration(), new Test_1(), args);
            System.out.println(result);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
This example has no reduce function; it is just a simple map-only job.
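With the hypothetical input shown at the top, Test_1 should produce output like the following (the key is NullWritable, so TextOutputFormat writes only the value; each line also carries the trailing space the mapper appends):

Zhang 20120901 Tsinghua
Li 20120902 PKU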
Test_2:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Test_2 extends Configured implements Tool{
    /**
     * @Author XD 2014-8-15
     */
    enum Counter{
        LINESKIP,   // counts input lines that could not be parsed
    }

    public static class Map extends Mapper<LongWritable,Text,Text,Text>{
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException{
            // read the source file; line holds one line of input
            String line = value.toString();
            try{
                String[] lineSplit = line.split(" ");
                String anum = lineSplit[0];
                String bnum = lineSplit[1];
                // invert the pair: emit the second field as the key, the first as the value
                context.write(new Text(bnum), new Text(anum));
            }catch(java.lang.ArrayIndexOutOfBoundsException e){
                context.getCounter(Counter.LINESKIP).increment(1);
                return;
            }
        }
    }

    public static class Reduce extends Reducer<Text,Text,Text,Text>{
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
            String valueString;
            String out = "";
            // concatenate all values the map phase produced for this key
            for(Text value : values){
                valueString = value.toString();
                out += valueString+"|";
            }
            // reduce output
            context.write(key, new Text(out));
        }
    }

    public int run(String[] args) throws Exception{
        // initialize the job
        Configuration conf = getConf();
        Job job = new Job(conf,"Test_2");
        job.setJarByClass(Test_2.class);
        // input and output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // classes that handle the map and reduce phases
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.waitForCompletion(true);
        System.out.println("Job name: "+job.getJobName());
        System.out.println("Job successful: "+(job.isSuccessful()?"Yes":"No"));
        System.out.println("Lines skipped: "+job.getCounters().findCounter(Counter.LINESKIP).getValue());
        return job.isSuccessful()? 0:1;
    }

    public static void main(String[] args) {
        try {
            int result = ToolRunner.run(new Configuration(), new Test_2(), args);
            System.out.println(result);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
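To see what Test_2 computes, suppose a hypothetical input of caller/callee pairs, one pair per line (the numbers are invented for illustration):

13599999999 10086
13944444444 10086
13899999999 120

The job inverts each pair and groups by the second field, so the output collects every caller per callee, with the key and value tab-separated by TextOutputFormat (the order of values within a group is not guaranteed):

10086	13599999999|13944444444|
120	13899999999|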
This example pairs a map with a reduce. It is only a small test meant to verify the running environment, nothing more. Writing the programs can be done in Eclipse, which makes editing and debugging easy, but for running them I recommend submitting from the terminal, for example as sketched below.
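A minimal sketch of that terminal workflow, assuming the compiled class has been packaged into Test_1.jar (the jar name and HDFS paths below are placeholders, not from the original setup):

hadoop fs -put input.txt /user/xd/input/                        # upload the input file to HDFS
hadoop jar Test_1.jar Test_1 /user/xd/input /user/xd/output     # args[0] = input path, args[1] = output path
hadoop fs -cat /user/xd/output/part-r-00000                     # inspect the result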