Functionality: randomly generate 10,000 numbers, write them to the file VInput, and use MapReduce to find the maximum among them.
We need three things: a map function, a reduce function, and some code to run the job.
The Mapper class for the max-value example:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Every number is emitted under the same key so that a single reduce
    // group sees all values and can compute the global maximum.
    private final Text writeKey = new Text("K");
    private final LongWritable writeValue = new LongWritable(0);

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the input line on whitespace; each token should be one number.
        StringTokenizer tokenizer = new StringTokenizer(value.toString().trim());
        while (tokenizer.hasMoreTokens()) {
            String lineValue = tokenizer.nextToken().trim();
            if (lineValue.isEmpty()) {
                continue;
            }
            try {
                writeValue.set(Long.parseLong(lineValue));
                context.write(writeKey, writeValue);
            } catch (NumberFormatException e) {
                // Skip tokens that are not valid numbers.
            }
        }
    }
}
The Reducer for the max-value example:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    private final Text maxValueKey = new Text("maxValue");

    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Scan all values for this key and keep the largest one.
        long maxNum = Long.MIN_VALUE;
        for (LongWritable value : values) {
            if (value.get() > maxNum) {
                maxNum = value.get();
            }
        }
        context.write(maxValueKey, new LongWritable(maxNum));
    }
}
The driver application that finds the maximum among the 10,000 numbers:
import java.io.File;
import java.io.FileWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MaxValue {
    public static void main(String[] args) {
        FileWriter fw = null;
        try {
            String inpath = "testData/mapreduce/VInput";
            String outpath = "testData/mapreduce/VOutput";
            Path inputPath = new Path(inpath);
            Path outputPath = new Path(outpath);

            // Generate the input data: 10,000 random numbers, one per line.
            File file = new File(inpath);
            if (!file.getParentFile().exists()
                    && !file.getParentFile().mkdirs()) {
                throw new Exception("Failed to generate data, cannot create directory: "
                        + file.getParentFile().getAbsolutePath());
            }
            try {
                fw = new FileWriter(file);
                for (int i = 0; i < 10000; i++) {
                    long generateValue = (long) (Math.random() * 10000);
                    fw.write(generateValue + "\n");
                }
            } finally {
                if (fw != null) {
                    fw.flush();
                    fw.close();
                }
            }

            Configuration conf = new Configuration();
            Job job = new Job(conf, "getMaxValue");
            job.setJarByClass(MaxValue.class);

            // The job fails if the output directory already exists, so remove it first.
            FileSystem fs = FileSystem.getLocal(conf);
            if (fs.exists(outputPath) && !fs.delete(outputPath, true)) {
                System.err.println("Delete output directory " + outputPath + " failed!");
                return;
            }

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            // One reducer suffices for a single global maximum; with more,
            // the extra reducers would receive no data and write empty files.
            job.setNumReduceTasks(1);

            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, outputPath);

            long begin = System.currentTimeMillis();
            job.waitForCompletion(true);
            System.out.println("=========================================");
            System.out.println("Elapsed time: " + (System.currentTimeMillis() - begin));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Run result:
The 10,000 randomly generated numbers are written to the file VInput; the maximum value found is written to the generated file (part-r-00000) under the VOutput directory, in the format "maxValue XXXX", where XXXX is the maximum value found.
Tricky problems encountered:
① When MyMapper and MyReducer were written as inner classes of MaxValue, the job failed with an instantiation error.
Solution: pull MyMapper and MyReducer out as separate top-level classes, giving three classes in total. The underlying reason is that Hadoop creates Mapper and Reducer instances by reflection, and a non-static inner class cannot be instantiated without an enclosing MaxValue object; declaring them as static nested classes also works, as sketched below.
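A minimal sketch of the static-nested-class alternative (not part of the original report; the class name MaxValueNested is made up for illustration, and the method bodies are the same as MyMapper and MyReducer above):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class MaxValueNested {
    // Declared static, the nested classes carry no hidden reference to an
    // enclosing instance, so the framework can instantiate them by reflection.
    public static class InnerMapper
            extends Mapper<LongWritable, Text, Text, LongWritable> {
        // map() exactly as in MyMapper above
    }

    public static class InnerReducer
            extends Reducer<Text, LongWritable, Text, LongWritable> {
        // reduce() exactly as in MyReducer above
    }
}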
② Error: java.lang.NoClassDefFoundError.
Solution: add the corresponding jars under hadoop/lib to the project. (Adding only the few jars in the hadoop folder itself is not enough; the matching jars in the lib folder must be imported as well.)
Lessons learned:
① Both the map stage and the reduce stage take key/value pairs as input and output, and the programmer chooses their types. An example is sketched below.
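A small illustrative mapper (the class name EchoMapper is made up) showing where those type choices are declared: the four generic parameters of Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> fix the input and output key/value types.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Input types (LongWritable, Text) are supplied by the framework for text
// input: the byte offset of a line and the line itself. The output types
// (Text, LongWritable) are the programmer's choice.
public class EchoMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    public void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // Emit the line length under a constant key, just to exercise the types.
        context.write(new Text("len"), new LongWritable(line.getLength()));
    }
}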
② The output of the map function is first processed by the MapReduce framework before being sent to the reduce function; during this processing the framework sorts and groups the key/value pairs by key. This is also what makes a combiner possible, as sketched below.
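Because the framework already groups map output by key, a combiner can pre-aggregate values on the map side and shrink the shuffle. The job above does not use one; this is a hypothetical addition. Note that MyReducer itself is not safe as a combiner, because it replaces the incoming key with "maxValue", whereas combiner output is fed back into the shuffle and must keep the key unchanged:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Computes a local maximum per map task, so each task ships one value per
// key instead of thousands. Unlike MyReducer, it re-emits the incoming key.
public class MaxCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long max = Long.MIN_VALUE;
        for (LongWritable v : values) {
            max = Math.max(max, v.get());
        }
        context.write(key, new LongWritable(max));
    }
}

It would be registered in the driver with job.setCombinerClass(MaxCombiner.class); this works for max because taking a maximum is associative and commutative.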
③ A MapReduce job is the unit of work that the client submits: it consists of the input data, the MapReduce program, and configuration information. Hadoop runs the job by dividing it into small tasks, of which there are two types: map tasks and reduce tasks.
④ Two kinds of nodes control the job execution process: one jobtracker and a number of tasktrackers. The jobtracker coordinates all jobs running on the system by scheduling tasks to run on tasktrackers. Tasktrackers run the tasks and send progress reports to the jobtracker, which keeps a record of the overall progress of each job.
⑤ Hadoop divides the input data into fixed-size pieces called input splits, or simply splits. Hadoop creates one map task per split, and that task runs the user-defined map function over every record in the split. A sketch of how split sizes can be influenced follows.
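Split size can be tuned from the driver, assuming the new-API FileInputFormat helpers (the class name SplitConfig and the 16 MB value are illustrative; by default the split size usually equals the HDFS block size):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitConfig {
    // Caps each input split at 16 MB so a large input file yields several
    // map tasks instead of one per HDFS block.
    static void configureSplits(Job job) {
        FileInputFormat.setMinInputSplitSize(job, 1);
        FileInputFormat.setMaxInputSplitSize(job, 16L * 1024 * 1024);
    }
}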