Hadoop MapReduce
首先上例子
补充:数据来自天气数据集(NCDC 气象记录)
这是一个气象数据:
将源数据map处理后:
MapReduce整体流程如下:
Mapper和Reducer类
MaxTemperatureMapper
package cn.tju.hadoop;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that extracts (year, air temperature) pairs from fixed-width
 * NCDC weather records.
 *
 * <p>Input: (byte offset, record line). Output: (year, temperature in
 * tenths of a degree) for readings that are present and pass the
 * quality-code check.
 *
 * @author WWK
 */
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
// Sentinel value the NCDC format uses for an unreadable temperature.
private static final int MISSING = 9999;
@Override
// Parses one record and emits (year, temperature) when the reading is valid.
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Work on the record as a plain String.
String record = value.toString();
// Year occupies columns 15..18 of the fixed-width record.
String year = record.substring(15, 19);
// Integer.parseInt rejects a leading '+', so skip that character when present.
int start = (record.charAt(87) == '+') ? 88 : 87;
int temperature = Integer.parseInt(record.substring(start, 92));
// Single-character quality code immediately follows the temperature field.
String qualityCode = record.substring(92, 93);
// Emit only readings that are present and whose quality code is acceptable.
// Key/value types must match the Mapper's generic declaration.
if (temperature != MISSING && qualityCode.matches("[01459]")) {
context.write(new Text(year), new IntWritable(temperature));
}
}
}
MaxTemperatureReducer
package cn.tju.hadoop;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer that folds all temperatures observed for a year down to the
 * single maximum value.
 *
 * <p>Input: (year, temperatures). Output: (year, max temperature).
 *
 * @author WWK
 */
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text,IntWritable> {
@Override
protected void reduce(Text keyin, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
// Start below any representable reading so the first value always wins.
int max = Integer.MIN_VALUE;
for (IntWritable reading : values) {
if (reading.get() > max) {
max = reading.get();
}
}
// Emit the per-year maximum.
context.write(keyin, new IntWritable(max));
}
}
app
package cn.tju.hadoop;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and submits the max-temperature MapReduce job.
 *
 * <p>Usage: {@code MaxTemperature <input path> <output path>}. The output
 * path must not already exist.
 *
 * @author WWK
 */
public class MaxTemperature {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: MaxTemperature <input path> <output path>");
System.exit(-1);
}
// Job.getInstance() replaces the Job() constructor, which is deprecated
// since Hadoop 2.x.
Job job = Job.getInstance();
// Locate the job jar by the class it contains.
job.setJarByClass(MaxTemperature.class);
// Human-readable name, useful when inspecting the cluster UI.
job.setJobName("Max temperature");
// Input may be a file or a directory (non-recursive); multiple paths allowed.
FileInputFormat.addInputPath(job, new Path(args[0]));
// Exactly one output path, and it must not exist yet.
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
// max() is associative and commutative, so the reducer doubles as a
// combiner, cutting map-side shuffle traffic without changing results.
job.setCombinerClass(MaxTemperatureReducer.class);
// Output types must match the reducer's generic declaration.
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Block until the job finishes; exit 0 on success, 1 on failure.
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
具体思路
执行程序
- 导出jar
- 查看jar是否正确
- 在本地文件系统上执行程序
set HADOOP_CLASSPATH=hadoop-xxxxxx.jar //win7
export HADOOP_CLASSPATH=hadoop-xxxxx.jar //Linux
hadoop xx.MaxTemperature input/ncdc/sample.txt output
- 在HDFS集群上执行程序(将天气数据上传到hdfs文件系统)
hadoop fs -ls -R /
hadoop fs -mkdir /user/wwk/ncdc_data
hadoop fs -put ~/hadoop_learning/19* /user/wwk/ncdc_data
hadoop jar HadoopDemo2.jar /user/wwk/ncdc_data /user/wwk/out