数据集及其介绍
数据来源:conventional_weather_stations_inmet_brazil_1961_2019.csv
序号 | 葡萄牙文字段 - 英文字段 | 中文描述 |
---|---|---|
1 | Estacao - Weather station code | 气象监测站代码 |
2 | Data - Date (dd/MM/YYYY) | 日期 |
3 | Hora - Hour (%H%M) | 小时 |
4 | Precipitacao - Precipitation (mm) | 降水量 |
5 | TempBulboSeco - Dry bulb temperature (°C) | 干球温度 |
6 | TempBulboUmido - Wet bulb temperature (°C) | 湿球温度 |
7 | TempMaxima - Maximum temperature (°C) | 最高温度 |
8 | TempMinima - Minimum temperature (°C) | 最低温度 |
9 | UmidadeRelativa - Relative humidity (%) | 相对湿度 |
10 | PressaoAtmEstacao - Station Atmospheric Pressure (mbar) | 站大气压力 |
11 | PressaoAtmMar - Atmospheric pressure at sea level (mbar) | 海平面的大气压 |
12 | DirecaoVento - Wind Direction (tabela) | 风向 |
13 | VelocidadeVento - Wind speed (m/s) | 风速 |
14 | Insolacao - Insolation (hs) | 日照 |
15 | Nebulosidade - Cloudiness (tenths) | 云量 |
16 | Evaporacao Piche - Piche Evaporation (mm) | 蒸发量 |
17 | Temp Comp Media - Average Compensated Temperature (°C) | 平均补偿温度(平均温度) |
18 | Umidade Relativa Media - Average Relative Humidity (%) | 平均相对湿度(平均湿度) |
19 | Velocidade do Vento Media - Average Wind Speed (mps) | 平均风速 |
需求分析
任务
- 天气数据导入
- 查询某一天的天气数据
- 查询每一年的最高气温
- 查询每年的最低气温
- 查询每年的平均气温
- 查询每年下雨天数
- 预测明天气温
- 菜单与运行
- 数据可视化
思路分析
天气数据导入
通过Web管理界面上传
查询
由于Hadoop不擅长秒级响应,因此应当在查询之前将数据按照需求处理完毕。
该需求中,涉及按天和按年查询,因此实体类中需要记录日期,当天的最高气温、最低气温和平均气温,以及降水量。在数据集中,不光包含了83377观测站记录的信息,因此实体类中也应当添加观测站点的编号,以保证后续的业务扩大。
数据集中,一天有三条记录,这三条记录分别记录了最高气温、最低气温和平均气温,以及每次观测时的降水量,因此可以将原数据压缩,将每天的数据汇总在一条数据中。
预测
可视化
工具类编写:HadoopUtils.java
由于需要多次使用MapReduce,有大量重复性工作,因此编写工具类。
package club.kwcoder.weather.util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
/**
 * Hadoop helper utilities shared by the MapReduce steps.
 *
 * <p>Holds one {@link Configuration} and the {@link FileSystem} obtained from it; both are
 * created once, when this class is loaded.
 *
 * @author zhinushannan
 */
public class HadoopUtils {

    /**
     * Hadoop configuration.
     */
    private static final Configuration conf;

    /**
     * File system handle returned by {@code FileSystem.get(conf)}.
     */
    private static final FileSystem hdfs;

    static {
        conf = new Configuration();
        try {
            hdfs = FileSystem.get(conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Deletes the given path (recursively) if it exists.
     *
     * @param targetPath path to remove
     * @throws RuntimeException wrapping the {@link IOException} if the file system call fails
     */
    public static void deleteIfExist(Path targetPath) {
        try {
            if (hdfs.exists(targetPath)) {
                hdfs.delete(targetPath, true);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Prints to standard output the content of every regular file directly under the given path.
     * Sub-directories are not descended into.
     *
     * <p>Each file is opened and closed individually. The previous implementation kept a single
     * set of stream variables for the whole loop and closed only the last one, leaking one open
     * stream per additional file.
     *
     * <p>An {@link IOException} while listing or reading is reported with
     * {@code printStackTrace()} and ends the listing; it is not rethrown.
     *
     * @param path directory (or file) to print
     * @throws RuntimeException if the path does not exist
     */
    public static void showAllFiles(Path path) {
        try {
            if (!hdfs.exists(path)) {
                throw new RuntimeException("target path is not exist!");
            }
            for (FileStatus fileStatus : hdfs.listStatus(path)) {
                if (!fileStatus.isFile()) {
                    continue;
                }
                // Decode explicitly as UTF-8 so the output does not depend on the JVM's
                // platform default charset. Closing the reader also closes the wrapped stream,
                // and try-with-resources does so even when printing fails.
                try (FSDataInputStream in = hdfs.open(fileStatus.getPath());
                     BufferedReader reader = new BufferedReader(
                             new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
                    reader.lines().forEach(System.out::println);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the shared Hadoop configuration
     */
    public static Configuration getConf() {
        return conf;
    }

    /**
     * @return the shared file system handle
     */
    public static FileSystem getHdfs() {
        return hdfs;
    }
}
工具类编写:ValidateUtils.java
package club.kwcoder.weather.util;
import org.apache.commons.lang3.StringUtils;
/**
 * Validation helpers with one shared convention for this project: the {@code validate}
 * methods return {@code false} for valid input and {@code true} for invalid input.
 *
 * @author zhinushannan
 */
public class ValidateUtils {

    /**
     * Checks a line of input text.
     *
     * @param line text to check
     * @return {@code true} if the line is blank according to {@code StringUtils.isBlank}
     */
    public static boolean validate(String line) {
        return StringUtils.isBlank(line);
    }

    /**
     * Checks the length of a string array.
     *
     * @param items  array to check
     * @param length expected number of elements
     * @return {@code true} if {@code items} is {@code null} or its length differs from
     *         {@code length}
     */
    public static boolean validate(String[] items, int length) {
        // A missing array is invalid; report it rather than throwing NullPointerException.
        return items == null || items.length != length;
    }

    /**
     * Splits a line and checks that it yields exactly {@code limit} parts.
     *
     * <p>{@code sep} is passed to {@link String#split(String, int)} and is therefore a regular
     * expression. With a positive limit, {@code split} returns at most {@code limit} elements,
     * so a line holding more separators still passes; the surplus stays in the last element.
     *
     * @param line  input text
     * @param sep   separator (regular expression)
     * @param limit required number of parts
     * @return the parts, or {@code null} if the line is blank or splits into fewer than
     *         {@code limit} parts
     */
    public static String[] splitAndValidate(String line, String sep, int limit) {
        if (validate(line)) {
            return null;
        }
        String[] parts = line.split(sep, limit);
        return validate(parts, limit) ? null : parts;
    }
}
启动器编写:WeatherStarter.java
因为需要多次启动MapReduce,会有较多的程序,因此借助启动器进行统一规划。
package club.kwcoder.weather;
import club.kwcoder.weather.runner.Runner;
import club.kwcoder.weather.util.HadoopUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Map;
/**
 * Launcher that runs the project's MapReduce steps from one place.
 */
public class WeatherStarter {

    /**
     * Named input/output paths. {@link #run} resolves its {@code inputKey} and
     * {@code outputKey} arguments against this map.
     */
    private static final Map<String, String> PATH;

    static {
        PATH = new HashMap<>();
    }

    public static void main(String[] args) throws InvocationTargetException, NoSuchMethodException, IllegalAccessException, InstantiationException {
    }

    /**
     * Instantiates the given {@link Runner} implementation reflectively and invokes its
     * {@code run(RunnerBuilder)} method.
     *
     * <p>The runner is instantiated before the {@link RunnerBuilder} is built, because
     * building deletes an existing output directory; a class that cannot be instantiated
     * therefore no longer costs the previous results.
     *
     * @param step      Runner implementation to execute
     * @param jobName   name of the job
     * @param inputKey  key of the input path in {@code PATH}
     * @param outputKey key of the output path in {@code PATH}
     * @throws IllegalArgumentException if either key has no registered path
     * @throws NoSuchMethodException    if {@code step} has no {@code run(RunnerBuilder)}
     *                                  method or no no-argument constructor
     */
    public static void run(Class<? extends Runner> step, String jobName, String inputKey, String outputKey) throws NoSuchMethodException, InvocationTargetException, IllegalAccessException, InstantiationException {
        String input = resolvePath(inputKey);
        String output = resolvePath(outputKey);

        // Look up the run method.
        Method run = step.getMethod("run", RunnerBuilder.class);

        // Create the runner. Class.newInstance() is deprecated; the constructor-based form
        // wraps constructor failures in InvocationTargetException instead of leaking them.
        Runner runner = step.getDeclaredConstructor().newInstance();

        RunnerBuilder build = new RunnerBuilder()
                .setJobName(jobName)
                .setInput(input)
                .setOutput(output)
                .build();

        // Invoke the step.
        run.invoke(runner, build);
    }

    /**
     * Looks up a registered path, failing with a message that names the missing key.
     */
    private static String resolvePath(String key) {
        String path = PATH.get(key);
        if (path == null) {
            throw new IllegalArgumentException("no path registered for key: " + key);
        }
        return path;
    }

    /**
     * Carries everything a {@link Runner} needs: configuration, file system, job name and
     * the input/output paths. Instances are created only by {@link WeatherStarter#run}.
     */
    public static class RunnerBuilder {

        private Configuration conf;
        private String jobName;
        private FileSystem hdfs;
        private Path input;
        private Path output;

        private RunnerBuilder() {
        }

        /**
         * Completes the builder: checks that job name, input and output are set, attaches the
         * shared configuration and file system from {@link HadoopUtils}, and deletes the output
         * path if it already exists.
         *
         * @return this builder
         * @throws RuntimeException if a required value is missing, or if deleting the output
         *                          path fails
         */
        public RunnerBuilder build() {
            if (jobName == null || input == null || output == null) {
                throw new RuntimeException("参数配置不完整!");
            }
            this.conf = HadoopUtils.getConf();
            this.hdfs = HadoopUtils.getHdfs();
            // Same exists-then-recursive-delete logic, kept in one place.
            HadoopUtils.deleteIfExist(output);
            return this;
        }

        public RunnerBuilder setJobName(String jobName) {
            this.jobName = jobName;
            return this;
        }

        public RunnerBuilder setInput(String input) {
            this.input = new Path(input);
            return this;
        }

        public RunnerBuilder setOutput(String output) {
            this.output = new Path(output);
            return this;
        }

        public Configuration getConf() {
            return conf;
        }

        public String getJobName() {
            return jobName;
        }

        public FileSystem getHdfs() {
            return hdfs;
        }

        public Path getInput() {
            return input;
        }

        public Path getOutput() {
            return output;
        }
    }
}
数据清洗
需求
将原数据清洗为如下格式:
监测站代码、日期、降水量、最高温度、最低温度、平均温度
方案
- 筛选并只保留83377观测站的信息
- 将每一天的三条数据合并
- 丢弃非法记录
代码编写
实体类WeatherWritable.java
package club.kwcoder.weather.writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
/**
 * One day of weather data for one station: station code, date, precipitation and the
 * maximum, minimum and average temperature.
 *
 * <p>Serialization ({@link #write}/{@link #readFields}) writes all six fields in a fixed
 * order. {@link #write} throws {@link NullPointerException} if any field is still
 * {@code null}, because {@code writeUTF} rejects null and the boxed floats are unboxed.
 */
public class WeatherWritable implements WritableComparable<WeatherWritable> {

    /**
     * Weather station code.
     */
    private String code;

    /**
     * Date, {@code dd/MM/yyyy} in the source data set.
     */
    private String date;

    /**
     * Precipitation.
     */
    private Float precipitation;

    /**
     * Maximum temperature.
     */
    private Float maxTemperature;

    /**
     * Minimum temperature.
     */
    private Float minTemperature;

    /**
     * Average temperature.
     */
    private Float avgTemperature;

    /**
     * Fluent builder; every setter returns the builder itself.
     */
    public static class Builder {

        private String code;
        private String date;
        private Float precipitation;
        private Float maxTemperature;
        private Float minTemperature;
        private Float avgTemperature;

        public Builder() {
        }

        public Builder setCode(String code) {
            this.code = code;
            return this;
        }

        public Builder setDate(String date) {
            this.date = date;
            return this;
        }

        public Builder setPrecipitation(Float precipitation) {
            this.precipitation = precipitation;
            return this;
        }

        public Builder setMaxTemperature(Float maxTemperature) {
            this.maxTemperature = maxTemperature;
            return this;
        }

        public Builder setMinTemperature(Float minTemperature) {
            this.minTemperature = minTemperature;
            return this;
        }

        public Builder setAvgTemperature(Float avgTemperature) {
            this.avgTemperature = avgTemperature;
            return this;
        }

        /**
         * @return a new record holding the builder's current values
         */
        public WeatherWritable build() {
            return new WeatherWritable(this);
        }
    }

    /**
     * No-argument constructor; all fields start as {@code null}.
     */
    public WeatherWritable() {
    }

    /**
     * Copies the current values out of the builder.
     */
    public WeatherWritable(Builder builder) {
        this.code = builder.code;
        this.date = builder.date;
        this.precipitation = builder.precipitation;
        this.maxTemperature = builder.maxTemperature;
        this.minTemperature = builder.minTemperature;
        this.avgTemperature = builder.avgTemperature;
    }

    public WeatherWritable(String code, String date, Float precipitation, Float maxTemperature, Float minTemperature, Float avgTemperature) {
        this.code = code;
        this.date = date;
        this.precipitation = precipitation;
        this.maxTemperature = maxTemperature;
        this.minTemperature = minTemperature;
        this.avgTemperature = avgTemperature;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.code);
        out.writeUTF(this.date);
        out.writeFloat(this.precipitation);
        out.writeFloat(this.maxTemperature);
        out.writeFloat(this.minTemperature);
        out.writeFloat(this.avgTemperature);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Must mirror the field order used by write().
        this.code = in.readUTF();
        this.date = in.readUTF();
        this.precipitation = in.readFloat();
        this.maxTemperature = in.readFloat();
        this.minTemperature = in.readFloat();
        this.avgTemperature = in.readFloat();
    }

    /**
     * Orders records chronologically by date. The station code is not part of the ordering
     * because only station 83377 (Brasilia) is processed.
     *
     * <p>The dates are {@code dd/MM/yyyy} strings, so comparing them directly would sort by
     * day-of-month first ("01/02/1961" before "02/01/1961"). The parts are therefore
     * rearranged to {@code yyyyMMdd} before comparing. A date that does not have three
     * {@code /}-separated parts is compared as-is. Ties fall back to the raw strings, so the
     * result is {@code 0} only for identical date strings. A {@code null} date sorts before
     * any non-null date instead of throwing.
     *
     * @param other record to compare with; {@code null} sorts before this record
     */
    @Override
    public int compareTo(WeatherWritable other) {
        if (null == other) {
            return 1;
        }
        String mine = this.date;
        String theirs = other.getDate();
        if (mine == null) {
            return theirs == null ? 0 : -1;
        }
        if (theirs == null) {
            return 1;
        }
        int chronological = sortableDate(mine).compareTo(sortableDate(theirs));
        return chronological != 0 ? chronological : mine.compareTo(theirs);
    }

    /**
     * Rearranges a zero-padded {@code dd/MM/yyyy} date into {@code yyyyMMdd}, whose string
     * order matches calendar order; any other shape is returned unchanged.
     */
    private static String sortableDate(String date) {
        String[] parts = date.split("/");
        if (parts.length != 3) {
            return date;
        }
        return parts[2] + parts[1] + parts[0];
    }

    /**
     * @return the six fields separated by tabs, in the order code, date, precipitation,
     *         maximum, minimum and average temperature
     */
    @Override
    public String toString() {
        return code + '\t' + date + '\t' + precipitation + '\t' + maxTemperature + '\t' + minTemperature + '\t' + avgTemperature;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        WeatherWritable that = (WeatherWritable) o;
        return Objects.equals(code, that.code)
                && Objects.equals(date, that.date)
                && Objects.equals(precipitation, that.precipitation)
                && Objects.equals(maxTemperature, that.maxTemperature)
                && Objects.equals(minTemperature, that.minTemperature)
                && Objects.equals(avgTemperature, that.avgTemperature);
    }

    @Override
    public int hashCode() {
        // Objects.hashCode(x) is 0 for null, so the values match the hand-written form.
        int result = Objects.hashCode(code);
        result = 31 * result + Objects.hashCode(date);
        result = 31 * result + Objects.hashCode(precipitation);
        result = 31 * result + Objects.hashCode(maxTemperature);
        result = 31 * result + Objects.hashCode(minTemperature);
        result = 31 * result + Objects.hashCode(avgTemperature);
        return result;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public Float getPrecipitation() {
        return precipitation;
    }

    public void setPrecipitation(Float precipitation) {
        this.precipitation = precipitation;
    }

    public Float getMaxTemperature() {
        return maxTemperature;
    }

    public void setMaxTemperature(Float maxTemperature) {
        this.maxTemperature = maxTemperature;
    }

    public Float getMinTemperature() {
        return minTemperature;
    }

    public void setMinTemperature(Float minTemperature) {
        this.minTemperature = minTemperature;
    }

    public Float getAvgTemperature() {
        return avgTemperature;
    }

    public void setAvgTemperature(Float avgTemperature) {
        this.avgTemperature = avgTemperature;
    }
}
DataCleaning.java
package club.kwcoder.weather.runner.runnerimpl;
import club.kwcoder.weather.WeatherStarter;
import club.kwcoder.weather.util.ValidateUtils;
import club.kwcoder.weather.writable.WeatherWritable;
import club.kwcoder.weather.runner.Runner;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
/**
 * Step 1: data cleaning — validates the raw rows, keeps one station, and merges the rows of
 * each day into one record.
 * Output format: station code, date, precipitation, max temperature, min temperature,
 * average temperature.
 */
public class DataCleaning implements Runner {

    /**
     * Configures and runs the cleaning job, blocking until it finishes. The outcome is
     * printed; exceptions are reported with {@code printStackTrace()} and not rethrown.
     *
     * @param builder supplies configuration, job name and the input/output paths
     */
    @Override
    public void run(WeatherStarter.RunnerBuilder builder) {
        try {
            Job job = Job.getInstance(builder.getConf(), builder.getJobName());
            // Jar lookup class
            job.setJarByClass(DataCleaning.class);
            // Input
            job.setInputFormatClass(TextInputFormat.class);
            FileInputFormat.setInputPaths(job, builder.getInput());
            // Mapper
            job.setMapperClass(DataCleanMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(WeatherWritable.class);
            // Reducer
            job.setReducerClass(DataCleanReducer.class);
            job.setOutputKeyClass(WeatherWritable.class);
            job.setOutputValueClass(NullWritable.class);
            // Output
            job.setOutputFormatClass(TextOutputFormat.class);
            FileOutputFormat.setOutputPath(job, builder.getOutput());
            // Run
            if (job.waitForCompletion(true)) {
                System.out.println(builder.getJobName() + " process success");
            } else {
                // Previously a failed job produced no message at all.
                System.err.println(builder.getJobName() + " process failed");
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses raw CSV rows and emits {@code <code-date, record>} for station 83377 only.
     */
    private static class DataCleanMapper extends Mapper<LongWritable, Text, Text, WeatherWritable> {
        /*
        Estacao;Data;Hora;Precipitacao;TempBulboSeco;TempBulboUmido;TempMaxima;TempMinima;UmidadeRelativa;PressaoAtmEstacao;PressaoAtmMar;DirecaoVento;VelocidadeVento;Insolacao;Nebulosidade;Evaporacao Piche;Temp Comp Media;Umidade Relativa Media;Velocidade do Vento Media;
        82024;01/01/1961;0000;;;;32.3;;;;;;;4.4;;;26.56;82.5;3;
        82024;01/01/1961;1200;;26;23.9;;22.9;83;994.2;;5;5;;8;;;;;
        82024;01/01/1961;1800;;32.3;27;;;65;991.6;;5;3;;9;;;;;
        */

        /** The only station kept by this step (Brasilia). */
        private static final String STATION_CODE = "83377";

        /** First token of the CSV header row. */
        private static final String HEADER_PREFIX = "Estacao";

        /** Number of columns in a well-formed row. */
        private static final int COLUMN_COUNT = 19;

        // Per-instance rather than static, so mapper instances never share mutable state.
        private final Text outKey = new Text();

        private final WeatherWritable.Builder builder = new WeatherWritable.Builder();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, WeatherWritable>.Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (ValidateUtils.validate(line)) {
                return;
            }
            // Skip the header row. The prefix alone identifies it: data rows start with a
            // numeric station code, so the byte-offset key need not be consulted.
            if (line.startsWith(HEADER_PREFIX)) {
                return;
            }
            /*
             Split with an explicit limit: a plain split(";") drops trailing empty fields
             (";;;;;".split(";").length is 0), whereas a limit of 19 keeps empty columns.
             */
            String[] items = line.split(";", COLUMN_COUNT);
            // Discard rows that do not have all 19 columns.
            if (ValidateUtils.validate(items, COLUMN_COUNT)) {
                return;
            }
            // Keep only station 83377. Compare the station column itself:
            // line.contains("83377") would also accept rows of other stations whose other
            // columns merely contain those digits.
            if (!STATION_CODE.equals(items[0].trim())) {
                return;
            }

            float precipitation;
            float maxTemperature;
            float minTemperature;
            float avgTemperature;
            try {
                precipitation = parseOrZero(items[3]);
                maxTemperature = parseOrZero(items[6]);
                minTemperature = parseOrZero(items[7]);
                avgTemperature = parseOrZero(items[16]);
            } catch (NumberFormatException e) {
                // An unparseable number makes the row invalid: drop it and count it instead
                // of failing the whole task.
                context.getCounter("DataCleaning", "MALFORMED_RECORDS").increment(1L);
                return;
            }

            WeatherWritable weatherWritable = builder
                    .setCode(items[0])
                    .setDate(items[1])
                    .setPrecipitation(precipitation)
                    .setMaxTemperature(maxTemperature)
                    .setMinTemperature(minTemperature)
                    .setAvgTemperature(avgTemperature)
                    .build();

            // <83377-01/01/1961, weatherWritable>
            outKey.set(weatherWritable.getCode() + "-" + weatherWritable.getDate());
            context.write(outKey, weatherWritable);
        }

        /**
         * Parses a numeric column; a blank column is treated as 0.
         *
         * @throws NumberFormatException if the column is non-blank but not a valid float
         */
        private static float parseOrZero(String raw) {
            return StringUtils.isBlank(raw) ? 0F : Float.parseFloat(raw);
        }
    }

    /**
     * Merges all rows of one station-day into a single record.
     */
    private static class DataCleanReducer extends Reducer<Text, WeatherWritable, WeatherWritable, NullWritable> {

        private final WeatherWritable.Builder builder = new WeatherWritable.Builder();

        @Override
        protected void reduce(Text key, Iterable<WeatherWritable> values, Reducer<Text, WeatherWritable, WeatherWritable, NullWritable>.Context context) throws IOException, InterruptedException {
            String code = null, date = null;
            float precipitation = 0.0F, maxTemp = 0.0F, minTemp = 0.0F, avgTemp = 0.0F;
            // The mapper stores 0 for blank columns, so each value is merged by summation.
            // NOTE(review): this is only correct if each temperature is filled in by at most
            // one row per day, as in the sample rows above — confirm against the full data set.
            for (WeatherWritable value : values) {
                code = value.getCode();
                date = value.getDate();
                precipitation += value.getPrecipitation();
                maxTemp = maxTemp + value.getMaxTemperature();
                minTemp = minTemp + value.getMinTemperature();
                avgTemp = avgTemp + value.getAvgTemperature();
            }
            // Validation: the average must lie between the minimum and the maximum;
            // otherwise the day is discarded.
            if (avgTemp > maxTemp || avgTemp < minTemp) {
                return;
            }
            WeatherWritable weatherWritable = builder
                    .setCode(code)
                    .setDate(date)
                    .setMaxTemperature(maxTemp)
                    .setMinTemperature(minTemp)
                    .setAvgTemperature(avgTemp)
                    .setPrecipitation(precipitation).build();
            context.write(weatherWritable, NullWritable.get());
        }
    }
}
修改启动类
在 `PATH` 中添加如下数据:
// Static initializer for WeatherStarter: registers the paths used by the DataCleaning step.
static {
PATH = new HashMap<>();
// Input path
PATH.put("data_cleaning_input", "/weather");
// Output path
PATH.put("data_cleaning_output", "/weather_result/data_cleaning");
}
在main方法中添加:
// Run the DataCleaning step as job "DataCleaning", resolving both path keys through PATH.
run(DataCleaning.class, "DataCleaning", "data_cleaning_input", "data_cleaning_output");
数据分析:按年统计
需求
将数据清洗步骤的结果进行进一步提取统计,统计为以年为单位的数据,其格式为:
监测站代码、年份、年降水量、年最高温度、年最低温度、降水天数
方案
- 将每一天的数据格式转换为目标数据格式
- 按年份统计年降水量、年最高温度、年最低温度、降水天数
代码编写
实体类:WeatherWritableSummary.java
package club.kwcoder.weather.writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Objects;
/**
 * Yearly summary for one station: the parent's code, precipitation and max/min temperature,
 * plus the year and the number of rainy days.
 *
 * <p>Serialization covers only year, rain days, code, precipitation, maximum and minimum
 * temperature. The parent's {@code date} and {@code avgTemperature} are neither written nor
 * read.
 */
public class WeatherWritableSummary extends WeatherWritable {

    /**
     * Year.
     */
    private String year;

    /**
     * Number of rainy days.
     */
    private Integer rainDays;

    /**
     * No-argument constructor; all fields start as {@code null}.
     */
    public WeatherWritableSummary() {
    }

    /**
     * Copies the current values out of the builder.
     */
    public WeatherWritableSummary(WeatherWritableSummary.Builder builder) {
        super(builder);
        this.year = builder.year;
        this.rainDays = builder.rainDays;
    }

    /**
     * Creates a summary with all values given; the parent's date is left {@code null}.
     */
    public WeatherWritableSummary(String code, Float precipitation, Float maxTemperature, Float minTemperature, Float avgTemperature, String year, Integer rainDays) {
        super(code, null, precipitation, maxTemperature, minTemperature, avgTemperature);
        this.year = year;
        this.rainDays = rainDays;
    }

    public WeatherWritableSummary(String year, Integer rainDays) {
        this.year = year;
        this.rainDays = rainDays;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.year);
        out.writeInt(this.rainDays);
        out.writeUTF(super.getCode());
        out.writeFloat(super.getPrecipitation());
        out.writeFloat(super.getMaxTemperature());
        out.writeFloat(super.getMinTemperature());
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Must mirror the field order used by write().
        this.year = in.readUTF();
        this.rainDays = in.readInt();
        super.setCode(in.readUTF());
        super.setPrecipitation(in.readFloat());
        super.setMaxTemperature(in.readFloat());
        super.setMinTemperature(in.readFloat());
    }

    /**
     * Orders two summaries by station code, then by year; {@code null} values sort first.
     *
     * <p>The inherited ordering compares {@code date}, which a summary normally leaves
     * {@code null}, so comparing two summaries used to throw {@link NullPointerException}.
     * If {@code other} is {@code null} or not a summary, the call is delegated to the parent
     * unchanged.
     *
     * @param other record to compare with
     */
    @Override
    public int compareTo(WeatherWritable other) {
        if (!(other instanceof WeatherWritableSummary)) {
            return super.compareTo(other);
        }
        WeatherWritableSummary that = (WeatherWritableSummary) other;
        int byCode = compareNullsFirst(getCode(), that.getCode());
        return byCode != 0 ? byCode : compareNullsFirst(this.year, that.year);
    }

    /**
     * Null-tolerant string comparison: {@code null} is smaller than any non-null string.
     */
    private static int compareNullsFirst(String left, String right) {
        if (left == null) {
            return right == null ? 0 : -1;
        }
        return right == null ? 1 : left.compareTo(right);
    }

    /**
     * @return code, year, precipitation, max temperature, min temperature and rain days,
     *         separated by tabs
     */
    @Override
    public String toString() {
        return super.getCode() + '\t' + this.getYear() + '\t' + super.getPrecipitation() + '\t' + super.getMaxTemperature() + '\t' + super.getMinTemperature() + "\t" + this.rainDays;
    }

    /**
     * Builder for summaries. The inherited setters are re-declared with this builder type as
     * their return type so that call chains can end with {@link #buildSummary()}.
     */
    public static class Builder extends WeatherWritable.Builder {

        private String year;
        private Integer rainDays;

        public Builder() {
        }

        public WeatherWritableSummary.Builder setRainDays(Integer rainDays) {
            this.rainDays = rainDays;
            return this;
        }

        public WeatherWritableSummary.Builder setYear(String year) {
            this.year = year;
            return this;
        }

        @Override
        public WeatherWritableSummary.Builder setCode(String code) {
            super.setCode(code);
            return this;
        }

        @Override
        public WeatherWritableSummary.Builder setPrecipitation(Float precipitation) {
            super.setPrecipitation(precipitation);
            return this;
        }

        @Override
        public WeatherWritableSummary.Builder setMaxTemperature(Float maxTemperature) {
            super.setMaxTemperature(maxTemperature);
            return this;
        }

        @Override
        public WeatherWritableSummary.Builder setMinTemperature(Float minTemperature) {
            super.setMinTemperature(minTemperature);
            return this;
        }

        @Override
        public WeatherWritableSummary.Builder setAvgTemperature(Float avgTemperature) {
            super.setAvgTemperature(avgTemperature);
            return this;
        }

        /**
         * @return a new summary holding the builder's current values
         */
        public WeatherWritableSummary buildSummary() {
            return new WeatherWritableSummary(this);
        }
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public Integer getRainDays() {
        return rainDays;
    }

    public void setRainDays(Integer rainDays) {
        this.rainDays = rainDays;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        if (!super.equals(o)) {
            return false;
        }
        WeatherWritableSummary that = (WeatherWritableSummary) o;
        return Objects.equals(year, that.year) && Objects.equals(rainDays, that.rainDays);
    }

    @Override
    public int hashCode() {
        // Objects.hashCode(x) is 0 for null, so the values match the hand-written form.
        int result = super.hashCode();
        result = 31 * result + Objects.hashCode(year);
        result = 31 * result + Objects.hashCode(rainDays);
        return result;
    }
}
YearSummary.java
package club.kwcoder.weather.runner.runnerimpl;
import club.kwcoder.weather.WeatherStarter;
import club.kwcoder.weather.runner.Runner;
import club.kwcoder.weather.util.ValidateUtils;
import club.kwcoder.weather.writable.WeatherWritable;
import club.kwcoder.weather.writable.WeatherWritableSummary;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
/**
 * Aggregates the cleaned daily records per year. Output: station code, year, yearly
 * precipitation, yearly max temperature, yearly min temperature, number of rainy days.
 */
public class YearSummary implements Runner {

    /**
     * Configures and runs the yearly summary job, blocking until it finishes. The outcome is
     * printed; exceptions are reported with {@code printStackTrace()} and not rethrown.
     *
     * @param builder supplies configuration, job name and the input/output paths
     */
    @Override
    public void run(WeatherStarter.RunnerBuilder builder) {
        try {
            Job job = Job.getInstance(builder.getConf(), builder.getJobName());
            // Jar lookup class: this step's own class (was DataCleaning.class by copy-paste).
            job.setJarByClass(YearSummary.class);
            // Input
            job.setInputFormatClass(TextInputFormat.class);
            FileInputFormat.setInputPaths(job, builder.getInput());
            // Mapper
            job.setMapperClass(YearSummaryMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(WeatherWritableSummary.class);
            // Reducer
            job.setReducerClass(YearSummaryReducer.class);
            job.setOutputKeyClass(WeatherWritableSummary.class);
            job.setOutputValueClass(NullWritable.class);
            // Output
            job.setOutputFormatClass(TextOutputFormat.class);
            FileOutputFormat.setOutputPath(job, builder.getOutput());
            // Run
            if (job.waitForCompletion(true)) {
                System.out.println(builder.getJobName() + " process success");
            } else {
                // Previously a failed job produced no message at all.
                System.err.println(builder.getJobName() + " process failed");
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Turns each cleaned daily row into a summary record keyed by {@code code-year}.
     */
    private static class YearSummaryMapper extends Mapper<LongWritable, Text, Text, WeatherWritableSummary> {

        private final Text outKey = new Text();

        private final WeatherWritableSummary.Builder outValBuilder = new WeatherWritableSummary.Builder();

        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, WeatherWritableSummary>.Context context) throws IOException, InterruptedException {
            /*
            code    date        precip  max   min   avg
            83377	01/01/1963	0.0	29.0	16.7	21.74
            83377	01/01/1964	3.2	26.0	18.0	20.84
            83377	01/01/1965	21.2	24.7	16.6	19.66
            83377	01/01/1966	20.0	27.8	17.5	20.7
            83377	01/01/1967	0.1	27.6	15.4	21.2
            83377	01/01/1968	0.0	28.6	17.8	22.28
            83377	01/01/1970	45.2	26.3	16.3	19.8
            83377	01/01/1971	0.0	30.3	18.5	24.02
            */
            String line = value.toString();
            String[] items = ValidateUtils.splitAndValidate(line, "\t", 6);
            if (null == items) {
                return;
            }

            // The year is the third part of dd/MM/yyyy; a date of any other shape makes the
            // row unusable, so skip it instead of failing on an array index.
            String[] dateParts = items[1].split("/");
            if (dateParts.length != 3) {
                context.getCounter("YearSummary", "MALFORMED_RECORDS").increment(1L);
                return;
            }

            float precipitation;
            float maxTemperature;
            float minTemperature;
            try {
                precipitation = Float.parseFloat(items[2]);
                maxTemperature = Float.parseFloat(items[3]);
                minTemperature = Float.parseFloat(items[4]);
            } catch (NumberFormatException e) {
                // Drop and count unparseable rows rather than failing the whole task.
                context.getCounter("YearSummary", "MALFORMED_RECORDS").increment(1L);
                return;
            }

            WeatherWritableSummary outVal = outValBuilder
                    .setCode(items[0])
                    .setYear(dateParts[2])
                    .setPrecipitation(precipitation)
                    .setMaxTemperature(maxTemperature)
                    .setMinTemperature(minTemperature)
                    // A rainy day is one with positive precipitation. Testing the parsed number
                    // avoids depending on the exact text "0.0".
                    .setRainDays(precipitation > 0F ? 1 : 0)
                    .buildSummary();

            outKey.set(outVal.getCode() + "-" + outVal.getYear());
            context.write(outKey, outVal);
        }
    }

    /**
     * Folds all daily records of one station-year into a single summary record.
     */
    private static class YearSummaryReducer extends Reducer<Text, WeatherWritableSummary, WeatherWritableSummary, NullWritable> {

        /** Value the cleaning step stores when a temperature was not recorded. */
        private static final float MISSING = 0.0F;

        private final WeatherWritableSummary.Builder outKeyBuilder = new WeatherWritableSummary.Builder();

        private final NullWritable outVal = NullWritable.get();

        @Override
        protected void reduce(Text key, Iterable<WeatherWritableSummary> values, Reducer<Text, WeatherWritableSummary, WeatherWritableSummary, NullWritable>.Context context) throws IOException, InterruptedException {
            String code = "", year = "";
            int rainDays = 0;
            float precipitation = 0.0f;
            // Seed with the infinities. Float.MIN_VALUE is the smallest POSITIVE float, not
            // the most negative one, so using it as the seed capped the yearly maximum at a
            // tiny positive number whenever no reading was above zero.
            float maxTemp = Float.NEGATIVE_INFINITY;
            float minTemp = Float.POSITIVE_INFINITY;

            // Accumulate yearly precipitation, max temperature, min temperature and rain days.
            for (WeatherWritableSummary value : values) {
                if (ValidateUtils.validate(code) || ValidateUtils.validate(year)) {
                    code = value.getCode();
                    year = value.getYear();
                }
                precipitation += value.getPrecipitation();

                // 0.0 marks a missing reading and must not take part in either extreme.
                float dayMax = value.getMaxTemperature();
                if (dayMax != MISSING) {
                    maxTemp = Math.max(maxTemp, dayMax);
                }
                float dayMin = value.getMinTemperature();
                if (dayMin != MISSING) {
                    minTemp = Math.min(minTemp, dayMin);
                }

                rainDays += value.getRainDays();
            }

            // A year without any usable reading reports the missing marker instead of the seed.
            if (Float.isInfinite(maxTemp)) {
                maxTemp = MISSING;
            }
            if (Float.isInfinite(minTemp)) {
                minTemp = MISSING;
            }

            WeatherWritableSummary outKey = outKeyBuilder
                    .setCode(code)
                    .setYear(year)
                    .setPrecipitation(precipitation)
                    .setMaxTemperature(maxTemp)
                    .setMinTemperature(minTemp)
                    .setRainDays(rainDays)
                    .buildSummary();

            // Output format: code year precipitation maxTemperature minTemperature rainDays
            context.write(outKey, outVal);
        }
    }
}
修改启动类
在 `PATH` 中添加如下数据: