一、TableBean类
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// Custom Writable value type: carries, for one hour, the city with the
// highest PM2.5 total and the city with the lowest, plus both values.
// (NOTE(review): the original comment described this as a reduce-side
// join bean, but the job computes per-hour max/min — comment corrected.)
public class TableBean implements Writable {
    private int hour;         // hour of day the record belongs to
    private long maxPM;       // highest PM2.5 total for that hour
    private long minPM;       // lowest PM2.5 total for that hour
    private String citymaxPM; // city holding the highest PM2.5
    private String cityminPM; // city holding the lowest PM2.5

    // Hadoop instantiates Writables reflectively: a public no-arg
    // constructor is mandatory.
    public TableBean() {
    }

    public String getCitymaxPM() {
        return citymaxPM;
    }

    public void setCitymaxPM(String citymaxPM) {
        this.citymaxPM = citymaxPM;
    }

    public String getCityminPM() {
        return cityminPM;
    }

    public void setCityminPM(String cityminPM) {
        this.cityminPM = cityminPM;
    }

    public long getMaxPM() {
        return maxPM;
    }

    // BUG FIX: was setMaxPM(Long) — boxed parameter was inconsistent with
    // setMinPM(long) and would throw NullPointerException on a null
    // argument. Primitive long accepts every existing call site.
    public void setMaxPM(long maxPM) {
        this.maxPM = maxPM;
    }

    public long getMinPM() {
        return minPM;
    }

    public void setMinPM(long minPM) {
        this.minPM = minPM;
    }

    public int getHour() {
        return hour;
    }

    public void setHour(int hour) {
        this.hour = hour;
    }

    // Serialization: the field order written here MUST match the order
    // read back in readFields exactly.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(hour);
        dataOutput.writeUTF(citymaxPM);
        dataOutput.writeLong(maxPM);
        dataOutput.writeUTF(cityminPM);
        dataOutput.writeLong(minPM);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.hour = dataInput.readInt();
        this.citymaxPM = dataInput.readUTF();
        this.maxPM = dataInput.readLong();
        this.cityminPM = dataInput.readUTF();
        this.minPM = dataInput.readLong();
    }

    // Determines the final text written to the job output files.
    @Override
    public String toString() {
        return "最大PM城市:" + citymaxPM + " \t最大PM:" + maxPM + " \t最小PM城市:" + cityminPM + " \t最小PM:" + minPM;
    }
}
二、Mapper类
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
    private Text outK = new Text();
    private TableBean outV = new TableBean();

    // Called once per input record. Line format (comma separated):
    // 站号,经度,纬度,PM25,PM10,NO2,SO2,O3-1,O3-8h,CO,AQI,等级,年,月,日,小时,城市
    // e.g. 99000,115.49,38.88,43,68,21,20,104,104,0.6,60,2,2018,8,1,0,北京
    // Emits (hour, bean) where the bean is seeded with this record's city
    // and PM2.5 in both the max and min slots; combiner/reducer narrow them.
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] split = line.split(",");
        // ROBUSTNESS FIX: skip blank or malformed lines instead of failing
        // the task with ArrayIndexOutOfBounds/NumberFormatException.
        if (split.length < 5) {
            return;
        }
        String city = split[split.length - 1];   // last column: city name
        String hour = split[split.length - 2];   // second to last: hour
        String pm = split[3];                    // 4th column: PM2.5
        long pmValue;
        try {
            pmValue = Long.parseLong(pm.trim());
        } catch (NumberFormatException ignored) {
            // Non-numeric PM field (e.g. a header row) — skip the record.
            return;
        }
        outK.set(hour);
        outV.setCitymaxPM(city);
        outV.setMaxPM(pmValue);
        outV.setCityminPM(city);
        outV.setMinPM(pmValue);
        context.write(outK, outV);
    }
}
三、Combiner类
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.*;
// Runs on the map side between map and reduce. Output types must match
// the map output types since it feeds the reducer.
// Emits (hour, (city, partial PM sum, city, partial PM sum)) — one bean
// per city seen by this combiner invocation.
public class TableCombiner extends Reducer<Text, TableBean, Text, TableBean> {
    private TableBean outV = new TableBean();

    // key: the hour; values: beans each carrying one record's city and PM2.5.
    // Accumulates the PM2.5 total per city for this hour.
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        // Per-city accumulated PM2.5. The reduce key already fixes the
        // hour, so the map only needs the city as its key (the original
        // redundantly prefixed the hour onto every key).
        Map<String, Long> pmByCity = new TreeMap<>();
        for (TableBean value : values) {
            // BUG FIX: the original iterated over EVERY map entry and set
            // sumPM = entry.getValue() + value.getMaxPM() each time, so the
            // final total was taken from whichever entry happened to be
            // last in the map — not the matching city. merge() adds the new
            // value to exactly the matching city's running total.
            pmByCity.merge(value.getCitymaxPM(), value.getMaxPM(), Long::sum);
        }
        // Write one bean per city; max and min slots both carry the city's
        // partial sum — the reducer picks the real max/min.
        for (Map.Entry<String, Long> entry : pmByCity.entrySet()) {
            outV.setCitymaxPM(entry.getKey());
            outV.setMaxPM(entry.getValue());
            outV.setCityminPM(entry.getKey());
            outV.setMinPM(entry.getValue());
            context.write(key, outV);
        }
    }
}
四、Reducer类
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.*;
public class TableReducer extends Reducer<Text, TableBean, Text, TableBean> {
    private TableBean outV = new TableBean();

    // key: the hour; values: per-city partial PM2.5 sums produced by the
    // combiner (one per city PER map task, so a city can appear more than
    // once here). Emits the city with the largest total and the city with
    // the smallest total for this hour.
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        Map<String, Long> pmByCity = new TreeMap<>();
        // CONSISTENCY FIX: the original used map.put(), which OVERWROTE a
        // city's earlier partial sum whenever the same city arrived from
        // more than one map task. The combiner accumulates per-city sums,
        // so the reducer must add partial sums together, not replace them.
        for (TableBean value : values) {
            pmByCity.merge(value.getCitymaxPM(), value.getMaxPM(), Long::sum);
        }
        // Find the extreme totals. values() is non-empty: reduce is only
        // called with at least one value, so max/min cannot throw.
        List<Long> totals = new ArrayList<>(pmByCity.values());
        long maxPM = Collections.max(totals);
        long minPM = Collections.min(totals);
        String maxPMcity = null;
        String minPMcity = null;
        for (Map.Entry<String, Long> entry : pmByCity.entrySet()) {
            // .longValue() makes the primitive comparison explicit (avoids
            // relying on auto-unboxing semantics of == with a boxed Long).
            if (entry.getValue().longValue() == maxPM) {
                maxPMcity = entry.getKey();
            }
            if (entry.getValue().longValue() == minPM) {
                minPMcity = entry.getKey();
            }
        }
        outV.setCitymaxPM(maxPMcity);
        outV.setMaxPM(maxPM);
        outV.setCityminPM(minPMcity);
        outV.setMinPM(minPM);
        context.write(key, outV);
    }
}
五、Driver类
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;
import java.io.IOException;
// Job driver: wires the mapper, combiner and reducer together and
// submits the job.
public class TableDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        BasicConfigurator.configure(); // default log4j console setup
        // 1. Obtain the job.
        Configuration conf = new Configuration();
        // BUG FIX: was Job.getInstance() with no argument, which silently
        // discarded the Configuration created above — any settings placed
        // on conf would never reach the job.
        Job job = Job.getInstance(conf);
        //System.setProperty("HADOOP_USER_NAME", "root");
        // 2. Set the jar by class.
        job.setJarByClass(TableDriver.class);
        // 3. Attach the mapper and reducer.
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);
        // Attach the combiner (map-side pre-aggregation).
        job.setCombinerClass(TableCombiner.class);
        // 4. Map output KV types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        // 5. Final output KV types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(TableBean.class);
        // 6. Input and output paths (output dir must not already exist).
        FileInputFormat.setInputPaths(job, new Path("C:\\hadoopzye\\hadoop"));
        FileOutputFormat.setOutputPath(job, new Path("C:\\hadoopzye\\output1"));
        // 7. Submit the job and wait.
        boolean result = job.waitForCompletion(true);
        // Exit 0 on success, 1 on failure.
        System.exit(result ? 0 : 1);
    }
}
六、例子
1.因为原文件数据太多所以截取了一点点用于测试
结果:
可以打包成jar包到hadoop集群上测试