1. Course Design Topic and Requirements
(1) Download the 2014-2018 weather data from ftp://ftp.ncdc.noaa.gov/pub/data/gsod, then clean the data so that only the date and that day's temperature are kept, and save the result as temperature.txt (a sketch of this cleaning step is given after the requirements);
(2) Based on temperature.txt, compute the global maximum and minimum temperature for each year. Requirements: compute the maximum and minimum temperature for each month of each year; compute the average temperature for each month of each year, sorted by year and month from most recent to oldest; filter the temperature records between 15 and 25 degrees by year and month, and store the 2014-2016 records in three separate files, one per year.
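Step (1) above is a plain cleaning pass over the raw GSOD files rather than a MapReduce job. The sketch below shows one possible way to do it; it is only an illustration, and it assumes that the downloaded .op files have already been uncompressed and concatenated into a single local file (raw_gsod.txt is an assumed name), that in each data line the third whitespace-separated field is the date (YEARMODA) and the fourth is the mean temperature (TEMP), and that 9999.9 marks a missing value. It writes "date temperature" pairs to temperature.txt, which is the line format the mapper later splits on a single space.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

// Hypothetical cleaning utility; class and file names are assumptions.
public class CleanGsod {
    public static void main(String[] args) throws IOException {
        try (BufferedReader in = new BufferedReader(new FileReader("raw_gsod.txt"));
             PrintWriter out = new PrintWriter(new FileWriter("temperature.txt"))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.startsWith("STN---")) continue;   // skip the per-file header line
                String[] fields = line.trim().split("\\s+");
                if (fields.length < 4) continue;            // skip malformed lines
                String date = fields[2];                    // e.g. 20140101 (YEARMODA)
                String temp = fields[3];                    // mean temperature (TEMP)
                if (temp.equals("9999.9")) continue;        // skip missing values
                out.println(date + " " + temp);             // keep only date and temperature
            }
        }
    }
}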
Experiment 1 (maximum and minimum temperature for each month of each year):
A sample of 100,000 rows of data was extracted for the analysis.
package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MaxTAndMinTCombiner extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    /*
     * The reduce method is called by the reduce task.
     *
     * The reduce task aggregates the large number of kv pairs delivered by the
     * shuffle phase; pairs with the same key are grouped together, and the
     * custom reduce method is called once per group.
     * For example, given <hello,1><hello,1><hello,1><tom,1><tom,1><tom,1>,
     * reduce is called once for the hello group and once for the tom group.
     * Parameters passed on each call:
     *   key:    the key shared by the group
     *   values: an iterator over all values in the group
     */
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        // Start from the widest possible range so that temperatures below 0
        // or above 100 are still handled correctly.
        double maxvalue = Double.NEGATIVE_INFINITY;
        double minvalue = Double.POSITIVE_INFINITY;
        for (DoubleWritable value : values) {
            if (maxvalue < value.get())
                maxvalue = value.get();
            if (minvalue > value.get())
                minvalue = value.get();
        }
        // Emit the local maximum and minimum temperature for this key
        context.write(key, new DoubleWritable(maxvalue));
        context.write(key, new DoubleWritable(minvalue));
    }
}
package wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class wordcount extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job wordCountJob = Job.getInstance(conf, "word count");
        // Important: specify the jar that contains this job
        wordCountJob.setJarByClass(wordcount.class);
        // Mapper class used by the job
        wordCountJob.setMapperClass(WordCountMapper.class);
        // Reducer class used by the job
        wordCountJob.setReducerClass(WordCountReducer.class);
        // kv types produced by the map phase
        wordCountJob.setMapOutputKeyClass(Text.class);
        wordCountJob.setMapOutputValueClass(DoubleWritable.class);
        // kv types of the final output
        wordCountJob.setOutputKeyClass(NullWritable.class);
        wordCountJob.setOutputValueClass(YearMaxTAndMinT.class);
        // Combiner pre-aggregates the per-key max/min on the map side
        wordCountJob.setCombinerClass(MaxTAndMinTCombiner.class);
        // Paths of the input text data and of the job output
        FileInputFormat.setInputPaths(wordCountJob, new Path("hdfs://hadoop:8020/data/hadoop/temperature30.txt"));
        FileOutputFormat.setOutputPath(wordCountJob, new Path("hdfs://hadoop:8020/output28"));
        // Submit the job to the Hadoop cluster and wait for it to finish
        return wordCountJob.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Load the configuration of the current environment
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:8020"); // specify the namenode
        // Package the job classes into a jar so the job can be submitted remotely
        // (a hedged sketch of such a JarUtil helper is given after this class)
        conf.set("mapreduce.job.jar", JarUtil.jar(wordcount.class));
        // Use ToolRunner.run to execute this custom Tool implementation
        try {
            ToolRunner.run(conf, new wordcount(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
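The JarUtil helper called in main is not shown in this section. Presumably it packages the project's compiled classes into a temporary jar so that the job can be submitted to the cluster directly from a development machine. The following is only a minimal sketch of what such a helper might look like, assuming it just bundles the .class files found on the local classpath; the method signature matches the call JarUtil.jar(wordcount.class) above, but everything else is an assumption rather than the actual implementation.

package wordcount;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.jar.Attributes;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
import java.util.jar.Manifest;

// Hypothetical implementation of the JarUtil helper used by the driver.
public class JarUtil {

    // Pack the directory holding the compiled classes of cls into a temporary
    // jar and return its path.
    public static String jar(Class<?> cls) {
        try {
            File classDir = new File(cls.getProtectionDomain()
                    .getCodeSource().getLocation().toURI());
            File jarFile = File.createTempFile("job-", ".jar");
            jarFile.deleteOnExit();
            Manifest manifest = new Manifest();
            manifest.getMainAttributes().put(Attributes.Name.MANIFEST_VERSION, "1.0");
            try (JarOutputStream out = new JarOutputStream(
                    new FileOutputStream(jarFile), manifest)) {
                addEntries(out, classDir, "");
            }
            return jarFile.getAbsolutePath();
        } catch (Exception e) {
            throw new RuntimeException("failed to build job jar", e);
        }
    }

    // Recursively add every file under dir to the jar, keeping relative paths.
    private static void addEntries(JarOutputStream out, File dir, String prefix)
            throws IOException {
        File[] files = dir.listFiles();
        if (files == null) return;
        for (File f : files) {
            if (f.isDirectory()) {
                addEntries(out, f, prefix + f.getName() + "/");
            } else {
                out.putNextEntry(new JarEntry(prefix + f.getName()));
                try (InputStream in = new FileInputStream(f)) {
                    byte[] buf = new byte[4096];
                    int n;
                    while ((n = in.read(buf)) > 0) {
                        out.write(buf, 0, n);
                    }
                }
                out.closeEntry();
            }
        }
    }
}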
package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    YearMaxTAndMinT yearmaxtandmint = new YearMaxTAndMinT();

    /*
     * The map method is called by the map task, once for every line of input text.
     * Parameters passed on each call:
     *   key:   the byte offset of the line within the file (LongWritable)
     *   value: the text content of the line (Text)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Get the line of text and convert it to a String
        String line = value.toString();
        // Split the line into its fields
        //int[] indexs=getIndexs(line);
        String[] words = line.split(" ");
String s1=words[0].substring(0,