MapReduce Daily Notes
A simple example that computes the probability of occurrence of each character:
Mapper class
package com.fiberhome.py.CrawlingMR;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * GodIsPY
 * 2018-01-07 19:12:10
 */
public class CrawlingMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reusable output value; created once instead of being set on every call.
    private static final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.getCounter("MAP", "INPUT").increment(1);
        // Emit the line that was read, paired with a count of 1.
        context.write(value, one);
        context.getCounter("MAP", "OUTPUT").increment(1);
    }
}
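Because every value the mapper emits is a 1, a combiner can pre-aggregate the counts on each map task before the shuffle and cut network traffic. The job in this post does not define one; the class below is only a sketch of that option (it cannot reuse the CrawlingReduce class shown next, whose output value type is FloatWritable rather than IntWritable):

package com.fiberhome.py.CrawlingMR;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch of an optional combiner: sums the 1s locally on the map side.
public class CrawlingCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    private final IntWritable sum = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        sum.set(total);
        context.write(key, sum);
    }
}

If used, it would be registered in the driver with job.setCombinerClass(CrawlingCombiner.class); the reducer logic needs no change, because it already sums whatever counts arrive.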
Reducer class
package com.fiberhome.py.CrawlingMR;

import java.io.IOException;

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * GodIsPY
 * 2018-01-07 19:21:10
 */
public class CrawlingReduce extends Reducer<Text, IntWritable, Text, FloatWritable> {

    // Total number of input records in the sample dataset. Hard-coding this is
    // brittle: it must match the actual input size, or the probabilities are wrong.
    private static final int TOTAL_RECORDS = 1180;

    private final FloatWritable avg = new FloatWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        context.getCounter("REDUCE", "INPUT").increment(1);
        // Sum the counts the mappers emitted for this key.
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // Divide by the total record count to turn the count into a probability.
        avg.set((float) sum / TOTAL_RECORDS);
        context.write(key, avg);
        context.getCounter("REDUCE", "OUTPUT").increment(1);
    }
}
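Because the hard-coded total breaks as soon as the input changes, one alternative is to let the driver pass the total through the job Configuration and read it once per task in setup(). A minimal sketch under that assumption; the property name total.records and the class name are invented here:

package com.fiberhome.py.CrawlingMR;

import java.io.IOException;

import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical variant of CrawlingReduce: the driver sets the total before
// submission, e.g. job.getConfiguration().setLong("total.records", 1180L);
public class ConfiguredCrawlingReduce extends Reducer<Text, IntWritable, Text, FloatWritable> {

    private final FloatWritable avg = new FloatWritable();
    private long totalRecords;

    @Override
    protected void setup(Context context) {
        // Read once per reduce task; fall back to 1 if the driver never set it.
        totalRecords = context.getConfiguration().getLong("total.records", 1L);
    }

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        avg.set((float) sum / totalRecords);
        context.write(key, avg);
    }
}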
The main method and job configuration
package com.fiberhome.py.main;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.fiberhome.py.CrawlingMR.CrawlingMapper;
import com.fiberhome.py.CrawlingMR.CrawlingReduce;

/**
 * GodIsPY
 * 2018-01-07 19:30:22
 */
public class JobCrawling {

    public static void main(String[] args) {
        try {
            // Create the Hadoop job (new Job() is deprecated; use the factory method).
            Job job = Job.getInstance(new Configuration());
            // Set the job name.
            job.setJobName("JOB_Crawling_PY");
            // Wire up the jar, mapper, and reducer.
            job.setJarByClass(JobCrawling.class);
            job.setMapperClass(CrawlingMapper.class);
            job.setReducerClass(CrawlingReduce.class);
            // The final output types must match the reducer (Text, FloatWritable),
            // and the map output types must match the mapper (Text, IntWritable).
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(FloatWritable.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            // Input/output formats and paths.
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            if (job.waitForCompletion(true)) {
                System.out.println("Job succeeded");
            } else {
                System.out.println("Job failed");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
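To run the job, package the classes into a jar and submit it from a cluster client with the hadoop jar command. A sketch, assuming the jar is named crawling-mr.jar and using placeholder HDFS paths (note that the output directory must not already exist, or FileOutputFormat will fail the job):

hadoop jar crawling-mr.jar com.fiberhome.py.main.JobCrawling /input/chars /output/chars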
Points to note in MapReduce
1. If you need to read a resource file from the local project, the map tasks cannot see it, because they run on cluster nodes rather than on the client machine; instead, read the configuration file on the driver side and pass its values along through Hadoop's Configuration (see the first sketch after this list).
2. A Mapper receives one line of input per map() call; with TextInputFormat, the key is the byte offset of the line within the file and the value is the line's text (see the second sketch after this list).
3. A Reducer receives, for each key, all of the values the Mappers emitted under that key, already grouped; for example, map outputs (a,1), (b,1), (a,1) arrive at reduce() as (a,[1,1]) and (b,[1]).
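A minimal sketch of the Configuration approach from point 1, assuming a project resource named /crawling.properties containing a mode key (both names are invented for this example). The driver reads the file locally and copies the value into the job Configuration; any map or reduce task can then retrieve it via context.getConfiguration().get("crawling.mode"):

package com.fiberhome.py.main;

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver fragment: the driver runs on the client machine, so it
// can read project resources and forward the needed values to the tasks.
public class ConfigForwardingExample {

    public static Job buildJob() throws Exception {
        Configuration conf = new Configuration();
        // Load the project resource from the classpath (driver side only).
        Properties props = new Properties();
        props.load(ConfigForwardingExample.class.getResourceAsStream("/crawling.properties"));
        // Copy the value into the job configuration so every task can read it.
        conf.set("crawling.mode", props.getProperty("mode", "default"));
        return Job.getInstance(conf);
    }
}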
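On point 2: CrawlingMapper above emits whole lines as keys, which only yields per-character statistics if every input line contains a single character. If the lines were longer, a variant along these lines (CharMapper is a sketch, not part of the original code) would split them first:

package com.fiberhome.py.CrawlingMR;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical variant: emits one (character, 1) pair per character of the line.
public class CharMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);
    private final Text character = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        for (int i = 0; i < line.length(); i++) {
            character.set(String.valueOf(line.charAt(i)));
            context.write(character, one);
        }
    }
}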