一、目的
对于用户访问日志,使用MapReduce进行编程分析,获得每个省份(province_id)的访问量。
二、实现环境
1.CDH 5.7.0
2.CentOS 7.4
3.集群外部Hadoop开发环境部署
参考博客:https://blog.csdn.net/u010886217/article/details/89278390
三、实现代码
1.研究日志格式
2.MapReduce实现统计日志代码
package Hadoop;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class PVMapReduce extends Configured implements Tool{
//map类
//继承Mapper类,<KEYIN, VALUEIN, KEYOUT, VALUEOUT> 输入的key,输入的value,输出的key,输出的value
public static class MyPVMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable>{
IntWritable province_id = new IntWritable();
IntWritable mr_value = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
throws IOException, InterruptedException {
//(pro_id,1)
//获得一整条数据
String line = value.toString();
//将数据进行切割获得一串数组
String [] str = line.split("\t");
//切割之后数组长度小于30,认为这条数据字段缺失,丢弃
if(str.length <= 30){
context.getCounter("我的计数器","长度小于30的数据").increment(1);
return;
}
//取出数组下标为23的字符串 =》 pro_id
String pro_id = str[23];
String url = str[1];
/**
* 其实很多情况下,数据是不完整的,所以在map方法当中,我们要对数据进行清洗
* 做if判断,去掉不符合逻辑的数据
*/
if(StringUtils.isBlank(url)){
context.getCounter("我的计数器","url为空的数据").increment(1);
return;
}
int pro_int = -1;
try{
pro_int = Integer.parseInt(pro_id);
}catch(Exception e){
context.getCounter("我的计数器","数值转换异常的数据").increment(1);
return;
}
//最后要具体查看数据结果,判断是否符合我们的要求(可能还要做更多的过滤)
province_id.set(pro_int);
context.write(province_id, mr_value);
}
}
//combiner
public static class MyPVcombiner extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
IntWritable total = new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,
Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException {
int count = -1;
for (IntWritable intWritable : values) {
count += intWritable.get();
}
total.set(count);
context.write(key, total);
}
}
//reduce类
// reduce类的输入,其实就是map类中map方法的输出 输入key 输入value 输出key 输出value
public static class MyPVReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
IntWritable total = new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,
Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException {
int count = -1;
for (IntWritable intWritable : values) {
count += intWritable.get();
}
total.set(count);
context.write(key, total);
}
}
//运行类,run方法,在测试的时候使用main函数,调用这个类的run方法来运行
/**
*
* @param args 参数是要接受main方法得到的参数,在run中使用
* @return
* @throws Exception
*/
public int run(String[] args) throws Exception {
//通过调用this的getConf方法得到从外部传入的conf对象
Configuration conf = this.getConf();
Job job = Job.getInstance(conf,this.getClass().getSimpleName());
job.setJarByClass(PVMapReduce.class);
//输入路径
Path inpath = new Path(args[0]);
FileInputFormat.addInputPath(job, inpath);
//输出路径
Path outpath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outpath);
//执行前先判断输出路径是否存在,存在就将该路径删除
FileSystem fs = outpath.getFileSystem(conf);
if(fs.exists(outpath)){
fs.delete(outpath,true);
}
//设置Map相关参数
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setMapperClass(MyPVMapper.class);
//设置shuffle
job.setCombinerClass(MyPVcombiner.class);
//设置reduce相关参数
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setReducerClass(MyPVReducer.class);
int isSuccess = job.waitForCompletion(true)?0:1;
return isSuccess;
}
public static void main(String[] args) {
Configuration conf = new Configuration();
args = new String[]{
"hdfs://hadoop01:8020/20150812",
"hdfs://hadoop01:8020/out"
};
try {
int isSucces = ToolRunner.run(conf,new PVMapReduce(), args);
System.out.println("isSuccess"+isSucces);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}