package com.ibeifeng.hadoop19_copy;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.jboss.netty.util.internal.StringUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.jboss.netty.util.internal.StringUtil;
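// Processes log records of users visiting the website.
// Each log line represents one record.
// A record consists of 36 tab-separated fields; see the "track" dictionary for the meaning of each field.
// This job counts page views (PV) per province in a log file. It needs two fields: url (the 2nd field) and provinceID (the 24th field). Both must be non-blank.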
/**
 * MapReduce job that counts page views (PV) per province from tab-separated web log lines.
 *
 * <p>The mapper emits {@code (provinceId, 1)} for every line that passes validation, and the
 * reducer sums the ones per province. Lines that fail validation are skipped and tallied in
 * counters under the {@code Count_NUMBER} group.
 */
public class WebPV {

    /** Counter group under which all skipped-record counters are reported. */
    private static final String COUNTER_GROUP = "Count_NUMBER";

    /** Lines with fewer tab-separated fields than this are skipped. */
    private static final int MIN_FIELD_COUNT = 30;

    /** Zero-based index of the url field (the 2nd field of a line). */
    private static final int URL_INDEX = 1;

    /** Zero-based index of the provinceID field (the 24th field of a line). */
    private static final int PROVINCE_ID_INDEX = 23;

    /** Input path used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT_PATH = "hdfs://node-1:8020/2015082818";

    /** Output path used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_PATH = "hdfs://node-1:8020/test";

    /**
     * Mapper that validates one log line and emits {@code (provinceId, 1)} for it.
     */
    public static class PVMap extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

        // Writable instances are reused across map() calls instead of being allocated per record.
        private final IntWritable mapoutkey = new IntWritable();
        private final IntWritable mapoutvalue = new IntWritable(1);

        /**
         * Parses one log line and writes {@code (provinceId, 1)} if the line is valid.
         *
         * <p>A line is skipped (and the matching counter incremented) when it has fewer than
         * {@code MIN_FIELD_COUNT} fields, when its url or provinceID is blank, or when its
         * provinceID is not a valid integer.
         *
         * @param key     input key supplied by the input format (not used)
         * @param value   one line of the log file
         * @param context task context used for counters and output
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");

            // Too few fields: skip the line and count how many such lines were seen.
            if (fields.length < MIN_FIELD_COUNT) {
                context.getCounter(COUNTER_GROUP, "Length_short_than_30").increment(1L);
                return;
            }

            // The url must not be blank.
            if (StringUtils.isBlank(fields[URL_INDEX])) {
                context.getCounter(COUNTER_GROUP, "url_is_blank").increment(1L);
                return;
            }

            // The provinceID must not be blank.
            String provinceId = fields[PROVINCE_ID_INDEX];
            if (StringUtils.isBlank(provinceId)) {
                context.getCounter(COUNTER_GROUP, "provinceID_is_blank").increment(1L);
                return;
            }

            final int province;
            try {
                province = Integer.parseInt(provinceId);
            } catch (NumberFormatException e) {
                // Malformed provinceID: the counter is the record of the failure. No stack trace
                // is printed because one per bad input line would flood the task logs.
                context.getCounter(COUNTER_GROUP, "Transe").increment(1L);
                return;
            }

            mapoutkey.set(province);
            context.write(mapoutkey, mapoutvalue);
        }
    }

    /**
     * Reducer that sums the per-record counts of one province.
     */
    public static class PVReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        // Reused across reduce() calls instead of being allocated per key.
        private final IntWritable reduceoutvalue = new IntWritable();

        /**
         * Sums all values of one province and writes {@code (provinceId, sum)}.
         *
         * @param key     the province id
         * @param values  the counts emitted for this province
         * @param context task context used for output
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(IntWritable key, Iterable<IntWritable> values,
                Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            reduceoutvalue.set(sum);
            context.write(key, reduceoutvalue);
        }
    }

    /**
     * Configures and runs the job, then exits with status 0 on success or 1 on failure.
     *
     * <p>Any existing output directory is deleted recursively before the job starts.
     *
     * @param args optional arguments: {@code args[0]} is the input path and {@code args[1]} is
     *             the output path; each falls back to the built-in default when absent
     * @throws IOException            if file system access or job submission fails
     * @throws ClassNotFoundException if a job class cannot be found
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputLocation = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputLocation = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // Load the default Hadoop configuration.
        Configuration conf = new Configuration();

        // Create the job and tell Hadoop which jar contains its classes.
        Job job = Job.getInstance(conf, "JinNan");
        job.setJarByClass(WebPV.class);

        // Input path.
        Path inpath = new Path(inputLocation);
        FileInputFormat.setInputPaths(job, inpath);

        // Mapper and its output types. The value class must be set with setMapOutputValueClass;
        // calling setOutputValueClass here would leave the map output value class unset.
        job.setMapperClass(PVMap.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reducer and the final output types.
        job.setReducerClass(PVReduce.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Output path: remove a previous result so the job does not fail on an existing directory.
        Path outpath = new Path(outputLocation);
        FileSystem fs = outpath.getFileSystem(conf);
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}