LogBean类
java
package clean;
import lombok.Getter;
import lombok.Setter;
/**
 * One parsed record of a web-server access log.
 *
 * <p>Getters and setters are generated by Lombok from the field names, so the
 * snake_case names are kept as-is: renaming a field would rename its accessors
 * (for example {@code setRemote_addr}).
 */
@Setter
@Getter
public class LogBean {
    /** Separator written between fields by {@link #toString()}. */
    private static final String FIELD_SEPARATOR = "\001";

    private String remote_addr;      // client IP address
    private String remote_user;      // client user name; "-" is a placeholder to be ignored
    private String time_local;       // access time and time zone
    private String request;          // requested URL and HTTP protocol
    private String status;           // request status; 200 on success
    private String body_bytes_sent;  // size of the body content sent to the client
    private String http_referer;     // page the request was linked from
    private String http_user_agent;  // client browser information
    private boolean valid = true;    // whether this record is considered valid

    /**
     * Renders the record as a single line: the {@code valid} flag followed by
     * every field in declaration order, separated by {@code \001}.
     * Fields that were never set are written as the text {@code null}.
     *
     * @return the separator-joined representation of this record
     */
    @Override
    public String toString() {
        // String.join writes "null" for null elements, the same text an
        // append(String) of a null reference would produce.
        return String.join(FIELD_SEPARATOR,
                String.valueOf(valid),
                remote_addr,
                remote_user,
                time_local,
                request,
                status,
                body_bytes_sent,
                http_referer,
                http_user_agent);
    }
}
LogCleanMapper
java
package clean;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that cleans web access-log lines: each input line is parsed into a
 * {@link LogBean}, and only records still flagged valid are written out, using
 * a {@link NullWritable} key and the bean's {@code toString()} text as value.
 */
public class LogCleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    /** A line must split into at least this many space-separated tokens to be parsed. */
    private static final int MIN_FIELDS = 12;
    /** Status codes at or above this value mark the record as invalid. */
    private static final int FIRST_REJECTED_STATUS = 400;

    /**
     * Parses one input line and emits it only if it is valid.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of the access log
     * @param context job context used to emit the cleaned record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        LogBean logBean = pressLog(value.toString());
        // Invalid records are dropped silently.
        if (!logBean.isValid()) {
            return;
        }
        context.write(NullWritable.get(), new Text(logBean.toString()));
    }

    /**
     * Splits a log line on single spaces and copies the tokens into a {@link LogBean}.
     *
     * <p>The returned bean is flagged invalid when the line is {@code null}, has
     * fewer than {@value #MIN_FIELDS} tokens, has a status token that is not an
     * integer, or has a status of {@value #FIRST_REJECTED_STATUS} or above.
     *
     * @param line raw access-log line
     * @return the populated bean; never {@code null}
     */
    public LogBean pressLog(String line) {
        LogBean logBean = new LogBean();
        if (line == null) {
            logBean.setValid(false);
            return logBean;
        }

        String[] s = line.split(" ");
        if (s.length < MIN_FIELDS) {
            logBean.setValid(false);
            return logBean;
        }

        logBean.setRemote_addr(s[0]);
        logBean.setRemote_user(s[1]);
        logBean.setTime_local(s[3] + s[4]);
        logBean.setRequest(s[7]);
        logBean.setStatus(s[8]);
        logBean.setBody_bytes_sent(s[9]);
        logBean.setHttp_referer(s[10]);
        if (s.length > MIN_FIELDS) {
            // The user agent was split across tokens 11 and 12; rejoin them
            // with the space that split(" ") removed.
            logBean.setHttp_user_agent(s[11] + " " + s[12]);
        } else {
            logBean.setHttp_user_agent(s[11]);
        }

        try {
            if (Integer.parseInt(logBean.getStatus()) >= FIRST_REJECTED_STATUS) {
                logBean.setValid(false);
            }
        } catch (NumberFormatException e) {
            // A non-numeric status means the line is malformed: drop the
            // record instead of failing the whole task.
            logBean.setValid(false);
        }
        return logBean;
    }
}
LogDriver
java
package clean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import telephone.LogCleanDriver;
import telephone.LogCleanMapper;
import java.io.IOException;
public class LogDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
args=new String[]{"G:\\a\\web.txt","G:\\a\\OutWeb3.txt"};
Job job = Job.getInstance(new Configuration());
job.setJarByClass(LogDriver.class);
//配置mapper类及输出数据类型
job.setMapperClass(LogCleanMapper.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
//设置reduce的个数,默认是1
job.setNumReduceTasks(0);
//输入数据路径
FileInputFormat.setInputPaths(job,new Path(args[0]));
//输出数据路径
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//提交 任务
job.waitForCompletion(true);
}
}