将日志文件上传到 Hadoop 的 HDFS 中
一、根据上述日志文件,计算该天的独立ip数,pv数(注意要筛选日志,并非每条记录都要统计),被传输页面的总字节数
1、将日志信息分为8个字段,创建指标对象KPI
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
/*
* KPI Object
*/
public class KPI {
private String remote_addr;// client IP address
private String remote_user;// client user name; the "-" attribute is ignored
private String time_local;// access time and time zone
private String request;// requested URL and HTTP protocol
private String status;// request status; 200 means success
private String body_bytes_sent;// size of the body content sent to the client
private String http_referer;// the page whose link led to this request
private String http_user_agent;// information about the client's browser
private boolean valid = true;// whether this record is considered valid
/**
 * Parses one access-log line into a {@link KPI}.
 *
 * <p>The line is split on single spaces. A line needs at least 12 tokens to be
 * populated; shorter lines, and {@code null}, yield a record that is only marked
 * invalid. Tokens 0, 1, 3, 6, 8, 9, 10 and 11 (plus 12 when present) are copied
 * into the record. A record is also marked invalid when its status is not a
 * number, is 400 or above, or when the time token is empty.
 *
 * @param line raw log line; may be {@code null}
 * @return a new KPI, never {@code null}; check its valid flag before using it
 */
private static KPI parser(String line) {
    KPI kpi = new KPI();
    if (line == null) {
        // Nothing to split: report an invalid record instead of throwing.
        kpi.setValid(false);
        return kpi;
    }

    String[] arr = line.split(" ");
    if (arr.length <= 11) {
        // Too few tokens to fill the record.
        kpi.setValid(false);
        return kpi;
    }

    kpi.setRemote_addr(arr[0]);
    kpi.setRemote_user(arr[1]);
    if (arr[3].isEmpty()) {
        // Consecutive spaces produce empty tokens; substring(1) would throw on one.
        kpi.setValid(false);
    } else {
        // Skip the token's first character (presumably the '[' that opens the
        // timestamp - confirm against the log format).
        kpi.setTime_local(arr[3].substring(1));
    }
    kpi.setRequest(arr[6]);
    kpi.setStatus(arr[8]);
    kpi.setBody_bytes_sent(arr[9]);
    kpi.setHttp_referer(arr[10]);
    if (arr.length > 12) {
        kpi.setHttp_user_agent(arr[11] + " " + arr[12]);
    } else {
        kpi.setHttp_user_agent(arr[11]);
    }

    try {
        // 400 and above is an HTTP error.
        if (Integer.parseInt(kpi.getStatus()) >= 400) {
            kpi.setValid(false);
        }
    } catch (NumberFormatException e) {
        // Some records carry no numeric status; echo the line and reject it.
        System.out.println(line);
        kpi.setValid(false);
    }
    return kpi;
}
/**
 * Parses a log line for the per-page PV statistic.
 *
 * <p>Currently this only delegates to {@code parser}. A page whitelist that kept
 * just /forum-46-1.html, /forum-58-1.html and /forum-61-1.html is disabled.
 *
 * @param line raw log line
 * @return the KPI produced by {@code parser}
 */
public static KPI filterPVs(String line) {
    KPI parsed = parser(line);
    return parsed;
}
/**
 * Parses a log line for the per-page unique-IP statistic.
 *
 * <p>Currently this only delegates to {@code parser}. A page whitelist that kept
 * just /forum-46-1.html, /forum-58-1.html and /forum-61-1.html is disabled.
 *
 * @param line raw log line
 * @return the KPI produced by {@code parser}
 */
public static KPI filterIPs(String line) {
    KPI parsed = parser(line);
    return parsed;
}
/**
 * Parses a log line for the PV-by-browser statistic.
 *
 * @param line raw log line
 * @return the KPI produced by {@code parser}, with no extra filtering
 */
public static KPI filterBroswer(String line) {
    KPI parsed = parser(line);
    return parsed;
}
/**
 * Parses a log line for the PV-by-hour statistic.
 *
 * @param line raw log line
 * @return the KPI produced by {@code parser}, with no extra filtering
 */
public static KPI filterTime(String line) {
    KPI parsed = parser(line);
    return parsed;
}
/**
 * Parses a log line for the PV-by-visiting-domain statistic.
 *
 * @param line raw log line
 * @return the KPI produced by {@code parser}, with no extra filtering
 */
public static KPI filterDomain(String line) {
    KPI parsed = parser(line);
    return parsed;
}
/**
 * Renders the valid flag followed by every field as {@code name:value},
 * one pair per line.
 *
 * @return the multi-line description of this record
 */
@Override
public String toString() {
    return new StringBuilder()
            .append("valid:").append(this.valid)
            .append("\nremote_addr:").append(this.remote_addr)
            .append("\nremote_user:").append(this.remote_user)
            .append("\ntime_local:").append(this.time_local)
            .append("\nrequest:").append(this.request)
            .append("\nstatus:").append(this.status)
            .append("\nbody_bytes_sent:").append(this.body_bytes_sent)
            .append("\nhttp_referer:").append(this.http_referer)
            .append("\nhttp_user_agent:").append(this.http_user_agent)
            .toString();
}
/**
 * Returns the stored client address.
 *
 * @return the current value of the remote_addr field
 */
public String getRemote_addr() {
    return this.remote_addr;
}
/**
 * Stores the client address.
 *
 * @param remote_addr value to keep in the remote_addr field
 */
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
/**
 * Returns the stored client user name.
 *
 * @return the current value of the remote_user field
 */
public String getRemote_user() {
    return this.remote_user;
}
/**
 * Stores the client user name.
 *
 * @param remote_user value to keep in the remote_user field
 */
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
/**
 * Returns the access time exactly as it was stored, without parsing it.
 *
 * @return the current value of the time_local field
 */
public String getTime_local() {
    return this.time_local;
}
/**
 * Parses the stored access time using the pattern {@code dd/MMM/yyyy:HH:mm:ss}
 * with US locale rules.
 *
 * @return the access time as a {@link Date}
 * @throws ParseException if the stored text does not match the pattern
 */
public Date getTime_local_Date() throws ParseException {
    final String pattern = "dd/MMM/yyyy:HH:mm:ss";
    return new SimpleDateFormat(pattern, Locale.US).parse(this.time_local);
}
/**
 * Formats the access time as an hour-granularity key in {@code yyyyMMddHH} form,
 * for example {@code 2013091806}.
 *
 * @return the access time truncated to the hour, as a 10-digit string
 * @throws ParseException if the stored access time cannot be parsed
 */
public String getTime_local_Date_hour() throws ParseException {
    // Pin the locale, as getTime_local_Date() does, so the key is always written
    // with ASCII digits regardless of the JVM's default locale.
    SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH", Locale.US);
    return df.format(this.getTime_local_Date());
}
/**
 * Stores the access time text.
 *
 * @param time_local value to keep in the time_local field
 */
public void setTime_local(String time_local) {
this.time_local = time_local;
}
/**
 * Returns the stored request value.
 *
 * @return the current value of the request field
 */
public String getRequest() {
    return this.request;
}
/**
 * Stores the request value.
 *
 * @param request value to keep in the request field
 */
public void setRequest(String request) {
this.request = request;
}
/**
 * Returns the stored status text; it is kept as a string and not converted here.
 *
 * @return the current value of the status field
 */
public String getStatus() {
    return this.status;
}
public