网站访问日志清洗以及pageview和visit模型 代码及思路
在清洗数据得到 pageview 表和 visit 表的对象数据之前,我们先了解一下
weblogbean、pageviewbean、visitbean 三个对象里所有的(或者说所需要的)属性数据。
简单描述一下:pageview 的核心属性 和visit核心属性
pageview: sessionid / ip / url / 停留的时长 /第几步
visit: sessionid / 起始时间 / 结束时间 / 进入的页面 / 离开的页面 / 访问的页面数 / ip / referal
具体如下:
weblogbean
private boolean valid = true;// 判断数据是否合法
private String remote_addr;// 记录客户端的ip地址
private String remote_user;// 记录客户端用户名称,忽略属性"-"
private String time_local;// 记录访问时间与时区
private String request;// 记录请求的url与http协议
private String status;// 记录请求状态;成功是200
private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
private String http_referer;// 用来记录从哪个页面链接访问过来的
private String http_user_agent;// 记录客户浏览器的相关信息
pageviewbean
private String session; //sessionid
private String remote_addr;
private String timestr;
private String request;
private int step;
private String staylong;
private String referal;
private String useragent;
private String bytes_send;
private String status;
visitbean
private String session;
private String remote_addr;
private String inTime;
private String outTime;
private String inPage;
private String outPage;
private String referal;
private int pageVisits;
下面我们分两步进行清洗计算,分别得到 pageview 和 visit 所需要的数据。
一 清洗计算得到pageview数据
1.PageViewsBean
package cn.weblog2;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class PageViewsBean implements Writable {
// Session identifier this page view belongs to (the article labels it "sessionid").
private String session;
// NOTE(review): presumably the client IP address (the article lists "ip" as a core pageview attribute) — confirm.
private String remote_addr;
// NOTE(review): presumably the access time kept as text; the format is not shown here — confirm.
private String timestr;
// NOTE(review): presumably the requested URL (the article lists "url" as a core pageview attribute) — confirm.
private String request;
// Which step this page view is (the article describes it as "which step"); presumably its ordinal within the session.
private int step;
// How long the visitor stayed (the article's "stay duration"), kept as text; the unit is not shown here.
private String staylong;
// NOTE(review): presumably the referring page ("referal" is the spelling used throughout) — confirm.
private String referal;
// NOTE(review): presumably the client's user-agent string — confirm.
private String useragent;
// NOTE(review): presumably the number of body bytes sent to the client, kept as text — confirm.
private String bytes_send;
// NOTE(review): presumably the HTTP response status, kept as text — confirm.
private String status;
/**
 * Populates every field of this bean in a single call.
 *
 * <p>Note that the parameter order differs from the field declaration order:
 * {@code useragent} is the third argument, directly after {@code remote_addr}.
 *
 * @param session     value for the {@code session} field (the session id)
 * @param remote_addr value for the {@code remote_addr} field
 * @param useragent   value for the {@code useragent} field
 * @param timestr     value for the {@code timestr} field
 * @param request     value for the {@code request} field
 * @param step        value for the {@code step} field
 * @param staylong    value for the {@code staylong} field
 * @param referal     value for the {@code referal} field
 * @param bytes_send  value for the {@code bytes_send} field
 * @param status      value for the {@code status} field
 */
public void set(String session, String remote_addr, String useragent, String timestr, String request, int step, String staylong, String referal, String bytes_send, String status) {
    // Assignments are independent of one another; listed here in field declaration order.
    this.session = session;
    this.remote_addr = remote_addr;
    this.timestr = timestr;
    this.request = request;
    this.step = step;
    this.staylong = staylong;
    this.referal = referal;
    this.useragent = useragent;
    this.bytes_send = bytes_send;
    this.status = status;
}
/**
 * Returns the session identifier of this page view.
 *
 * @return the current value of the {@code session} field
 */
public String getSession() {
    return this.session;
}
/**
 * Sets the session identifier of this page view.
 *
 * @param session the new value for the {@code session} field
 */
public void setSession(String session) {
this.session = session;
}
/**
 * Returns the {@code remote_addr} value (presumably the client IP address).
 *
 * @return the current value of the {@code remote_addr} field
 */
public String getRemote_addr() {
    return this.remote_addr;
}
/**
 * Sets the {@code remote_addr} value (presumably the client IP address).
 *
 * @param remote_addr the new value for the {@code remote_addr} field
 */
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
/**
 * Returns the {@code timestr} value (presumably the access time as text).
 *
 * @return the current value of the {@code timestr} field
 */
public String getTimestr() {
    return this.timestr;
}
/**
 * Sets the {@code timestr} value (presumably the access time as text).
 *
 * @param timestr the new value for the {@code timestr} field
 */
public void setTimestr(String timestr) {
this.timestr = timestr;
}
/**
 * Returns the {@code request} value (presumably the requested URL).
 *
 * @return the current value of the {@code request} field
 */
public String getRequest() {
    return this.request;
}
/**
 * Sets the {@code request} value (presumably the requested URL).
 *
 * @param request the new value for the {@code request} field
 */
public void setRequest(String request) {
this.request = request;
}
/**
 * Returns which step this page view is.
 *
 * @return the current value of the {@code step} field
 */
public int getStep() {
    return this.step;
}
/**
 * Sets which step this page view is.
 *
 * @param step the new value for the {@code step} field
 */
public void setStep(int step) {
this.step = step;
}
/**
 * Returns the stay duration recorded for this page view, as text.
 *
 * @return the current value of the {@code staylong} field
 */
public String getStaylong() {
    return this.staylong;
}
/**
 * Sets the stay duration recorded for this page view, as text.
 *
 * @param staylong the new value for the {@code staylong} field
 */
public void setStaylong(String staylong) {
this.staylong = staylong;
}
/**
 * Returns the {@code referal} value (presumably the referring page).
 *
 * @return the current value of the {@code referal} field
 */
public String getReferal() {
    return this.referal;
}
/**
 * Sets the {@code referal} value (presumably the referring page).
 *
 * @param referal the new value for the {@code referal} field
 */
public void setReferal(String referal) {
this.referal = referal;
}
/**
 * Returns the {@code useragent} value (presumably the client's user-agent string).
 *
 * @return the current value of the {@code useragent} field
 */
public String getUseragent() {
    return this.useragent;
}
/**
 * Sets the {@code useragent} value (presumably the client's user-agent string).
 *
 * @param useragent the new value for the {@code useragent} field
 */
public void setUseragent(String useragent) {
this.useragent = useragent;
}
/**
 * Returns the {@code bytes_send} value (presumably the number of bytes sent, as text).
 *
 * @return the current value of the {@code bytes_send} field
 */
public String getBytes_send() {
    return this.bytes_send;
}
/**
 * Sets the {@code bytes_send} value (presumably the number of bytes sent, as text).
 *
 * @param bytes_send the new value for the {@code bytes_send} field
 */
public void setBytes_send(String bytes_send) {
this.bytes_send = bytes_send;
}
/**
 * Returns the {@code status} value (presumably the HTTP response status, as text).
 *
 * @return the current value of the {@code status} field
 */
public String getStatus() {
    return this.status;
}
/**
 * Sets the {@code status} value (presumably the HTTP response status, as text).
 *
 * @param status the new value for the {@code status} field
 */
public void setStatus(String status) {
this.status = status;
}
public void readFields(DataInput in) throws IOException {
this.session = in.r