Web日志流处理的MapReduce程序 -- 两个(一个使用Collections排序 一个使用MapReduce本身的排序)

我的这两个项目代码地址:
Collections排序:
https://gitee.com/tanghongping/web_click_mr_hve
MapReduce排序:
https://gitee.com/tanghongping/MapReduceTest

这两个项目里面会有一些测试的代码,可以忽略。

使用Collections.sort排序

WeblogBean

package com.thp.bigdata.webClick.mrBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Bean for the layer that interfaces with external data; its structure is best kept
 * consistent with the external data source.
 *
 * <p>Serialization order in {@link #write(DataOutput)} and {@link #readFields(DataInput)}
 * is: valid, remote_addr, remote_user, time_local, request, status, body_bytes_sent,
 * http_referer, http_user_agent.
 *
 * @author 汤小萌
 */
public class WeblogBean implements Writable {

	/** Separator placed between fields by {@link #toString()}. */
	private static final String SEP = "\001";

	/** Whether the record is considered valid. */
	private boolean valid = true;
	/** Client IP address. */
	private String remote_addr;
	/** Client user name; the value "-" means the attribute is ignored. */
	private String remote_user;
	/** Access time and time zone. */
	private String time_local;
	/** Requested URL and HTTP protocol. */
	private String request;
	/** Request status; 200 means success. */
	private String status;
	/** Size of the body sent to the client. */
	private String body_bytes_sent;
	/** Link the user came from. */
	private String http_referer;
	/** Information about the client browser. */
	private String http_user_agent;

	/**
	 * Assigns every field of this bean in one call.
	 */
	public void set(boolean valid, String remote_addr, String remote_user, String time_local, String request,
			String status, String body_bytes_sent, String http_referer, String http_user_agent) {
		this.valid = valid;
		this.remote_addr = remote_addr;
		this.remote_user = remote_user;
		this.time_local = time_local;
		this.request = request;
		this.status = status;
		this.body_bytes_sent = body_bytes_sent;
		this.http_referer = http_referer;
		this.http_user_agent = http_user_agent;
	}

	public boolean isValid() {
		return this.valid;
	}

	public void setValid(boolean valid) {
		this.valid = valid;
	}

	public String getRemote_addr() {
		return this.remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getRemote_user() {
		return this.remote_user;
	}

	public void setRemote_user(String remote_user) {
		this.remote_user = remote_user;
	}

	public String getTime_local() {
		return this.time_local;
	}

	public void setTime_local(String time_local) {
		this.time_local = time_local;
	}

	public String getRequest() {
		return this.request;
	}

	public void setRequest(String request) {
		this.request = request;
	}

	public String getStatus() {
		return this.status;
	}

	public void setStatus(String status) {
		this.status = status;
	}

	public String getBody_bytes_sent() {
		return this.body_bytes_sent;
	}

	public void setBody_bytes_sent(String body_bytes_sent) {
		this.body_bytes_sent = body_bytes_sent;
	}

	public String getHttp_referer() {
		return this.http_referer;
	}

	public void setHttp_referer(String http_referer) {
		this.http_referer = http_referer;
	}

	public String getHttp_user_agent() {
		return this.http_user_agent;
	}

	public void setHttp_user_agent(String http_user_agent) {
		this.http_user_agent = http_user_agent;
	}

	/**
	 * Writes the validity flag followed by the eight string fields.
	 * A null string field is written as the empty string.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeBoolean(valid);
		out.writeUTF(orEmpty(remote_addr));
		out.writeUTF(orEmpty(remote_user));
		out.writeUTF(orEmpty(time_local));
		out.writeUTF(orEmpty(request));
		out.writeUTF(orEmpty(status));
		out.writeUTF(orEmpty(body_bytes_sent));
		out.writeUTF(orEmpty(http_referer));
		out.writeUTF(orEmpty(http_user_agent));
	}

	/**
	 * Reads the fields back in exactly the order {@link #write(DataOutput)} emits them.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		valid = in.readBoolean();
		remote_addr = in.readUTF();
		remote_user = in.readUTF();
		time_local = in.readUTF();
		request = in.readUTF();
		status = in.readUTF();
		body_bytes_sent = in.readUTF();
		http_referer = in.readUTF();
		http_user_agent = in.readUTF();
	}

	/**
	 * Renders all fields, validity flag first, joined by the {@code \001} character.
	 * Null fields appear as the text {@code null}.
	 */
	@Override
	public String toString() {
		return valid
				+ SEP + getRemote_addr()
				+ SEP + getRemote_user()
				+ SEP + getTime_local()
				+ SEP + getRequest()
				+ SEP + getStatus()
				+ SEP + getBody_bytes_sent()
				+ SEP + getHttp_referer()
				+ SEP + getHttp_user_agent();
	}

	/** Returns the empty string for null, otherwise the value unchanged. */
	private static String orEmpty(String value) {
		if (value == null) {
			return "";
		}
		return value;
	}

}

PageViewsBean

package com.thp.bigdata.webClick.mrBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Writable bean carrying one page-view record.
 *
 * <p>Serialization order in {@link #write(DataOutput)} and {@link #readFields(DataInput)}
 * is: session, remote_addr, timeStr, request, step, staylong, referal, useragent,
 * bytes_send, status.
 *
 * @author 汤小萌
 */
public class PageViewsBean implements Writable {
	
	private String session;			// session id
	private String remote_addr;		// client IP address
	private String timeStr;			// access time
	private String request;			// requested URL
	private int step;				// which step of the visit this is
	private String staylong;		// time stayed
	private String referal;			// page the user came from
	private String useragent;		// browser-related information
	private String bytes_send;		// number of bytes sent
	private String status;			// status of this request
	
	
	
	/**
	 * Assigns every field of this bean in one call.
	 * Note that {@code useragent} is the third parameter, ahead of {@code timeStr}.
	 */
	public void set(String session, String remote_addr, String useragent, String timeStr, String request, int step, String staylong, String referal, String bytes_send, String status) {
		this.session = session;
		this.remote_addr = remote_addr;
		this.useragent = useragent;
		this.timeStr = timeStr;
		this.request = request;
		this.step = step;
		this.staylong = staylong;
		this.referal = referal;
		this.bytes_send = bytes_send;
		this.status = status;
	}
	
	public String getSession() {
		return session;
	}
	public void setSession(String session) {
		this.session = session;
	}
	public String getRemote_addr() {
		return remote_addr;
	}
	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}
	public String getTimeStr() {
		return timeStr;
	}
	public void setTimeStr(String timeStr) {
		this.timeStr = timeStr;
	}
	public String getRequest() {
		return request;
	}
	public void setRequest(String request) {
		this.request = request;
	}
	public int getStep() {
		return step;
	}
	public void setStep(int step) {
		this.step = step;
	}
	public String getStaylong() {
		return staylong;
	}
	public void setStaylong(String staylong) {
		this.staylong = staylong;
	}
	public String getReferal() {
		return referal;
	}
	public void setReferal(String referal) {
		this.referal = referal;
	}
	public String getUseragent() {
		return useragent;
	}
	public void setUseragent(String useragent) {
		this.useragent = useragent;
	}
	public String getBytes_send() {
		return bytes_send;
	}
	public void setBytes_send(String bytes_send) {
		this.bytes_send = bytes_send;
	}
	public String getStatus() {
		return status;
	}
	public void setStatus(String status) {
		this.status = status;
	}

	/**
	 * Serializes all fields. A null string field is written as the empty string,
	 * because {@code DataOutput.writeUTF} throws {@link NullPointerException} on null.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(nullToEmpty(session));
		out.writeUTF(nullToEmpty(remote_addr));
		out.writeUTF(nullToEmpty(timeStr));
		out.writeUTF(nullToEmpty(request));
		out.writeInt(step);
		out.writeUTF(nullToEmpty(staylong));
		out.writeUTF(nullToEmpty(referal));
		out.writeUTF(nullToEmpty(useragent));
		out.writeUTF(nullToEmpty(bytes_send));
		out.writeUTF(nullToEmpty(status));
	}

	/**
	 * Reads the fields back in exactly the order {@link #write(DataOutput)} emits them.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.session = in.readUTF();
		this.remote_addr = in.readUTF();
		this.timeStr = in.readUTF();
		this.request = in.readUTF();
		this.step = in.readInt();
		this.staylong = in.readUTF();
		this.referal = in.readUTF();
		this.useragent = in.readUTF();
		this.bytes_send = in.readUTF();
		this.status = in.readUTF();
	}
	
	/** Returns the empty string for null, otherwise the value unchanged. */
	private static String nullToEmpty(String value) {
		return null == value ? "" : value;
	}
	
}

VisitBean

package com.thp.bigdata.webClick.mrBean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Writable bean carrying one visit record.
 *
 * <p>Serialization order in {@link #write(DataOutput)} and {@link #readFields(DataInput)}
 * is: session, remote_addr, inTime, outTime, inPage, outPage, referal, pageVisits.
 *
 * @author 汤小萌
 */
public class VisitBean implements Writable {
	private String session;
	private String remote_addr;
	private String inTime;
	private String outTime;
	private String inPage;
	private String outPage;
	private String referal;
	private int pageVisits;
	
	/**
	 * Assigns every field of this bean in one call.
	 */
	public void set(String session, String remote_addr, String inTime, String outTime, String inPage, String outPage, String referal, int pageVisits) {
		this.session = session;
		this.remote_addr = remote_addr;
		this.inTime = inTime;
		this.outTime = outTime;
		this.inPage = inPage;
		this.outPage = outPage;
		this.referal = referal;
		this.pageVisits = pageVisits;
	}

	public String getSession() {
		return session;
	}

	public void setSession(String session) {
		this.session = session;
	}

	public String getRemote_addr() {
		return remote_addr;
	}

	public void setRemote_addr(String remote_addr) {
		this.remote_addr = remote_addr;
	}

	public String getInTime() {
		return inTime;
	}

	public void setInTime(String inTime) {
		this.inTime = inTime;
	}

	public String getOutTime() {
		return outTime;
	}

	public void setOutTime(String outTime) {
		this.outTime = outTime;
	}

	public String getInPage() {
		return inPage;
	}

	public void setInPage(String inPage) {
		this.inPage = inPage;
	}

	public String getOutPage() {
		return outPage;
	}

	public void setOutPage(String outPage) {
		this.outPage = outPage;
	}

	public String getReferal() {
		return referal;
	}

	public void setReferal(String referal) {
		this.referal = referal;
	}

	public int getPageVisits() {
		return pageVisits;
	}

	public void setPageVisits(int pageVisits) {
		this.pageVisits = pageVisits;
	}

	/**
	 * Serializes all fields. A null string field is written as the empty string,
	 * because {@code DataOutput.writeUTF} throws {@link NullPointerException} on null.
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(nullToEmpty(session));
		out.writeUTF(nullToEmpty(remote_addr));
		out.writeUTF(nullToEmpty(inTime));
		out.writeUTF(nullToEmpty(outTime));
		out.writeUTF(nullToEmpty(inPage));
		out.writeUTF(nullToEmpty(outPage));
		out.writeUTF(nullToEmpty(referal));
		out.writeInt(pageVisits);
	}

	/**
	 * Reads the fields back in exactly the order {@link #write(DataOutput)} emits them.
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.session = in.readUTF();
		this.remote_addr = in.readUTF();
		this.inTime = in.readUTF();
		this.outTime = in.readUTF();
		this.inPage = in.readUTF();
		this.outPage = in.readUTF();
		this.referal = in.readUTF();
		this.pageVisits = in.readInt();
	}
	
	/**
	 * Renders all fields joined by the {@code \001} character, in declaration order.
	 * Null fields appear as the text {@code null}.
	 */
	@Override
	public String toString() {
		return session + "\001" + remote_addr + "\001" + inTime + "\001" +
				outTime + "\001" + inPage + "\001" + outPage + "\001" + referal + "\001" + pageVisits;
	}

	/** Returns the empty string for null, otherwise the value unchanged. */
	private static String nullToEmpty(String value) {
		return null == value ? "" : value;
	}
	
}

预处理解析类

package com.thp.bigdata.webClick.mrBean;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import org.junit.Test;

/**
 * 对加载进来的数据进行 
 * @author 汤小萌
 *
 */
public class WeblogParser {
	
	
	
	/**
	 *  0 ) 194.237.142.21
		1 ) -
		2 ) -
		3 ) [18/Sep/2013:06:49:18
		4 ) +0000]
		5 ) "GET
		6 ) /wp-content/uploads/2013/07/rstudio-git3.png
		7 ) HTTP/1.1"
		8 ) 304
		9 ) 0
		10 ) "-"
		11 ) "Mozilla/4.0
		12 ) (compatible;)"
	 * @param line
	 * @return
	 */
	public static WeblogBean parser(String line) {
		WeblogBean weblogBean = new WeblogBean();
		String[] arr = line.split(" ");
		if(arr.length >11) {
			weblogBean.setRemote_addr(arr[0]);
			weblogBean.setRemote_user(arr[1]);
			String time_local = formatDate(arr[3].substring(1));
			if(null == time_local) time_local = "-invalid_time-";
			weblogBean.setTime_local(time_local);
			weblogBean.setRequest(arr[6]);
			
			weblogBean.setStatus(arr[8]);
			weblogBean.setBody_bytes_sent(arr[9]);
			weblogBean.setHttp_referer(arr[10]);
			
			// 如果useragent元素较多,则拼接useragent
			
			if(arr.length > 12) {
				StringBuffer sb = new StringBuffer();
				for(int i = 11; i < arr.length; i++) {
					sb.append(arr[i]);
				}
				weblogBean.setHttp_user_agent(sb.toString());
			} else {
				weblogBean.setHttp_user_agent(arr[11]);
			}
			
			if(Integer.parseInt(weblogBean.getStatus()) >= 400) {  // 状态码 >=400 说明请求错误
				weblogBean.setValid(false);
			}
			
			if("-invalid_time-".equals(weblogBean.getTime_local())) {
				weblogBean.setValid(false);
			}
			
		} else {
			weblogBean.setValid(false);
		}
		r
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值