ETL的实现
ETL:Extract-Transform-Load,数据抽取-转换-加载过程。
■目 标:过滤无效数据,解析补全数据,格式化需求数据
■无效数据:缺少访客id,会话id,订单id等关键属性的;针对不同事件有不同的属性要求
■解析补全数据: 浏览器信息、操作系统信息、地域信息等
■格式化数据:时间日期、客户端信息等
■数据来源: 存储在HDFS上的用户行为数据
■数据存储: ETL后的数据存储位置: HDFS,供后期使用Hive分析做好准备
数据清洗日志内容
ETL代码实现
WebLogDriver:
/**
 * ETL driver: a map-only MapReduce job that cleans raw web access logs.
 * args[0] = HDFS input path of raw logs, args[1] = output path
 * (deleted first if it already exists).
 */
public class WebLogDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // 1. create the job
        Job job = Job.getInstance( this.getConf(), "WebLogDriver" );
        job.setJarByClass( WebLogDriver.class );
        // 2. configure the job
        // (2.1) input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath );
        // (2.2) map
        job.setMapperClass( WebLogMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( NullWritable.class );
        // This is a pure per-record ETL pass with no aggregation, so skip
        // the shuffle/reduce phase entirely. Without this, the default single
        // identity reducer funnels all mapper output through one task.
        job.setNumReduceTasks( 0 );
        // (2.5) output: remove the output directory if it already exists
        FileSystem hdfs = FileSystem.get( this.getConf() );
        Path outputPath = new Path( args[1] );
        if (hdfs.exists( outputPath )) {
            hdfs.delete( outputPath, true );
        }
        FileOutputFormat.setOutputPath( job, outputPath );
        // 3. submit and wait for completion
        boolean isSuccess = job.waitForCompletion( true );
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        // ToolRunner.run(Configuration conf, Tool tool, String[] args)
        try {
            int status = ToolRunner.run( configuration, new WebLogDriver(), args );
            System.exit( status );
        } catch (Exception e) {
            e.printStackTrace();
            // Propagate the failure to the caller instead of exiting 0 on error.
            System.exit( 1 );
        }
    }
}
WebLogMapper:
/**
 * Map-only ETL mapper: parses each raw access-log line into a WebLogBean,
 * applies the page whitelist filter, and emits the bean's delimited string.
 * NOTE(review): records that fail validation are still emitted (with the
 * valid flag set to false) — the downstream PageView job appears to filter
 * on that flag; confirm before changing.
 */
public class WebLogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private final LogParseUtil parser = new LogParseUtil();
    private final Set<String> allowedPages = new HashSet<String>();
    private final Text outputKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Whitelist of request paths we care about; anything else is marked invalid.
        String[] pages = {
                "/about",
                "/black-ip-list/",
                "/cassandra-clustor/",
                "/finance-rhive-repurchase/",
                "/hadoop-family-roadmap/",
                "/hadoop-hive-intro/",
                "/hadoop-zookeeper-intro/",
                "/hadoop-mahout-roadmap/"
        };
        for (String p : pages) {
            allowedPages.add( p );
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        WebLogBean logBean = parser.logParseHandle( value.toString() );
        parser.pageFilter( logBean, allowedPages );
        outputKey.set( logBean.toString() );
        context.write( outputKey, NullWritable.get() );
    }
}
LogParseUtil:
/**
 * Parses one line of a combined-format access log into a WebLogBean and
 * flags records as invalid when fields are missing or malformed.
 */
public class LogParseUtil {
    // NOTE: SimpleDateFormat is not thread-safe; each map task gets its own
    // LogParseUtil instance, so single-threaded use is assumed here.
    SimpleDateFormat df1 = new SimpleDateFormat( "dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH );
    SimpleDateFormat df2 = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss", Locale.ENGLISH );

    /**
     * Splits the raw line on single spaces and maps the positional tokens
     * onto the bean. Lines with fewer than 12 tokens are marked invalid.
     */
    public WebLogBean logParseHandle(String line) {
        String[] items = line.split( " " );
        WebLogBean webLogBean = new WebLogBean();
        if (items.length > 11) {
            if (StringUtils.isNotBlank( items[0] )) {
                webLogBean.setIp( items[0] );
            }
            webLogBean.setUser_id( items[2] );
            // Normalize "[18/Sep/2013:06:49:18" (leading '[' stripped) to
            // "2013-09-18 06:49:18".
            String s_time = dateDateFormat( items[3].substring( 1 ) );
            if (s_time == null) {
                s_time = "invalid_time";
                webLogBean.setValid( false );
            }
            webLogBean.setTime( s_time );
            // Requested URL
            webLogBean.setRequest( items[6] );
            webLogBean.setStatus( items[8] );
            // A malformed status token used to throw NumberFormatException and
            // kill the whole job; treat such a record as invalid instead.
            try {
                if (Integer.parseInt( items[8] ) > 400) {
                    webLogBean.setValid( false );
                }
            } catch (NumberFormatException e) {
                webLogBean.setValid( false );
            }
            webLogBean.setBody_size( items[9] );
            webLogBean.setHttp_ref( items[10] );
            // The User-Agent itself contains spaces, so rejoin the tail tokens.
            if (items.length > 12) {
                StringBuffer sb = new StringBuffer();
                for (int i = 11; i < items.length; i++) {
                    sb.append( items[i] ).append( " " );
                }
                webLogBean.setUser_agent( sb.toString() );
            } else {
                webLogBean.setUser_agent( items[11] );
            }
        } else {
            webLogBean.setValid( false );
        }
        return webLogBean;
    }

    /**
     * Converts the log's date format to "yyyy-MM-dd HH:mm:ss".
     * Returns null when the input cannot be parsed.
     */
    private String dateDateFormat(String substring) {
        String dataStr = null;
        Date date = null;
        try {
            date = df1.parse( substring );
            dataStr = df2.format( date );
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return dataStr;
    }

    /** Marks the record invalid when its request URL is not in the whitelist. */
    public void pageFilter(WebLogBean logBean, Set<String> page) {
        if (!page.contains( logBean.getRequest() )) {
            logBean.setValid( false );
        }
    }
}
WebLogBean:
public class WebLogBean implements Writable {
private boolean valid = true;
private String ip ;
private String user_id;
//请求的时间 2018-07-07 16:59:59
private String time;
//请求的URL
private String request;
//请求结果的状态
private String status;
private String body_size;
//前页面
private String http_ref;
//浏览器,操作系统等信息
private String user_agent;
public WebLogBean() {
}
public WebLogBean(boolean valid, String ip, String user_id, String time, String request, String status, String body_size, String http_ref, String user_agent) {
this.valid = valid;
this.ip = ip;
this.user_id = user_id;
this.time = time;
this.request = request;
this.status = status;
this.body_size = body_size;
this.http_ref = http_ref;
this.user_agent = user_agent;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeBoolean( valid );
out.writeUTF( ip );
out.writeUTF( user_id );
out.writeUTF( time );
out.writeUTF( request );
out.writeUTF( status );
out.writeUTF( body_size );
out.writeUTF( http_ref );
out.writeUTF( user_agent );
}
@Override
public void readFields(DataInput in) throws IOException {
this.valid = in.readBoolean();
this.ip = in.readUTF();
this.user_id = in.readUTF();
this.time = in.readUTF();
this.request = in.readUTF();
this.status = in.readUTF();
this.body_size = in.readUTF();
this.http_ref = in.readUTF();
this.user_agent = in.readUTF();
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append( this.valid )
.append( "\001" ).append( this.ip )
.append( "\001" ).append( this.user_id )
.append( "\001" ).append( this.time )
.append( "\001" ).append( this.request )
.append( "\001" ).append( this.status )
.append( "\001" ).append( this.body_size )
.append( "\001" ).append( this.http_ref )
.append( "\001" ).append( this.user_agent );
return sb.toString();
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
}
public String getUser_id() {
return user_id;
}
public void setUser_id(String user_id) {
this.user_id = user_id;
}
public String getTime() {
return time;
}
public void setTime(String time) {
this.time = time;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getBody_size() {
return body_size;
}
public void setBody_size(String body_size) {
this.body_size = body_size;
}
public String getHttp_ref() {
return http_ref;
}
public void setHttp_ref(String http_ref) {
this.http_ref = http_ref;
}
public String getUser_agent() {
return user_agent;
}
public void setUser_agent(String user_agent) {
this.user_agent = user_agent;
}
打成jar包然后上传到hadoop运行:
bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.webetl.etl.WebLogDriver /datas/tmp/access.log /flume/weblog/etl
一条数据:
false1.162.203.134-2013-09-18 13:47:35/images/my.jpg20019939"http://www.angularjs.cn/A0d9""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
PageView分析模型构建
■对清洗后数据针对每一个会话,进行进一步分析,构建pageview模型,在每一条记录
中添加session_id、停留时长、访问顺序等字段
步骤:
Map:
key value
ip 其他字段
reduce:
每一个IP调用一次Reduce方法
将所有的记录放到List
可以对所有的记录进行排序:访问时间排序
生成对应的字段
迭代整个List集合
如果list的长度为1,
直接输出结果
如果拿到的是第一条记录
i=0,跳过
循环到达第二个记录
i=1,
sessionId = UUID.randomUUID().toString();
step=1
length = list[i].getTime() - list[i-1].getTime()
判定:length >30分钟
sessionId重置
step=1
如果 i == list.length-1
输出上一条和当前条
output
PageViewDriver:
/**
 * Driver for the PageView model job: reads the ETL output, groups records by
 * ip in PageViewMapper, and sessionizes them in PageViewReducer.
 * args[0] = input path, args[1] = output path (removed first if present).
 */
public class PageViewDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance( getConf(), "PageViewDriver" );
        job.setJarByClass( PageViewDriver.class );

        // input
        FileInputFormat.setInputPaths( job, new Path( args[0] ) );

        // map: ip -> WebLogBean
        job.setMapperClass( PageViewMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( WebLogBean.class );

        // reduce: one PVBean per page view
        job.setReducerClass( PageViewReducer.class );
        job.setOutputKeyClass( PVBean.class );
        job.setOutputValueClass( NullWritable.class );

        // output: clear a pre-existing output directory before writing
        FileSystem hdfs = FileSystem.get( getConf() );
        Path outputPath = new Path( args[1] );
        if (hdfs.exists( outputPath )) {
            hdfs.delete( outputPath, true );
        }
        FileOutputFormat.setOutputPath( job, outputPath );

        // submit and block until the job finishes
        return job.waitForCompletion( true ) ? 0 : 1;
    }

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        try {
            System.exit( ToolRunner.run( configuration, new PageViewDriver(), args ) );
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
PageViewMapper:
/**
 * Re-parses the '\001'-delimited ETL output into WebLogBeans, keyed by ip so
 * the reducer sees all of one visitor's records together. Invalid records
 * (valid flag != "true") are counted and dropped.
 */
public class PageViewMapper extends Mapper<LongWritable, Text, Text, WebLogBean> {
    private final Text outputKey = new Text();
    private final WebLogBean outputValue = new WebLogBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws InterruptedException, IOException {
        String[] fields = value.toString().split( "\001" );
        // field 0 is the valid flag written by the ETL job; the rest follow
        // the WebLogBean.toString() field order
        outputValue.setAll( fields[0].equals( "true" ), fields[1], fields[2], fields[3],
                fields[4], fields[5], fields[6], fields[7], fields[8] );
        outputKey.set( outputValue.getIp() );
        context.getCounter( "user define group ", "ip count" ).increment( 1L );
        if (outputValue.isValid()) {
            context.getCounter( "user define group ", "valid count" ).increment( 1L );
            context.write( outputKey, outputValue );
        } else {
            context.getCounter( "user define group ", "invalid count" ).increment( 1L );
        }
    }
}
PageViewReducer:
// Sessionizes one visitor's (one ip's) page views: sorts them by time and
// assigns sessionId / step / dwell-time fields, starting a new session when
// the gap between consecutive views exceeds 30 minutes.
// NOTE(review): sessions are keyed by ip only — different users behind one ip
// would be merged; confirm this matches the intended model.
public class PageViewReducer extends Reducer<Text, WebLogBean, PVBean, NullWritable> {
private NullWritable outputValue = NullWritable.get();
// not thread-safe, but each reduce task has its own reducer instance
private SimpleDateFormat sdf = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss" );
// reused across writes; Hadoop serializes on context.write, so this is safe
private PVBean pvBean = new PVBean();
@Override
protected void reduce(Text key, Iterable<WebLogBean> values, Context context) throws IOException, InterruptedException {
ArrayList<WebLogBean> beans = new ArrayList<>();
// Hadoop reuses the value object across iterations, so each element must be
// deep-copied before being stored in the list.
for (WebLogBean bean:values) {
WebLogBean bean_copy = new WebLogBean();
try {
BeanUtils.copyProperties( bean_copy ,bean);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
beans.add( bean_copy );
}
// sort chronologically so consecutive records are consecutive page views
Collections.sort( beans,new Comparator<WebLogBean>() {
@Override
public int compare(WebLogBean o1, WebLogBean o2) {
Date date1 = myToDate(o1.getTime());
Date date2 = myToDate(o2.getTime());
return date1.compareTo( date2 );
}
} );
// derived fields: sessionId, dwell time, step (visit order in session)
String sessionId = UUID.randomUUID().toString();
int step = 1;
// single record: emit it with a default dwell time of 60 seconds
if(beans.size() == 1){
WebLogBean b = beans.get( 0 );
pvBean.setAll( sessionId,b.getIp(),b.getTime(),b.getRequest(),step,"60",b.getHttp_ref(),b.getUser_agent(),b.getBody_size(),b.getStatus());
context.write( pvBean,outputValue);
return;
}
// the loop below only runs when there are at least 2 records; each iteration
// emits record i-1 using the gap to record i as its dwell time
for (int i = 0; i < beans.size(); i++) {
if(i==0){
continue;
}
// gap between record i-1 and record i, in seconds
long length = timeDiff(beans.get( i ).getTime(),beans.get( i-1 ).getTime())/1000;
if(length < 30 * 60 ){
// same session: emit the previous record and advance the step counter
WebLogBean b = beans.get( i-1 );
pvBean.setAll( sessionId,b.getIp(),b.getTime(),b.getRequest(),step,length +"",b.getHttp_ref(),
b.getUser_agent(),b.getBody_size(),b.getStatus());
context.write( pvBean,outputValue);
step++;
}else {
// gap spans two sessions: emit the previous record as the last of its
// session (its dwell time is still the full gap)
WebLogBean b = beans.get( i-1 );
pvBean.setAll( sessionId,b.getIp(),b.getTime(),b.getRequest(),step,length +"",b.getHttp_ref(),
b.getUser_agent(),b.getBody_size(),b.getStatus());
context.write( pvBean,outputValue);
// start a fresh session for record i
sessionId = UUID.randomUUID().toString();
step =1;
}
// the last record has no successor; emit it with a default 60s dwell time
if(i == beans.size() -1){
WebLogBean b = beans.get( i );
pvBean.setAll( sessionId,b.getIp(),b.getTime(),b.getRequest(),step,"60",b.getHttp_ref(),
b.getUser_agent(),b.getBody_size(),b.getStatus());
context.write( pvBean,outputValue);
}
}
}
// difference time - time1 in milliseconds
private long timeDiff(String time, String time1) {
return this.myToDate( time ).getTime() - this.myToDate( time1 ).getTime();
}
// parses "yyyy-MM-dd HH:mm:ss"; returns null on failure, which would NPE in
// the comparator above — upstream ETL replaces bad times with "invalid_time"
// and marks them invalid, so valid records reaching here should parse
private Date myToDate(String time) {
Date date = null;
try {
date = sdf.parse( time );
} catch (ParseException e) {
e.printStackTrace();
}
return date;
}
}
WebLogBean:
/**
 * PageView-stage variant of the web-log bean, used as a map output value.
 * Serialized via Hadoop's Writable interface; toString() renders the record
 * as '\001'-delimited text.
 *
 * Fix: the class implemented the raw type {@code WritableComparable}; it is
 * now parameterized as {@code WritableComparable<WebLogBean>} so compareTo is
 * type-checked (the compiler-generated bridge method keeps binary
 * compatibility with existing callers).
 */
public class WebLogBean implements WritableComparable<WebLogBean> {
    // false once any validation rule fails upstream
    private boolean valid = true;
    private String ip;
    private String user_id;
    // request time, normalized to "yyyy-MM-dd HH:mm:ss", e.g. 2018-07-07 16:59:59
    private String time;
    // requested URL
    private String request;
    // HTTP response status code (kept as a string)
    private String status;
    private String body_size;
    // referrer page
    private String http_ref;
    // raw User-Agent string (browser, operating system, ...)
    private String user_agent;

    public WebLogBean() {
    }

    /** Populates all fields at once; used by PageViewMapper when re-parsing ETL output. */
    public void setAll(boolean valid, String ip, String user_id, String time, String request, String status, String body_size, String http_ref, String user_agent) {
        this.valid = valid;
        this.ip = ip;
        this.user_id = user_id;
        this.time = time;
        this.request = request;
        this.status = status;
        this.body_size = body_size;
        this.http_ref = http_ref;
        this.user_agent = user_agent;
    }

    /**
     * Reverse lexicographic comparison of the '\001'-joined representation
     * (kept as in the original). This bean is used as a map output VALUE in
     * the PageView job, so this ordering is not relied on for the shuffle.
     */
    @Override
    public int compareTo(WebLogBean o) {
        return o.toString().compareTo( this.toString() );
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean( valid );
        out.writeUTF( ip );
        out.writeUTF( user_id );
        out.writeUTF( time );
        out.writeUTF( request );
        out.writeUTF( status );
        out.writeUTF( body_size );
        out.writeUTF( http_ref );
        out.writeUTF( user_agent );
    }

    // Fields must be read in exactly the order write() emits them.
    @Override
    public void readFields(DataInput in) throws IOException {
        this.valid = in.readBoolean();
        this.ip = in.readUTF();
        this.user_id = in.readUTF();
        this.time = in.readUTF();
        this.request = in.readUTF();
        this.status = in.readUTF();
        this.body_size = in.readUTF();
        this.http_ref = in.readUTF();
        this.user_agent = in.readUTF();
    }

    /** '\001'-delimited line in the same field order as serialization. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append( this.valid )
                .append( "\001" ).append( this.ip )
                .append( "\001" ).append( this.user_id )
                .append( "\001" ).append( this.time )
                .append( "\001" ).append( this.request )
                .append( "\001" ).append( this.status )
                .append( "\001" ).append( this.body_size )
                .append( "\001" ).append( this.http_ref )
                .append( "\001" ).append( this.user_agent );
        return sb.toString();
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getUser_id() {
        return user_id;
    }

    public void setUser_id(String user_id) {
        this.user_id = user_id;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_size() {
        return body_size;
    }

    public void setBody_size(String body_size) {
        this.body_size = body_size;
    }

    public String getHttp_ref() {
        return http_ref;
    }

    public void setHttp_ref(String http_ref) {
        this.http_ref = http_ref;
    }

    public String getUser_agent() {
        return user_agent;
    }

    public void setUser_agent(String user_agent) {
        this.user_agent = user_agent;
    }
}
PVBean:
// One page-view record produced by the PageView job: a WebLogBean enriched
// with sessionId, step (visit order within the session), and dwell time.
// Emitted as the output key; toString() is what lands in the output files.
public class PVBean implements Writable {
// session id (a random UUID assigned by PageViewReducer)
private String session;
private String ip;
// page-view time, "yyyy-MM-dd HH:mm:ss"
private String time;
// requested URL
private String request;
// visit order within the session, starting at 1
private int step;
// dwell time in seconds, as a string ("60" default for the last/only view)
private String length;
// referrer page
private String http_ref;
// raw User-Agent string
private String user_agent;
private String body_size;
// HTTP response status code
private String status;
// Populates all fields at once; used by the PageView/Visit mappers and reducer.
public void setAll(String session,String ip,String time,String request,int step,String length
,String http_ref,String user_agent,String body_size,String status){
this.setSession(session);
this.setIp(ip);
this.setTime(time);
this.setRequest(request);
this.setStep(step);
this.setLength(length);
this.setHttp_ref(http_ref);
this.setUser_agent(user_agent);
this.setBody_size(body_size);
this.setStatus(status);
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(session);
out.writeUTF(ip);
out.writeUTF(time);
out.writeUTF(request);
out.writeInt(step);
out.writeUTF(length);
out.writeUTF(http_ref);
out.writeUTF(user_agent);
out.writeUTF(body_size);
out.writeUTF(status);
}
// Fields must be read in exactly the order write() emits them.
@Override
public void readFields(DataInput in) throws IOException {
this.session = in.readUTF();
this.ip = in.readUTF();
this.time = in.readUTF();
this.request = in.readUTF();
this.step = in.readInt();
this.length = in.readUTF();
this.http_ref = in.readUTF();
this.user_agent = in.readUTF();
this.body_size = in.readUTF();
this.status = in.readUTF();
}
// '\001'-delimited line (Hive's default field delimiter), in the same field
// order as serialization; this is the on-disk format of the PageView output.
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append( session )
.append( "\001" ).append( ip )
.append( "\001" ).append( time )
.append( "\001" ).append( request )
.append( "\001" ).append( step )
.append( "\001" ).append( length )
.append( "\001" ).append( http_ref )
.append( "\001" ).append( user_agent )
.append( "\001" ).append( body_size )
.append( "\001" ).append( status );
return sb.toString();
}
public String getSession() {
return session;
}
public void setSession(String session) {
this.session = session;
}
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
}
public String getTime() {
return time;
}
public void setTime(String time) {
this.time = time;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public int getStep() {
return step;
}
public void setStep(int step) {
this.step = step;
}
public String getLength() {
return length;
}
public void setLength(String length) {
this.length = length;
}
public String getHttp_ref() {
return http_ref;
}
public void setHttp_ref(String http_ref) {
this.http_ref = http_ref;
}
public String getUser_agent() {
return user_agent;
}
public void setUser_agent(String user_agent) {
this.user_agent = user_agent;
}
public String getBody_size() {
return body_size;
}
public void setBody_size(String body_size) {
this.body_size = body_size;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
}
bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.weblogAnalytic.pageview.PageViewDriver /flume/weblog/etl /flume/weblog/pv
pv统计在etl的数据的基础上再进行操作。
530e77bc-0ec5-4871-bfe3-35bcfc296cc9111.161.17.1042013-09-18 12:17:25/hadoop-hive-intro/160"http://blog.fens.me/series-hadoop-cloud/""Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36" 14763200
Visit分析模型构建:
对清洗后数据针对每一个会话,进行分组,统计每个session的起始时间、结束时间、进入页面、离开页面、访问页数、来源页面等信息
VisitViewDriver:
/**
 * Driver for the Visit model job: reads the PageView output, groups page
 * views by session id in VisitViewMapper, and aggregates each session into
 * one VisitBean in VisitViewReducer.
 * args[0] = input path, args[1] = output path (removed first if present).
 */
public class VisitViewDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance( getConf(), "VisitViewDriver" );
        job.setJarByClass( VisitViewDriver.class );

        // input
        FileInputFormat.setInputPaths( job, new Path( args[0] ) );

        // map: sessionId -> PVBean
        job.setMapperClass( VisitViewMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( PVBean.class );

        // reduce: one VisitBean per session
        job.setReducerClass( VisitViewReducer.class );
        job.setOutputKeyClass( VisitBean.class );
        job.setOutputValueClass( NullWritable.class );

        // output: clear a pre-existing output directory before writing
        FileSystem hdfs = FileSystem.get( getConf() );
        Path outputPath = new Path( args[1] );
        if (hdfs.exists( outputPath )) {
            hdfs.delete( outputPath, true );
        }
        FileOutputFormat.setOutputPath( job, outputPath );

        // submit and block until the job finishes
        return job.waitForCompletion( true ) ? 0 : 1;
    }

    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        try {
            System.exit( ToolRunner.run( configuration, new VisitViewDriver(), args ) );
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
VisitViewMapper:
/**
 * Re-parses the '\001'-delimited PageView output into PVBeans, keyed by
 * session id so the reducer can collapse each session into one visit.
 */
public class VisitViewMapper extends Mapper<LongWritable, Text, Text, PVBean> {
    private final Text outputKey = new Text();
    private final PVBean outputValue = new PVBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split( "\001" );
        // field 0 is the session id — both the grouping key and part of the value
        outputKey.set( fields[0] );
        outputValue.setAll( fields[0], fields[1], fields[2], fields[3],
                Integer.parseInt( fields[4] ), fields[5], fields[6], fields[7], fields[8], fields[9] );
        context.write( outputKey, outputValue );
    }
}
VisitViewReducer:
/**
 * Collapses all page views of one session into a single VisitBean:
 * entry/exit time and page, page count, referrer, and ip.
 */
public class VisitViewReducer extends Reducer<Text, PVBean, VisitBean, NullWritable> {
    private VisitBean outputKey = new VisitBean();
    private NullWritable outputValue = NullWritable.get();

    @Override
    protected void reduce(Text key, Iterable<PVBean> values, Context context) throws IOException, InterruptedException {
        ArrayList<PVBean> beans = new ArrayList<>();
        // Hadoop reuses the value object across iterations, so each element
        // must be deep-copied before being stored in the list.
        for (PVBean bean : values) {
            PVBean bean_copy = new PVBean();
            try {
                BeanUtils.copyProperties( bean_copy, bean );
            } catch (IllegalAccessException | InvocationTargetException e) {
                e.printStackTrace();
            }
            beans.add( bean_copy );
        }
        // Sort by step (visit order within the session).
        // Fix: the old comparator returned only 1 or -1 — never 0 for equal
        // steps — violating the Comparator contract and risking
        // "Comparison method violates its general contract!" from TimSort.
        Collections.sort( beans, new Comparator<PVBean>() {
            @Override
            public int compare(PVBean o1, PVBean o2) {
                return Integer.compare( o1.getStep(), o2.getStep() );
            }
        } );
        // first element = session entry, last element = session exit
        outputKey.setSession( key.toString() );
        outputKey.setInTime( beans.get( 0 ).getTime() );
        outputKey.setInPage( beans.get( 0 ).getRequest() );
        outputKey.setOutTime( beans.get( beans.size() - 1 ).getTime() );
        outputKey.setOutPage( beans.get( beans.size() - 1 ).getRequest() );
        outputKey.setPageNum( beans.size() );
        outputKey.setHttp_ref( beans.get( 0 ).getHttp_ref() );
        outputKey.setIp( beans.get( 0 ).getIp() );
        context.write( outputKey, outputValue );
    }
}
VisitBean
// One visit (session) record: aggregate of a session's page views with entry
// and exit time/page, page count, referrer, and ip. Output key of the Visit
// job; toString() is what lands in the output files.
public class VisitBean implements Writable {
// session id carried over from the PageView output
private String session;
// time of the first page view in the session
private String inTime;
// time of the last page view in the session
private String outTime;
// first page visited (entry page)
private String inPage;
// last page visited (exit page)
private String outPage;
// number of page views in the session
private int pageNum;
private String ip;
// referrer of the first page view (traffic source)
private String http_ref;
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF( session );
out.writeUTF( inTime );
out.writeUTF( outTime );
out.writeUTF( inPage );
out.writeUTF( outPage );
out.writeUTF( ip );
out.writeUTF( http_ref );
out.writeInt( pageNum );
}
// Fields must be read in exactly the order write() emits them.
@Override
public void readFields(DataInput in) throws IOException {
this.session = in.readUTF();
this.inTime = in.readUTF();
this.outTime = in.readUTF();
this.inPage = in.readUTF();
this.outPage = in.readUTF();
this.ip = in.readUTF();
this.http_ref = in.readUTF();
this.pageNum = in.readInt();
}
// '\001'-delimited output line; NOTE(review): the field order here differs
// from the serialization order above (ip comes second) — harmless, but the
// Hive table schema must follow THIS order.
@Override
public String toString() {
return session + "\001" + ip + "\001" + inTime + "\001" + outTime + "\001" + inPage
+ "\001" + outPage + "\001" + http_ref + "\001" + pageNum;
}
public String getSession() {
return session;
}
public void setSession(String session) {
this.session = session;
}
public String getInTime() {
return inTime;
}
public void setInTime(String inTime) {
this.inTime = inTime;
}
public String getOutTime() {
return outTime;
}
public void setOutTime(String outTime) {
this.outTime = outTime;
}
public String getInPage() {
return inPage;
}
public void setInPage(String inPage) {
this.inPage = inPage;
}
public String getOutPage() {
return outPage;
}
public void setOutPage(String outPage) {
this.outPage = outPage;
}
public int getPageNum() {
return pageNum;
}
public void setPageNum(int pageNum) {
this.pageNum = pageNum;
}
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
}
public String getHttp_ref() {
return http_ref;
}
public void setHttp_ref(String http_ref) {
this.http_ref = http_ref;
}
}
bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.weblogAnalytic.visitview.VisitViewDriver /flume/weblog/pv /flume/weblog/vi
分析后的数据:
022d82f3-7b70-43ce-bbd9-6e0eaadc420271.206.247.972013-09-19 03:39:572013-09-19 03:39:57/hadoop-mahout-roadmap//hadoop-mahout-roadmap/"http://f.dataguru.cn/thread-175501-1-1.html"1