Spark 实际项目分析:移动端 App 日志

本文由企业中的一个真实案例简化而来,主要是统计每个用户(设备)的上行流量和下行流量,从而真实分析出每个用户的流量使用情况

为完成此功能需要4个类

AccessLogInfo.java: 用户流量实体信息

/**
 * Serializable value holder for one access-log entry: the access timestamp
 * together with the upstream and downstream traffic amounts.
 */
public class AccessLogInfo implements Serializable {

   private static final long serialVersionUID = 5749943279909593929L;

   /** Timestamp of the access. */
   private long timestamp;

   /** Upstream traffic. */
   private long upTraffic;

   /** Downstream traffic. */
   private long downTraffic;

   /** Creates an entry whose three fields are all zero. */
   public AccessLogInfo() {
      this(0L, 0L, 0L);
   }

   /**
    * Creates a fully populated entry.
    *
    * @param timestamp   timestamp of the access
    * @param upTraffic   upstream traffic
    * @param downTraffic downstream traffic
    */
   public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
      this.timestamp = timestamp;
      this.upTraffic = upTraffic;
      this.downTraffic = downTraffic;
   }

   // ---- accessors ----

   /** @return timestamp of the access */
   public long getTimestamp() {
      return this.timestamp;
   }

   /** @return upstream traffic */
   public long getUpTraffic() {
      return this.upTraffic;
   }

   /** @return downstream traffic */
   public long getDownTraffic() {
      return this.downTraffic;
   }

   // ---- mutators ----

   /** @param timestamp new timestamp of the access */
   public void setTimestamp(long timestamp) {
      this.timestamp = timestamp;
   }

   /** @param upTraffic new upstream traffic value */
   public void setUpTraffic(long upTraffic) {
      this.upTraffic = upTraffic;
   }

   /** @param downTraffic new downstream traffic value */
   public void setDownTraffic(long downTraffic) {
      this.downTraffic = downTraffic;
   }

}
2 
DataGenerator.java: 数据模拟生成器

/**
 * Generates a mock access log for the traffic-analysis example.
 *
 * <p>Each output line has four tab-separated fields:
 * {@code timestamp}, {@code deviceID}, {@code upTraffic}, {@code downTraffic}.
 */
public class DataGenerator {

   /** Output file used when no path is given on the command line. */
   private static final String DEFAULT_OUTPUT_PATH =
         "C:\\Users\\Administrator\\Desktop\\access.log";

   /** Number of distinct device IDs to draw from. */
   private static final int DEVICE_COUNT = 100;

   /** Number of log lines to generate. */
   private static final int RECORD_COUNT = 1000;

   /** Timestamps are placed at most this many minutes (exclusive) before "now". */
   private static final int MAX_MINUTES_BACK = 600;

   /** Exclusive upper bound for the random upstream/downstream traffic values. */
   private static final int MAX_TRAFFIC = 100000;

   /**
    * Generates the log and writes it to a file.
    *
    * @param args optional; {@code args[0]} is the output file path. When absent,
    *             {@link #DEFAULT_OUTPUT_PATH} is used.
    * @throws Exception if the output file cannot be opened or written
    */
   public static void main(String[] args) throws Exception {
      String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
      Random random = new Random();

      // Build the pool of device IDs that the log lines pick from.
      List<String> deviceIDs = new ArrayList<String>(DEVICE_COUNT);
      for (int i = 0; i < DEVICE_COUNT; i++) {
         deviceIDs.add(getRandomUUID());
      }

      // Single-threaded use, so the unsynchronized StringBuilder is sufficient.
      StringBuilder buffer = new StringBuilder();

      for (int i = 0; i < RECORD_COUNT; i++) {
         // Random timestamp within the last MAX_MINUTES_BACK minutes.
         Calendar cal = Calendar.getInstance();
         cal.setTime(new Date());
         cal.add(Calendar.MINUTE, -random.nextInt(MAX_MINUTES_BACK));
         long timestamp = cal.getTime().getTime();

         // Random device from the pool.
         String deviceID = deviceIDs.get(random.nextInt(deviceIDs.size()));

         // Random upstream and downstream traffic.
         long upTraffic = random.nextInt(MAX_TRAFFIC);
         long downTraffic = random.nextInt(MAX_TRAFFIC);

         buffer.append(timestamp).append("\t")
               .append(deviceID).append("\t")
               .append(upTraffic).append("\t")
               .append(downTraffic).append("\n");
      }

      // try-with-resources closes the writer on every path. If the file cannot be
      // opened, the original exception propagates instead of being replaced by a
      // NullPointerException from closing a writer that was never created.
      // The charset is fully qualified so that no additional import is required.
      try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(
            new FileOutputStream(outputPath), java.nio.charset.StandardCharsets.UTF_8))) {
         pw.write(buffer.toString());
         // PrintWriter never throws on write; checkError() flushes and reports failures.
         if (pw.checkError()) {
            throw new java.io.IOException("Failed to write access log to " + outputPath);
         }
      }
   }

   /**
    * Returns a random UUID string with the dashes removed.
    *
    * @return the dash-free UUID string
    */
   private static String getRandomUUID() {
      return UUID.randomUUID().toString().replace("-", "");
   }

}
3 
AccessLogSortKey.java: 自定义二次排序

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Secondary-sort key for aggregated access-log records.
 *
 * <p>Keys are ordered by upstream traffic, then downstream traffic, then
 * timestamp. All comparison operators are derived from {@link #compare} so
 * that they can never disagree with one another or with {@link #equals}.
 *
 * @author Administrator
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

   private static final long serialVersionUID = 3702442700882342403L;

   private long upTraffic;
   private long downTraffic;
   private long timestamp;

   /** Creates a key whose three fields are all zero. */
   public AccessLogSortKey() {}

   /**
    * Creates a fully populated key.
    *
    * @param upTraffic   upstream traffic (first sort criterion)
    * @param downTraffic downstream traffic (second sort criterion)
    * @param timestamp   timestamp (third sort criterion)
    */
   public AccessLogSortKey(long upTraffic, long downTraffic, long timestamp) {
      this.upTraffic = upTraffic;
      this.downTraffic = downTraffic;
      this.timestamp = timestamp;
   }

   /** @return {@code true} if this key sorts strictly after {@code other} */
   @Override
   public boolean $greater(AccessLogSortKey other) {
      return compare(other) > 0;
   }

   /** @return {@code true} if this key sorts after, or equal to, {@code other} */
   @Override
   public boolean $greater$eq(AccessLogSortKey other) {
      return compare(other) >= 0;
   }

   /** @return {@code true} if this key sorts strictly before {@code other} */
   @Override
   public boolean $less(AccessLogSortKey other) {
      return compare(other) < 0;
   }

   /** @return {@code true} if this key sorts before, or equal to, {@code other} */
   @Override
   public boolean $less$eq(AccessLogSortKey other) {
      return compare(other) <= 0;
   }

   /**
    * Compares by upstream traffic, then downstream traffic, then timestamp.
    *
    * <p>Uses {@link Long#compare(long, long)} rather than casting the
    * difference of two {@code long} values to {@code int}: such a cast
    * truncates the high bits, so a large difference could yield the wrong
    * sign or even 0 for unequal keys.
    *
    * @param other the key to compare against
    * @return a negative value, zero, or a positive value as this key is less
    *         than, equal to, or greater than {@code other}
    */
   @Override
   public int compare(AccessLogSortKey other) {
      int result = Long.compare(upTraffic, other.upTraffic);
      if (result != 0) {
         return result;
      }
      result = Long.compare(downTraffic, other.downTraffic);
      if (result != 0) {
         return result;
      }
      return Long.compare(timestamp, other.timestamp);
   }

   /**
    * Same ordering as {@link #compare(AccessLogSortKey)}.
    *
    * @param other the key to compare against
    * @return a negative value, zero, or a positive value as this key is less
    *         than, equal to, or greater than {@code other}
    */
   @Override
   public int compareTo(AccessLogSortKey other) {
      return compare(other);
   }

   public long getUpTraffic() {
      return upTraffic;
   }

   public void setUpTraffic(long upTraffic) {
      this.upTraffic = upTraffic;
   }

   public long getDownTraffic() {
      return downTraffic;
   }

   public void setDownTraffic(long downTraffic) {
      this.downTraffic = downTraffic;
   }

   public long getTimestamp() {
      return timestamp;
   }

   public void setTimestamp(long timestamp) {
      this.timestamp = timestamp;
   }

   @Override
   public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
      result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
      result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
      return result;
   }

   @Override
   public boolean equals(Object obj) {
      if (this == obj) {
         return true;
      }
      if (obj == null || getClass() != obj.getClass()) {
         return false;
      }
      AccessLogSortKey other = (AccessLogSortKey) obj;
      return downTraffic == other.downTraffic
            && timestamp == other.timestamp
            && upTraffic == other.upTraffic;
   }

   @Override
   public String toString() {
      return "AccessLogSortKey [upTraffic=" + upTraffic + ", downTraffic="
            + downTraffic + ", timestamp=" + timestamp + "]";
   }

}
4 
AppLogSpark.java: spark日志分析

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * 移动端app访问流量日志分析案例
 * @author Administrator
 *
 */
public class AppLogSpark {

   public static void main(String[] args) throws Exception {
      // 创建Spark配置和上下文对象
      SparkConf conf = new SparkConf()
            .setAppName("AppLogSpark")  
            .setMaster("local"); 
      JavaSparkContext sc = new JavaSparkContext(conf);
      
      // 读取日志文件,并创建一个RDD
      // 使用SparkContext的textFile()方法,即可读取本地磁盘文件,或者是HDFS上的文件
      // 创建出来一个初始的RDD,其中包含了日志文件中的所有数据
      JavaRDD<String> accessLogRDD = sc.textFile(
            "C://Users//Administrator//Desktop//access.log");   
      
      // 将RDD映射为key-value格式,为后面的reduceByKey聚合做准备
      JavaPairRDD<String, AccessLogInfo> accessLogPairRDD = 
            mapAccessLogRDD2Pair(accessLogRDD);
      
      // 根据deviceID进行聚合操作
      // 获取每个deviceID的总上行流量、总下行流量、最早访问时间戳
      JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD = 
            aggregateByDeviceID(accessLogPairRDD);
      
      // 将按deviceID聚合RDD的key映射为二次排序key,value映射为deviceID
      JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD = 
            mapRDDKey2SortKey(aggrAccessLogPairRDD);
      
      // 执行二次排序操作,按照上行流量、下行流量以及时间戳进行倒序排序
      JavaPairRDD<AccessLogSortKey ,String> sortedAccessLogRDD =
            accessLogSortRDD.sortByKey(false);
      // 获取top10数据
      List<Tuple2<AccessLogSortKey, String>> top10DataList = 
            sortedAccessLogRDD.take(10);
      for(Tuple2<AccessLogSortKey, String> data : top10DataList) {
         System.out.println(data._2 + ": " + data._1);  
      }
      
      // 关闭Spark上下文
      sc.close();
   }
   
   /**
    * 将日志RDD映射为key-value的格式
    * @param accessLogRDD 日志RDD
    * @return key-value格式RDD
    */
   private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
         JavaRDD<String> accessLogRDD) {
      return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {

         private static final long serialVersionUID = 1L;

         @Override
         public Tuple2<String, AccessLogInfo> call(String accessLog)
               throws Exception {
            // 根据\t对日志进行切分
            String[] accessLogSplited = accessLog.split("\t");  
            
            // 获取四个字段
            long timestamp = Long.valueOf(accessLogSplited[0]);
            String deviceID = accessLogSplited[1];
            long upTraffic = Long.valueOf(accessLogSplited[2]);
            long downTraffic = Long.valueOf(accessLogSplited[3]);  
            
            // 将时间戳、上行流量、下行流量,封装为自定义的可序列化对象
            AccessLogInfo accessLogInfo = new AccessLogInfo(timestamp,
                  upTraffic, downTraffic);
            
            return new Tuple2<String, AccessLogInfo>(deviceID, accessLogInfo);
         }
         
      });
   }
   
   /**
    * 根据deviceID进行聚合操作
    * 计算出每个deviceID的总上行流量、总下行流量以及最早访问时间
    * @param accessLogPairRDD 日志key-value格式RDD
    * @return 按deviceID聚合RDD
    */
   private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
         JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
      return accessLogPairRDD.reduceByKey(new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {
         
         private static final long serialVersionUID = 1L;
         
         @Override
         public AccessLogInfo call(AccessLogInfo accessLogInfo1, AccessLogInfo accessLogInfo2)
               throws Exception {
            long timestamp = accessLogInfo1.getTimestamp() < accessLogInfo2.getTimestamp() ? 
                  accessLogInfo1.getTimestamp() : accessLogInfo2.getTimestamp();
            long upTraffic = accessLogInfo1.getUpTraffic() + accessLogInfo2.getUpTraffic();
            long downTraffic = accessLogInfo1.getDownTraffic() + accessLogInfo2.getDownTraffic();
            
            AccessLogInfo accessLogInfo = new AccessLogInfo();
            accessLogInfo.setTimestamp(timestamp);
            accessLogInfo.setUpTraffic(upTraffic); 
            accessLogInfo.setDownTraffic(downTraffic);
            
            return accessLogInfo;
         }
         
      });
   }
   
   /**
    * 将RDD的key映射为二次排序key
    * @param aggrAccessLogPairRDD 按deviceID聚合RDD
    * @return 二次排序key RDD
    */
   private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
         JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD) {
      return aggrAccessLogPairRDD.mapToPair(
            
            new PairFunction<Tuple2<String,AccessLogInfo>, AccessLogSortKey, String>() {

               private static final long serialVersionUID = 1L;

               @Override
               public Tuple2<AccessLogSortKey, String> call(
                     Tuple2<String, AccessLogInfo> tuple) throws Exception {
                  // 获取tuple数据
                  String deviceID = tuple._1;
                  AccessLogInfo accessLogInfo = tuple._2;
                  
                  // 将日志信息封装为二次排序key 
                  AccessLogSortKey accessLogSortKey = new AccessLogSortKey(
                        accessLogInfo.getUpTraffic(), 
                        accessLogInfo.getDownTraffic(), 
                        accessLogInfo.getTimestamp());
                  
                  // 返回新的Tuple
                  return new Tuple2<AccessLogSortKey, String>(accessLogSortKey, deviceID);
               }
               
            });
   }
   
}
阅读更多

没有更多推荐了,返回首页