先用python模拟一些日志数据:
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Generate a synthetic web-access log (log.txt) for the Spark SQL exercise.

Each line contains six caret-separated fields:
    remote_addr ^ remote_user ^ time ^ request ^ status ^ agent
"""
import random

# Candidate values for each randomly generated log field.
remote_user = ['jianan', 'zhudy', 'crystal', 'lorry', 'lilian', 'vivian',
               'meimei', 'zhangxin']
request = ['www.baidu.com', 'www.lianjiawang.com', 'www.taobao.com',
           'www.jingdong.com', 'www.youku.com', 'www.aiqiyi.com',
           'www.meitu.com', 'www.tieba.com', 'www.qq.com', 'www.360.com',
           'www.jialidun.com', 'www.bilibili.com']
status = ['200', '404', '500']
agent = ['mobile', 'firefox', 'chrome', 'mac']

# Sample values retained from the original script (quick smoke-test prints).
remote = '222.68.172.' + str(random.randint(0, 15))
time = ('2019-6-29-' + str(random.randint(0, 23)) + '-' +
        str(random.randint(0, 59)) + '-' + str(random.randint(0, 59)))
print(random.choice(remote_user))
# BUG FIX: the original indexed `request` with randint(0, len(remote_user)-1),
# so only the first 8 of the 12 sites could ever be printed here.
print(random.choice(request))


def make_line():
    """Return one random, newline-terminated log line."""
    addr = '222.68.172.' + str(random.randint(0, 15))
    ts = '2019-6-29-{}-{}-{}'.format(random.randint(0, 23),
                                     random.randint(0, 59),
                                     random.randint(0, 59))
    return '^'.join([addr,
                     random.choice(remote_user),
                     ts,
                     random.choice(request),
                     random.choice(status),
                     random.choice(agent)]) + '\n'


# Write 10 million random log lines (the indentation of this block was
# lost in the original paste, which made it a syntax error as published).
with open("log.txt", "w") as f:
    for _ in range(10000000):
        f.write(make_line())
样式如下:
在hive中创建对应的表,并加载数据。
scala编写sparkSQL程序(过滤只留下访问状态200的):
统计日活跃用户前2名;统计每小时的访问量;统计页面访问量排名前2;统计不同访问途径的访问量
打包通过WinSCP发到linux服务器上,配置crontab定时任务,执行sh文件.
代码如下:
package cn.spark.study
import org.apache.spark.{SparkContext,SparkConf}
import org.apache.spark.SparkContext._
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
object LogLogistic {

  /**
   * Writes a result DataFrame to the given MySQL table over JDBC.
   *
   * Extracted because the original repeated this writer boilerplate four
   * times. Uses overwrite mode so the scheduled (crontab) reruns replace
   * the previous aggregates instead of failing — the implicit default
   * SaveMode.ErrorIfExists would throw on the second run once the table
   * exists.
   */
  private def writeToMysql(df: Dataset[Row], table: String): Unit = {
    df.write.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/test")
      .option("dbtable", table)
      .option("user", "root")
      .option("password", "123456")
      .option("driver", "com.mysql.jdbc.Driver")
      .mode("overwrite")
      .save()
  }

  def main(arg: Array[String]): Unit = {
    val sparkConf = new SparkConf()
    sparkConf.set("spark.sql.crossJoin.enabled", "true")
      .set("spark.sql.warehouse.dir", "hdfs://localhost:9000/user/hive/warehouse")
      .set("spark.debug.maxToStringFields", "100")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.locality.wait", "10")
      .set("spark.storage.memoryFraction", "0.4")
      .set("spark.shuffle.consolidateFiles", "true")
      .set("spark.reducer.maxSizeInFlight", "96") // reduce-task fetch buffer, default 48m
      .set("spark.shuffle.file.buffer", "64")     // map-task write buffer, default 32k
      .set("spark.shuffle.memoryFraction", "0.4") // reduce-side aggregation memory fraction, default 0.2; spills beyond it

    val spark = SparkSession
      .builder()
      .appName("Loglogistic")
      .config(sparkConf)
      .enableHiveSupport()
      .getOrCreate()

    // Top 2 daily active users, counting only successful (status 200) requests.
    val activeUsers = spark.sql(
      "select name,viewNum from (select name,count(*) viewNum from execise.log_logistic where responsenumber = 200 group by name order by viewNum desc ) a limit 2 ")
    writeToMysql(activeUsers, "T_ACTIVATEDNAME")

    // Page views per hour; the hour is the 4th '-'-separated field of log_data.
    val hourlyViews = spark.sql(
      "select timeHour ,count(*) viewNum from (select split(log_data,'-')[3] timeHour from execise.log_logistic where responsenumber = 200) a group by timeHour order by viewNum desc ")
    writeToMysql(hourlyViews, "T_HOURNUM")

    // Top 2 most-visited pages (status 200 only).
    val topPages = spark.sql(
      "select webpage,viewNum from (select webpage ,count(*) viewNum from execise.log_logistic where responsenumber = 200 group by webpage order by viewNum desc) a limit 2 ")
    writeToMysql(topPages, "T_WEBPAGE")

    // Views per access channel. BUG FIX: the original query omitted the
    // `responsenumber = 200` filter that the stated requirement ("keep only
    // status 200") and the three queries above all apply.
    val sourceViews = spark.sql(
      "select source, viewNum from (select source,count(*) viewNum from execise.log_logistic where responsenumber = 200 group by source order by viewNum desc) a")
    writeToMysql(sourceViews, "T_SOURCENUM")

    // Release cluster resources before the cron job exits (missing in original).
    spark.stop()
  }
}
运行程序,观察数据写入mysql表中:
计算数据结果: