Combining HDFS, Hive, and Spark SQL for an Offline Log Statistics Job

First, use Python to generate some simulated log data:

#!/usr/bin/python
# -*- coding:utf-8 -*-
import random

# Each log line has six '^'-separated fields:
# remote_addr ^ remote_user ^ time ^ request ^ status ^ agent
remote_user = ['jianan', 'zhudy', 'crystal', 'lorry', 'lilian', 'vivian', 'meimei', 'zhangxin']
request = ['www.baidu.com', 'www.lianjiawang.com', 'www.taobao.com', 'www.jingdong.com',
           'www.youku.com', 'www.aiqiyi.com', 'www.meitu.com', 'www.tieba.com',
           'www.qq.com', 'www.360.com', 'www.jialidun.com', 'www.bilibili.com']
status = ['200', '404', '500']
agent = ['mobile', 'firefox', 'chrome', 'mac']

with open("log.txt", "w") as f:
    for i in range(10000000):
        # client IP within 222.68.172.0-15
        remote_addr = '222.68.172.' + str(random.randint(0, 15))
        # timestamp as year-month-day-hour-minute-second
        time = '2019-6-29-' + str(random.randint(0, 23)) + '-' + \
               str(random.randint(0, 59)) + '-' + str(random.randint(0, 59))
        f.write(remote_addr + '^' + random.choice(remote_user) + '^' + time + '^' +
                random.choice(request) + '^' + random.choice(status) + '^' +
                random.choice(agent) + '\n')

The generated lines look like this:
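An illustrative line produced by the generator above (the field values vary from run to run):

222.68.172.7^crystal^2019-6-29-14-23-45^www.taobao.com^200^chrome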

Create the corresponding table in Hive and load the data (a sketch follows below).
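A minimal HiveQL sketch of the table and the load step. The column names (name, log_data, webpage, responsenumber, source) come from the Spark SQL queries further down; the remote_addr column name and the local file path are assumptions.

-- '^'-delimited text table matching the generated log format (sketch)
CREATE DATABASE IF NOT EXISTS execise;

CREATE TABLE IF NOT EXISTS execise.log_logistic (
  remote_addr    STRING,   -- client IP (column name assumed)
  name           STRING,   -- remote_user
  log_data       STRING,   -- time, e.g. 2019-6-29-14-23-45
  webpage        STRING,   -- requested page
  responsenumber INT,      -- HTTP status
  source         STRING    -- agent / access channel
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '^'
STORED AS TEXTFILE;

-- load the generated file (local path is a placeholder)
LOAD DATA LOCAL INPATH '/path/to/log.txt' INTO TABLE execise.log_logistic;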

Write a Spark SQL program in Scala (filtering to keep only requests with status 200):

Compute the top 2 daily active users, the number of visits per hour, the top 2 pages by view count, and the number of visits per access channel.

Package the jar, upload it to the Linux server with WinSCP, and set up a crontab entry that runs a shell script (a sketch of the script and the crontab line follows below).
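A minimal sketch of the submit script and the crontab entry. The main class comes from the code below; the Spark home, jar path, master, and memory settings are placeholders to adjust to your cluster.

run_log_logistic.sh (sketch):

#!/bin/bash
# submit the packaged job; adjust master, memory and paths to your environment
/opt/spark/bin/spark-submit \
  --class cn.spark.study.LogLogistic \
  --master local[2] \
  --driver-memory 1g \
  --executor-memory 1g \
  /home/hadoop/jobs/log-logistic.jar \
  >> /home/hadoop/jobs/log_logistic.log 2>&1

crontab entry (runs every day at 01:00):

0 1 * * * /bin/sh /home/hadoop/jobs/run_log_logistic.sh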

The code is as follows:

package cn.spark.study

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}

object LogLogistic {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .set("spark.sql.crossJoin.enabled", "true")
      .set("spark.sql.warehouse.dir", "hdfs://localhost:9000/user/hive/warehouse")
      .set("spark.debug.maxToStringFields", "100")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.locality.wait", "10")
      .set("spark.storage.memoryFraction", "0.4")
      .set("spark.shuffle.consolidateFiles", "true")
      .set("spark.reducer.maxSizeInFlight", "96")   // reduce-side fetch buffer, default 48m
      .set("spark.shuffle.file.buffer", "64")       // map-side write buffer, default 32k
      .set("spark.shuffle.memoryFraction", "0.4")   // fraction for reduce-side aggregation, default 0.2; spills when exceeded

    val spark = SparkSession
      .builder()
      .appName("Loglogistic")
      .config(sparkConf)
      .enableHiveSupport()
      .getOrCreate()

    // Write a result DataFrame to the given MySQL table; overwrite so the scheduled job can rerun.
    def writeToMysql(df: DataFrame, table: String): Unit = {
      df.write.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/test")
        .option("dbtable", table)
        .option("user", "root")
        .option("password", "123456")
        .option("driver", "com.mysql.jdbc.Driver")
        .mode("overwrite")
        .save()
    }

    // Top 2 daily active users by name
    val sqlDF = spark.sql(
      "select name, viewNum from (select name, count(*) viewNum from execise.log_logistic where responsenumber = 200 group by name order by viewNum desc) a limit 2")
    writeToMysql(sqlDF, "T_ACTIVATEDNAME")

    // Number of visits per hour
    val sqlDF1 = spark.sql(
      "select timeHour, count(*) viewNum from (select split(log_data,'-')[3] timeHour from execise.log_logistic where responsenumber = 200) a group by timeHour order by viewNum desc")
    writeToMysql(sqlDF1, "T_HOURNUM")

    // Top 2 pages by view count
    val sqlDF2 = spark.sql(
      "select webpage, viewNum from (select webpage, count(*) viewNum from execise.log_logistic where responsenumber = 200 group by webpage order by viewNum desc) a limit 2")
    writeToMysql(sqlDF2, "T_WEBPAGE")

    // Number of visits per access channel
    val sqlDF3 = spark.sql(
      "select source, viewNum from (select source, count(*) viewNum from execise.log_logistic group by source order by viewNum desc) a")
    writeToMysql(sqlDF3, "T_SOURCENUM")

    spark.stop()
  }
}

Run the program and verify the computed results in the MySQL tables (a quick check from the mysql client is sketched below):
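For example, from the mysql client (the table names are the ones used in the code above):

use test;
select * from T_ACTIVATEDNAME;
select * from T_HOURNUM order by viewNum desc;
select * from T_WEBPAGE;
select * from T_SOURCENUM;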
