实践DSP广告数据平台的日志文件转换成parquet文件第二种方案

上篇:实践广告精准投放的bz2数据转parquet文件场景案例

这一节主要演示将bz2日志文件转换成parquet文件的第二种方案:自定义一个实现了 Product 特质的实体类 AdLog 来封装每条日志,再由 RDD[AdLog] 创建 DataFrame 并写出为parquet文件。

二话不说,直接上代码

dolphin-doit01\src\main\scala\cn\sheep\dolphin\etl\Bz2ParquetV2.scala


package cn.sheep.dolphin.etl

import cn.sheep.dolphin.bean.AdLog
import cn.sheep.dolphin.utils.FileHelper
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext


/**
 * Converts bz2-compressed ad log files into parquet files.
 *
 * Every input line is split on commas; lines with fewer than 85 fields are
 * discarded. The remaining records are wrapped in `AdLog`, turned into a
 * DataFrame and written to the output path as parquet.
 *
 * author: old sheep
 * Created 2021/03/21
 */
object Bz2ParquetV2 {

  /**
   * Program entry point.
   *
   * @param args exactly two elements: the bz2 log input path followed by
   *             the parquet output path; otherwise usage is printed and the
   *             JVM exits with status -1
   */
  def main(args: Array[String]): Unit = {
    // Validate the arguments
    if (args.length != 2) {
      println(
        """
          |Usage: cn.sheep.dolphin.etl.Bz2ParquetV2
          |Param:
          |  bz2InputPath  bz2日志文件的输入路径
          | parquetOutPath  parquet文件的输出路径
        """.stripMargin)
      sys.exit(-1) // -1: abnormal exit
    }

    // Destructure the two arguments with a pattern match
    val Array(bz2InputPath, parquetOutPath) = args

    // Fall back to local mode only when no master was supplied from outside
    // (e.g. via spark-submit --master), instead of always overriding it.
    val conf = new SparkConf()
      .setAppName("将bz2日志文件转换成parquet文件")
      .setIfMissing("spark.master", "local[*]")

    val sc = new SparkContext(conf)

    // Make sure the context is stopped even if a step below throws
    try {
      // Read the offline bz2 log files
      val data = sc.textFile(bz2InputPath)

      // Drop malformed lines: split with limit -1 keeps trailing empty fields,
      // then require at least 85 fields
      val filteredRDD: RDD[Array[String]] = data.map(_.split(",", -1)).filter(_.length >= 85)

      // parquet <- DataFrame <- SQLContext <- RDD
      val sqlc = new SQLContext(sc)

      // RDD[Array[String]] -> RDD[AdLog]
      val adLogRDD: RDD[AdLog] = filteredRDD.map(AdLog(_))
      val dataFrame = sqlc.createDataFrame(adLogRDD)

      // Remove the target directory so the write does not fail on an existing path
      FileHelper.deleteDir(parquetOutPath, sc)

      // Write as parquet. No partitionBy is applied here, so the output is
      // NOT partitioned by province.
      dataFrame.write.parquet(parquetOutPath)
    } finally {
      sc.stop()
    }
  }
}

 dolphin-doit01\src\main\scala\cn\sheep\dolphin\bean\AdLog.scala

package cn.sheep.dolphin.bean

/**
 * Entity class that wraps one ad log record (85 fields).
 *
 * It is a hand-written `Product` rather than a case class; `productElement`
 * exposes the fields by position in constructor order. The class is
 * `Serializable` so instances can be shipped between JVMs when stored in an RDD.
 *
 * author: old sheep
 * Created 2021/03/21  16:45
 */
class AdLog(
  val sessionid: String, // session identifier
  val advertisersid: Int, // advertiser id
  val adorderid: Int, // ad id
  val adcreativeid: Int, // ad creative id (>= 200000: dsp)
  val adplatformproviderid: Int, // ad platform provider id (>= 100000: rtb)
  val sdkversion: String, // SDK version
  val adplatformkey: String, // platform provider key
  val putinmodeltype: Int, // delivery mode for the advertiser; 1: by impressions, 2: by clicks (original comment truncated)
  val requestmode: Int, // request type (1: request, 2: impression, 3: click)
  val adprice: Double, // ad price
  val adppprice: Double, // platform provider price
  val requestdate: String, // request time, format: yyyy-m-dd hh:mm:ss
  val ip: String, // real IP address of the device user
  val appid: String, // application id
  val appname: String, // application name
  val uuid: String, // unique device identifier
  val device: String, // device model, e.g. htc, iphone
  val client: Int, // operating system (1: android, 2: ios, 3: wp)
  val osversion: String, // device OS version
  val density: String, // device screen density
  val pw: Int, // device screen width
  val ph: Int, // device screen height
  val long: String, // device longitude
  val lat: String, // device latitude
  val provincename: String, // province the device is in
  val cityname: String, // city the device is in
  val ispid: Int, // carrier id
  val ispname: String, // carrier name
  val networkmannerid: Int, // network connection type id
  val networkmannername: String, // network connection type name
  val iseffective: Int, // validity flag (valid = can be billed normally); 0: invalid (original comment truncated)
  val isbilling: Int, // billing flag (0: not billed, 1: billed)
  val adspacetype: Int, // ad slot type (1: banner, 2: interstitial, 3: full screen)
  val adspacetypename: String, // ad slot type name (banner, interstitial, full screen)
  val devicetype: Int, // device type (1: phone, 2: tablet)
  val processnode: Int, // process node (1: request-volume KPI, 2: valid request, 3: ... original comment truncated)
  val apptype: Int, // application type id
  val district: String, // county the device is in
  val paymode: Int, // payment mode toward the platform provider; 1: by impressions (CPM), 2: by clicks (original comment truncated)
  val isbid: Int, // whether this is rtb
  val bidprice: Double, // rtb bid price
  val winprice: Double, // rtb winning price
  val iswin: Int, // whether the bid was won
  val cur: String, // values: usd|rmb etc.
  val rate: Double, // exchange rate
  val cnywinprice: Double, // rtb winning price converted to RMB
  val imei: String, // imei
  val mac: String, // mac
  val idfa: String, // idfa
  val openudid: String, // openudid
  val androidid: String, // androidid
  val rtbprovince: String, // rtb province
  val rtbcity: String, // rtb city
  val rtbdistrict: String, // rtb district
  val rtbstreet: String, // rtb street
  val storeurl: String, // app market download URL
  val realip: String, // real ip
  val isqualityapp: Int, // preferred-app flag
  val bidfloor: Double, // floor price
  val aw: Int, // ad slot width
  val ah: Int, // ad slot height
  val imeimd5: String, // imei_md5
  val macmd5: String, // mac_md5
  val idfamd5: String, // idfa_md5
  val openudidmd5: String, // openudid_md5
  val androididmd5: String, // androidid_md5
  val imeisha1: String, // imei_sha1
  val macsha1: String, // mac_sha1
  val idfasha1: String, // idfa_sha1
  val openudidsha1: String, // openudid_sha1
  val androididsha1: String, // androidid_sha1
  val uuidunknow: String, // uuid_unknow, tanx ciphertext
  val userid: String, // platform user id
  val iptype: Int, // IP type
  val initbidprice: Double, // initial bid price
  val adpayment: Double, // converted ad spend
  val agentrate: Double, // agent profit rate
  val lrate: Double, // agency profit rate
  val adxrate: Double, // media profit rate
  val title: String, // title
  val keywords: String, // keywords
  val tagid: String, // ad slot identifier (the video ID for video traffic)
  val callbackdate: String, // callback time, format: YYYY/mm/dd hh:mm:ss
  val channelid: String, // channel ID
  val mediatype: Int // media type: 1 long-tail, 2 video, 3 independent; default: 1
) extends Product with Serializable {

  /**
   * Accesses a field of this record by position.
   *
   * @param n zero-based index, in constructor-parameter order
   * @return the field at position `n`
   * @throws IndexOutOfBoundsException if `n` is outside `0 until productArity`
   */
  override def productElement(n: Int): Any = n match {
    case 0 => sessionid
    case 1 => advertisersid
    case 2 => adorderid
    case 3 => adcreativeid
    case 4 => adplatformproviderid
    case 5 => sdkversion
    case 6 => adplatformkey
    case 7 => putinmodeltype
    case 8 => requestmode
    case 9 => adprice
    case 10 => adppprice
    case 11 => requestdate
    case 12 => ip
    case 13 => appid
    case 14 => appname
    case 15 => uuid
    case 16 => device
    case 17 => client
    case 18 => osversion
    case 19 => density
    case 20 => pw
    case 21 => ph
    case 22 => long
    case 23 => lat
    case 24 => provincename
    case 25 => cityname
    case 26 => ispid
    case 27 => ispname
    case 28 => networkmannerid
    case 29 => networkmannername
    case 30 => iseffective
    case 31 => isbilling
    case 32 => adspacetype
    case 33 => adspacetypename
    case 34 => devicetype
    case 35 => processnode
    case 36 => apptype
    case 37 => district
    case 38 => paymode
    case 39 => isbid
    case 40 => bidprice
    case 41 => winprice
    case 42 => iswin
    case 43 => cur
    case 44 => rate
    case 45 => cnywinprice
    case 46 => imei
    case 47 => mac
    case 48 => idfa
    case 49 => openudid
    case 50 => androidid
    case 51 => rtbprovince
    case 52 => rtbcity
    case 53 => rtbdistrict
    case 54 => rtbstreet
    case 55 => storeurl
    case 56 => realip
    case 57 => isqualityapp
    case 58 => bidfloor
    case 59 => aw
    case 60 => ah
    case 61 => imeimd5
    case 62 => macmd5
    case 63 => idfamd5
    case 64 => openudidmd5
    case 65 => androididmd5
    case 66 => imeisha1
    case 67 => macsha1
    case 68 => idfasha1
    case 69 => openudidsha1
    case 70 => androididsha1
    case 71 => uuidunknow
    case 72 => userid
    case 73 => iptype
    case 74 => initbidprice
    case 75 => adpayment
    case 76 => agentrate
    case 77 => lrate
    case 78 => adxrate
    case 79 => title
    case 80 => keywords
    case 81 => tagid
    case 82 => callbackdate
    case 83 => channelid
    case 84 => mediatype
    // Honour the Product contract instead of leaking a MatchError
    case _ => throw new IndexOutOfBoundsException(n.toString)
  }

  /**
   * Number of fields in this record.
   *
   * @return always 85
   */
  override def productArity: Int = 85

  /**
   * Tells whether `that` is of the same type as this class.
   *
   * @param that the object to compare against
   * @return true if `that` is an `AdLog`
   */
  override def canEqual(that: Any): Boolean = that.isInstanceOf[AdLog]
}

/** Companion of the AdLog class: factory for building a record from split log fields. */
object AdLog {

  import cn.sheep.dolphin.bean.RichString._

  /**
   * Builds an AdLog from the fields of one log line.
   *
   * Fields are consumed by position (indices 0 to 84) in constructor order;
   * numeric columns go through the `toIntPlus` / `toDoublePlus` extensions
   * imported from RichString, all others are taken as-is.
   *
   * @param arr the split fields of one log line; indices 0 to 84 are read
   * @return the populated AdLog
   */
  def apply(arr: Array[String]): AdLog = {
    // Positional accessors, one per target column type
    def raw(i: Int) = arr(i)
    def asInt(i: Int) = arr(i).toIntPlus
    def asDouble(i: Int) = arr(i).toDoublePlus

    new AdLog(
      raw(0), asInt(1), asInt(2), asInt(3), asInt(4),
      raw(5), raw(6), asInt(7), asInt(8), asDouble(9),
      asDouble(10), raw(11), raw(12), raw(13), raw(14),
      raw(15), raw(16), asInt(17), raw(18), raw(19),
      asInt(20), asInt(21), raw(22), raw(23), raw(24),
      raw(25), asInt(26), raw(27), asInt(28), raw(29),
      asInt(30), asInt(31), asInt(32), raw(33), asInt(34),
      asInt(35), asInt(36), raw(37), asInt(38), asInt(39),
      asDouble(40), asDouble(41), asInt(42), raw(43), asDouble(44),
      asDouble(45), raw(46), raw(47), raw(48), raw(49),
      raw(50), raw(51), raw(52), raw(53), raw(54),
      raw(55), raw(56), asInt(57), asDouble(58), asInt(59),
      asInt(60), raw(61), raw(62), raw(63), raw(64),
      raw(65), raw(66), raw(67), raw(68), raw(69),
      raw(70), raw(71), raw(72), asInt(73), asDouble(74),
      asDouble(75), asDouble(76), asDouble(77), asDouble(78), raw(79),
      raw(80), raw(81), raw(82), raw(83), asInt(84)
    )
  }
}

 dolphin-doit01\src\main\scala\cn\sheep\dolphin\utils\FileHelper.scala


package cn.sheep.dolphin.utils

import java.io.File

import org.apache.commons.io.FileUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext
/**
 * File-handling helpers.
 * author: old sheep
 * Created 2021/03/21  17:16
 */
object FileHelper {
  /**
   * Recursively deletes a directory on the local file system.
   * Does nothing when the path does not exist. This method only works for
   * local directories.
   *
   * @param path path of the local directory to delete
   */
  def deleteTargetDir(path: String): Unit = {
    val file = new File(path)
    if (file.exists()) {
      // Recursively delete the directory
      FileUtils.deleteDirectory(file)
    }
  }


  /**
   * Recursively deletes a directory on whichever file system owns the path.
   *
   * The file system is resolved from the path itself, so a fully qualified
   * path (with its own scheme) is handled even when it differs from the
   * configured default file system; a path without a scheme still resolves
   * against the default file system of the Hadoop configuration.
   *
   * @param dirPath directory to delete
   * @param sc      Spark context whose Hadoop configuration is used
   * @return true if the path existed and was deleted, false otherwise
   */
  def deleteDir(dirPath: String, sc: SparkContext): Boolean = {
    val path = new Path(dirPath)
    val fs = path.getFileSystem(sc.hadoopConfiguration)

    fs.exists(path) && fs.delete(path, true)
  }

}

 配置运行参数:依次传入 bz2InputPath(bz2日志文件的输入路径)和 parquetOutPath(parquet文件的输出路径)

运行Bz2ParquetV2程序,控制台打印输出

{
  "type" : "struct",
  "fields" : [ {
    "name" : "sessionid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "advertisersid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adorderid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adcreativeid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adplatformproviderid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "sdkversion",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adplatformkey",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "putinmodeltype",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "requestmode",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adprice",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adppprice",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "requestdate",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "ip",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "appid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "appname",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "uuid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "device",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "client",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "osversion",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "density",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "pw",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "ph",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "long",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "lat",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "provincename",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "cityname",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "ispid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "ispname",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "networkmannerid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "networkmannername",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "iseffective",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "isbilling",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adspacetype",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adspacetypename",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "devicetype",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "processnode",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "apptype",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "district",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "paymode",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "isbid",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "bidprice",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "winprice",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "iswin",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "cur",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "rate",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "cnywinprice",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "imei",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "mac",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "idfa",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "openudid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "androidid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "rtbprovince",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "rtbcity",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "rtbdistrict",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "rtbstreet",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "storeurl",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "realip",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "isqualityapp",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "bidfloor",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "aw",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "ah",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "imeimd5",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "macmd5",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "idfamd5",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "openudidmd5",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "androididmd5",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "imeisha1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "macsha1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "idfasha1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "openudidsha1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "androididsha1",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "uuidunknow",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "userid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "iptype",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "initbidprice",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adpayment",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "agentrate",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "lrate",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "adxrate",
    "type" : "double",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "title",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "keywords",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "tagid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "callbackdate",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "channelid",
    "type" : "string",
    "nullable" : true,
    "metadata" : { }
  }, {
    "name" : "mediatype",
    "type" : "integer",
    "nullable" : true,
    "metadata" : { }
  } ]
}

在输出文件路径下查看生成的parquet文件

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值