上篇:实践广告精准投放的bz2数据转parquet文件场景案例
这一节主要演示将bz2日志文件转换成parquet文件的第二种方案:通过自定义实体类AdLog来构建DataFrame
二话不说,直接上代码
dolphin-doit01\src\main\scala\cn\sheep\dolphin\etl\Bz2ParquetV2.scala
package cn.sheep.dolphin.etl
import cn.sheep.dolphin.bean.AdLog
import cn.sheep.dolphin.utils.FileHelper
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
/**
 * Converts bz2 ad log files into Parquet files.
 *
 * Each input line is split on commas, mapped onto an [[AdLog]] bean, turned into a
 * DataFrame via `SQLContext.createDataFrame`, and written out as Parquet.
 *
 * author: old sheep
 * Created 2021/03/21
 */
object Bz2ParquetV2 {

  /**
   * Job entry point.
   *
   * @param args exactly two arguments: the bz2 input path followed by the parquet output
   *             path; any other argument count prints the usage text and exits with -1
   */
  def main(args: Array[String]): Unit = {
    // Validate the argument count before touching Spark.
    if (args.length != 2) {
      println(
        """
          |Usage: cn.sheep.dolphin.etl.Bz2ParquetV2
          |Param:
          | bz2InputPath bz2日志文件的输入路径
          | parquetOutPath parquet文件的输出路径
          |""".stripMargin)
      sys.exit(-1) // -1: abnormal exit
    }

    // Destructure the two arguments via pattern matching.
    val Array(bz2InputPath, parquetOutPath) = args

    val conf = new SparkConf()
      .setAppName("将bz2日志文件转换成parquet文件")
      // Fall back to local mode only when no master was supplied (e.g. by spark-submit),
      // so the same jar runs both from the IDE and on a cluster.
      .setIfMissing("spark.master", "local[*]")

    val sc = new SparkContext(conf)
    try {
      // Read the offline bz2 log files line by line.
      val data = sc.textFile(bz2InputPath)

      // Keep trailing empty fields (limit -1) and drop records with fewer than 85 fields.
      val filteredRDD: RDD[Array[String]] = data.map(_.split(",", -1)).filter(_.length >= 85)

      // parquet <- DataFrame <- SQLContext <- RDD
      val sqlc = new SQLContext(sc)

      // RDD[Array[String]] -> RDD[AdLog]
      val adLogRDD: RDD[AdLog] = filteredRDD.map(AdLog(_))
      val dataFrame = sqlc.createDataFrame(adLogRDD)

      // Remove the target directory so the write does not fail on an existing path.
      FileHelper.deleteDir(parquetOutPath, sc)

      // Plain (unpartitioned) Parquet write.
      dataFrame.write.parquet(parquetOutPath)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
dolphin-doit01\src\main\scala\cn\sheep\dolphin\bean\AdLog.scala
package cn.sheep.dolphin.bean
/**
 * Entity class wrapping one ad log record (85 fields).
 *
 * Implements [[Product]] by hand instead of being a case class
 * (NOTE(review): presumably to get around the 22-field case class limit of older Scala
 * versions — confirm against the project's Scala version).
 * Also marked Serializable so instances can be serialized (for example by Java serialization).
 *
 * author: old sheep
 * Created 2021/03/21 16:45
 */
class AdLog(
  val sessionid: String, // session identifier
  val advertisersid: Int, // advertiser id
  val adorderid: Int, // ad id
  val adcreativeid: Int, // ad creative id ( >= 200000 : dsp)
  val adplatformproviderid: Int, // ad platform provider id (>= 100000: rtb)
  val sdkversion: String, // sdk version
  val adplatformkey: String, // platform provider key
  val putinmodeltype: Int, // delivery mode for the advertiser, 1: by impressions 2: by clicks
  val requestmode: Int, // request mode (1: request, 2: impression, 3: click)
  val adprice: Double, // ad price
  val adppprice: Double, // platform provider price
  val requestdate: String, // request time, format: yyyy-m-dd hh:mm:ss
  val ip: String, // real ip address of the device user
  val appid: String, // application id
  val appname: String, // application name
  val uuid: String, // unique device identifier
  val device: String, // device model, e.g. htc, iphone
  val client: Int, // operating system (1: android 2: ios 3: wp)
  val osversion: String, // device OS version
  val density: String, // device screen density
  val pw: Int, // device screen width
  val ph: Int, // device screen height
  val long: String, // device longitude
  val lat: String, // device latitude
  val provincename: String, // province the device is in
  val cityname: String, // city the device is in
  val ispid: Int, // carrier id
  val ispname: String, // carrier name
  val networkmannerid: Int, // network connection type id
  val networkmannername: String, // network connection type name
  val iseffective: Int, // validity flag (valid = billable) (0: invalid, 1: presumably valid — original comment truncated)
  val isbilling: Int, // billing flag (0: not billed 1: billed)
  val adspacetype: Int, // ad slot type (1: banner 2: interstitial 3: full screen)
  val adspacetypename: String, // ad slot type name (banner, interstitial, full screen)
  val devicetype: Int, // device type (1: phone 2: tablet)
  val processnode: Int, // process node (1: request kpi 2: valid request 3: ad request — original comment truncated)
  val apptype: Int, // application type id
  val district: String, // county the device is in
  val paymode: Int, // payment mode for the platform provider, 1: by impressions (CPM) 2: by clicks
  val isbid: Int, // whether rtb
  val bidprice: Double, // rtb bid price
  val winprice: Double, // rtb winning price
  val iswin: Int, // whether the bid was won
  val cur: String, // values: usd|rmb etc.
  val rate: Double, // exchange rate
  val cnywinprice: Double, // rtb winning price converted to RMB
  val imei: String, // imei
  val mac: String, // mac
  val idfa: String, // idfa
  val openudid: String, // openudid
  val androidid: String, // androidid
  val rtbprovince: String, // rtb province
  val rtbcity: String, // rtb city
  val rtbdistrict: String, // rtb district
  val rtbstreet: String, // rtb street
  val storeurl: String, // app market download url
  val realip: String, // real ip
  val isqualityapp: Int, // preferred-app flag
  val bidfloor: Double, // floor price
  val aw: Int, // ad slot width
  val ah: Int, // ad slot height
  val imeimd5: String, // imei_md5
  val macmd5: String, // mac_md5
  val idfamd5: String, // idfa_md5
  val openudidmd5: String, // openudid_md5
  val androididmd5: String, // androidid_md5
  val imeisha1: String, // imei_sha1
  val macsha1: String, // mac_sha1
  val idfasha1: String, // idfa_sha1
  val openudidsha1: String, // openudid_sha1
  val androididsha1: String, // androidid_sha1
  val uuidunknow: String, // uuid_unknow, tanx ciphertext
  val userid: String, // platform user id
  val iptype: Int, // ip type
  val initbidprice: Double, // initial bid price
  val adpayment: Double, // converted ad spend
  val agentrate: Double, // agent profit rate
  val lrate: Double, // agency profit rate
  val adxrate: Double, // media profit rate
  val title: String, // title
  val keywords: String, // keywords
  val tagid: String, // ad slot identifier (the video ID for video traffic)
  val callbackdate: String, // callback time, format: YYYY/mm/dd hh:mm:ss
  val channelid: String, // channel ID
  val mediatype: Int // media type: 1 long-tail 2 video 3 independent, default: 1
) extends Product with Serializable {

  /**
   * Accesses a field by its positional index (same order as the constructor).
   *
   * @param n zero-based field index, valid range 0 to 84
   * @return the value of the n-th field
   * @throws IndexOutOfBoundsException if n is outside 0 to 84
   */
  override def productElement(n: Int): Any = n match {
    case 0 => sessionid
    case 1 => advertisersid
    case 2 => adorderid
    case 3 => adcreativeid
    case 4 => adplatformproviderid
    case 5 => sdkversion
    case 6 => adplatformkey
    case 7 => putinmodeltype
    case 8 => requestmode
    case 9 => adprice
    case 10 => adppprice
    case 11 => requestdate
    case 12 => ip
    case 13 => appid
    case 14 => appname
    case 15 => uuid
    case 16 => device
    case 17 => client
    case 18 => osversion
    case 19 => density
    case 20 => pw
    case 21 => ph
    case 22 => long
    case 23 => lat
    case 24 => provincename
    case 25 => cityname
    case 26 => ispid
    case 27 => ispname
    case 28 => networkmannerid
    case 29 => networkmannername
    case 30 => iseffective
    case 31 => isbilling
    case 32 => adspacetype
    case 33 => adspacetypename
    case 34 => devicetype
    case 35 => processnode
    case 36 => apptype
    case 37 => district
    case 38 => paymode
    case 39 => isbid
    case 40 => bidprice
    case 41 => winprice
    case 42 => iswin
    case 43 => cur
    case 44 => rate
    case 45 => cnywinprice
    case 46 => imei
    case 47 => mac
    case 48 => idfa
    case 49 => openudid
    case 50 => androidid
    case 51 => rtbprovince
    case 52 => rtbcity
    case 53 => rtbdistrict
    case 54 => rtbstreet
    case 55 => storeurl
    case 56 => realip
    case 57 => isqualityapp
    case 58 => bidfloor
    case 59 => aw
    case 60 => ah
    case 61 => imeimd5
    case 62 => macmd5
    case 63 => idfamd5
    case 64 => openudidmd5
    case 65 => androididmd5
    case 66 => imeisha1
    case 67 => macsha1
    case 68 => idfasha1
    case 69 => openudidsha1
    case 70 => androididsha1
    case 71 => uuidunknow
    case 72 => userid
    case 73 => iptype
    case 74 => initbidprice
    case 75 => adpayment
    case 76 => agentrate
    case 77 => lrate
    case 78 => adxrate
    case 79 => title
    case 80 => keywords
    case 81 => tagid
    case 82 => callbackdate
    case 83 => channelid
    case 84 => mediatype
    // Product documents IndexOutOfBoundsException for a bad index; without this case a
    // bad index surfaced as scala.MatchError.
    case _ => throw new IndexOutOfBoundsException(n.toString)
  }

  /**
   * Number of fields in this class.
   *
   * @return always 85
   */
  override def productArity: Int = 85

  /**
   * Checks whether `that` is of the same type as this class.
   *
   * @param that the object to test
   * @return true if `that` is an AdLog
   */
  override def canEqual(that: Any): Boolean = that.isInstanceOf[AdLog]
}
object AdLog {
  import cn.sheep.dolphin.bean.RichString._

  /**
   * Builds an AdLog from the split fields of one log line.
   *
   * Fields are read positionally from index 0 to 84, so the array must hold at least 85
   * elements. Numeric fields are converted with `toIntPlus` / `toDoublePlus`, which come
   * from the RichString import (NOTE(review): their handling of malformed input is
   * defined there, not here).
   *
   * @param arr the split fields of one log record
   * @return the populated AdLog
   */
  def apply(arr: Array[String]): AdLog = {
    // Small typed accessors keep the field mapping below readable.
    def strAt(i: Int) = arr(i)
    def intAt(i: Int) = arr(i).toIntPlus
    def doubleAt(i: Int) = arr(i).toDoublePlus

    new AdLog(
      sessionid = strAt(0),
      advertisersid = intAt(1),
      adorderid = intAt(2),
      adcreativeid = intAt(3),
      adplatformproviderid = intAt(4),
      sdkversion = strAt(5),
      adplatformkey = strAt(6),
      putinmodeltype = intAt(7),
      requestmode = intAt(8),
      adprice = doubleAt(9),
      adppprice = doubleAt(10),
      requestdate = strAt(11),
      ip = strAt(12),
      appid = strAt(13),
      appname = strAt(14),
      uuid = strAt(15),
      device = strAt(16),
      client = intAt(17),
      osversion = strAt(18),
      density = strAt(19),
      pw = intAt(20),
      ph = intAt(21),
      long = strAt(22),
      lat = strAt(23),
      provincename = strAt(24),
      cityname = strAt(25),
      ispid = intAt(26),
      ispname = strAt(27),
      networkmannerid = intAt(28),
      networkmannername = strAt(29),
      iseffective = intAt(30),
      isbilling = intAt(31),
      adspacetype = intAt(32),
      adspacetypename = strAt(33),
      devicetype = intAt(34),
      processnode = intAt(35),
      apptype = intAt(36),
      district = strAt(37),
      paymode = intAt(38),
      isbid = intAt(39),
      bidprice = doubleAt(40),
      winprice = doubleAt(41),
      iswin = intAt(42),
      cur = strAt(43),
      rate = doubleAt(44),
      cnywinprice = doubleAt(45),
      imei = strAt(46),
      mac = strAt(47),
      idfa = strAt(48),
      openudid = strAt(49),
      androidid = strAt(50),
      rtbprovince = strAt(51),
      rtbcity = strAt(52),
      rtbdistrict = strAt(53),
      rtbstreet = strAt(54),
      storeurl = strAt(55),
      realip = strAt(56),
      isqualityapp = intAt(57),
      bidfloor = doubleAt(58),
      aw = intAt(59),
      ah = intAt(60),
      imeimd5 = strAt(61),
      macmd5 = strAt(62),
      idfamd5 = strAt(63),
      openudidmd5 = strAt(64),
      androididmd5 = strAt(65),
      imeisha1 = strAt(66),
      macsha1 = strAt(67),
      idfasha1 = strAt(68),
      openudidsha1 = strAt(69),
      androididsha1 = strAt(70),
      uuidunknow = strAt(71),
      userid = strAt(72),
      iptype = intAt(73),
      initbidprice = doubleAt(74),
      adpayment = doubleAt(75),
      agentrate = doubleAt(76),
      lrate = doubleAt(77),
      adxrate = doubleAt(78),
      title = strAt(79),
      keywords = strAt(80),
      tagid = strAt(81),
      callbackdate = strAt(82),
      channelid = strAt(83),
      mediatype = intAt(84)
    )
  }
}
dolphin-doit01\src\main\scala\cn\sheep\dolphin\utils\FileHelper.scala
package cn.sheep.dolphin.utils
import java.io.File
import org.apache.commons.io.FileUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkContext
/**
 * File handling helpers.
 *
 * author: old sheep
 * Created 2021/03/21 17:16
 */
object FileHelper {

  /**
   * Recursively deletes a directory on the local file system. Does nothing when the
   * path does not exist. This method only works for local paths.
   *
   * @param path local directory path
   */
  def deleteTargetDir(path: String): Unit = {
    val file = new File(path)
    if (file.exists()) {
      // Recursive delete of the directory and its contents.
      FileUtils.deleteDirectory(file)
    }
  }

  /**
   * Recursively deletes a directory on whichever Hadoop file system the path belongs to.
   * Does nothing when the path does not exist.
   *
   * The file system is resolved from the path itself: a path with an explicit scheme
   * (e.g. hdfs://, file://) uses that scheme's file system, and a path without one falls
   * back to the default configured in fs.defaultFS. Resolving only the default file
   * system would reject paths whose scheme differs from it.
   *
   * @param dirPath directory to delete
   * @param sc      Spark context supplying the Hadoop configuration
   * @return the result of the delete call when the directory existed, otherwise unit
   */
  def deleteDir(dirPath: String, sc: SparkContext) = {
    val hadoopConfiguration = sc.hadoopConfiguration
    val path = new Path(dirPath)
    val fs: FileSystem = path.getFileSystem(hadoopConfiguration)
    if (fs.exists(path)) {
      // Second argument true: delete recursively.
      fs.delete(path, true)
    }
  }
}
配置程序运行参数(依次为bz2InputPath和parquetOutPath)
运行Bz2ParquetV2程序,控制台打印输出如下
{
"type" : "struct",
"fields" : [ {
"name" : "sessionid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "advertisersid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adorderid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adcreativeid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adplatformproviderid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "sdkversion",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adplatformkey",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "putinmodeltype",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "requestmode",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adprice",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adppprice",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "requestdate",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "ip",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "appid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "appname",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "uuid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "device",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "client",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "osversion",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "density",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "pw",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "ph",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "long",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "lat",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "provincename",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "cityname",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "ispid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "ispname",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "networkmannerid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "networkmannername",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "iseffective",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "isbilling",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adspacetype",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adspacetypename",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "devicetype",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "processnode",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "apptype",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "district",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "paymode",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "isbid",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "bidprice",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "winprice",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "iswin",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "cur",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "rate",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "cnywinprice",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "imei",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "mac",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "idfa",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "openudid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "androidid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "rtbprovince",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "rtbcity",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "rtbdistrict",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "rtbstreet",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "storeurl",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "realip",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "isqualityapp",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "bidfloor",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "aw",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "ah",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "imeimd5",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "macmd5",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "idfamd5",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "openudidmd5",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "androididmd5",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "imeisha1",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "macsha1",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "idfasha1",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "openudidsha1",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "androididsha1",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "uuidunknow",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "userid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "iptype",
"type" : "integer",
"nullable" : true,
"metadata" : { }
}, {
"name" : "initbidprice",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adpayment",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "agentrate",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "lrate",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "adxrate",
"type" : "double",
"nullable" : true,
"metadata" : { }
}, {
"name" : "title",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "keywords",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "tagid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "callbackdate",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "channelid",
"type" : "string",
"nullable" : true,
"metadata" : { }
}, {
"name" : "mediatype",
"type" : "integer",
"nullable" : true,
"metadata" : { }
} ]
}
在输出文件路径下查看生成的parquet文件