数据预处理整体代码实现
数据预处理的逻辑在集群和 IDEA 中都能执行,下面是我在 IDEA 中的执行代码
代码执行的前提(参数):
- app埋点日志原始文件输入路径
- geohash地理位置字典输入路径
- 当日的idmp映射字典所在路径
- 预处理结果输出路径
- spark运行模式的master
注意
参数文件的路径规则:在 IDEA 中可以直接输入本地路径;若显式指定本地文件系统,则在路径前加上 file:// 前缀;若文件在 HDFS 上,则需要写成 hdfs://主机名:端口号/路径。master 参数:在集群运行时写 yarn,在 IDEA 本地运行时写 local[*]
注意
我们在解析json格式的文件的时候我们需要封装一个case class AppLogBean标准容器
// Standard container for one parsed app event-log record.
// NOTE(review): field names (including the misspelled "longtitude") mirror the
// keys of the source JSON and are referenced downstream — do not rename.
case class AppLogBean(
var guid:Long, // global unique user id; Long.MinValue until resolved from the idmp dictionary
eventid: String, // event type identifier
event: Map[String, String], // event attribute key/value pairs
uid: String, // account/user id
imei: String, // device IMEI
mac: String, // device MAC address
imsi: String, // SIM IMSI
osName: String, // operating system name
osVer: String, // operating system version
androidId: String, // Android id
resolution: String, // screen resolution
deviceType: String, // device model/type
deviceId: String, // device id
uuid: String, // app-installation uuid
appid: String, // application id
appVer: String, // application version
release_ch: String, // download (release) channel
promotion_ch: String, // promotion channel
longtitude: Double, // longitude (sic — spelling kept to match source JSON key)
latitude: Double, // latitude
carrier: String, // mobile carrier
netType: String, // network type
cid_sn: String, // base-station cell id
ip: String, // client ip address
sessionId: String, // session identifier
timestamp: Long, // event time, epoch millis
var province:String="未知", // province, filled from the geo dictionary ("未知" = unknown)
var city:String="未知", // city, filled from the geo dictionary
var district:String="未知" // district, filled from the geo dictionary
)
接下来我们真正地进行数据预处理(亲测可用)
创建spark环境处理地理位置信息
object AppLogDataPreprocess {

  /**
   * App event-log preprocessing job.
   *
   * Enriches raw JSON event logs with area names (via a geohash dictionary)
   * and a global user id (via the idmp mapping dictionary), then writes the
   * result as parquet (ODS / source layer).
   *
   * Expected args (all five, in order); when absent, the original hard-coded
   * development values are used so existing invocations keep working:
   *   0 - raw app event-log input path
   *   1 - geohash area-dictionary input path
   *   2 - current day's idmp mapping dictionary path
   *   3 - preprocessing result output path
   *   4 - spark master ("local[*]" in the IDE, "yarn" on a cluster)
   */
  def main(args: Array[String]): Unit = {

    // Fall back to the hard-coded development paths when no args are supplied.
    val Array(logPath, geoPath, idmpPath, outPath, master) =
      if (args.length >= 5) args.take(5)
      else Array(
        "hdfs://doit01:9000/doit12/data/applog/2020-01-13",
        "hdfs://doit01:9000/doit12/data/dict/area",
        "hdfs://doit01:9000/doit12/data/dict/idmp/2020-01-13",
        "hdfs://doit01:9000/doit12/data/applog-output/2020-01-13",
        "local[*]"
      )

    // Build the SparkSession
    val spark = SparkUtil.getSparkSession(this.getClass.getSimpleName, master)
    import spark.implicits._

    // ---- Load the geohash -> (province, city, district) area dictionary ----
    val geodf = spark.read.parquet(geoPath)
    val geoMap: collection.Map[String, (String, String, String)] = geodf.rdd.map(row => {
      val geo = row.getAs[String]("geo")
      val province = row.getAs[String]("province")
      val city = row.getAs[String]("city")
      val district = row.getAs[String]("district")
      (geo, (province, city, district))
    }).collectAsMap()
    // Broadcast: the dictionary is small enough to ship to every executor.
    val bc_geo = spark.sparkContext.broadcast(geoMap)

    // ---- Load the idmp unique-user-id dictionary (id hashcode -> guid) ----
    val idmpdf = spark.read.parquet(idmpPath)
    val idMap = idmpdf.rdd.map(row => {
      val id = row.getAs[Long]("biaoshi_hashcode")
      val guid = row.getAs[Long]("guid")
      (id, guid)
    }).collectAsMap()
    val bc_id = spark.sparkContext.broadcast(idMap)

    // ---- Load the day's raw app event-log file as a Dataset[String] ----
    val appDs: Dataset[String] = spark.read.textFile(logPath)

    // Parse each JSON line into an AppLogBean; malformed or invalid records
    // yield null and are filtered out below (best-effort parsing by design).
    val beans = appDs.map(line => {
      var bean: AppLogBean = null
      try {
        val jsonobj = JSON.parseObject(line)
        val eventid = jsonobj.getString("eventid")
        val timestamp = jsonobj.getString("timestamp").toLong

        // Event attributes: convert the fastjson inner map to an immutable Scala Map.
        // Explicit JavaConverters instead of the deprecated JavaConversions implicits.
        val eventobj: JSONObject = jsonobj.getJSONObject("event")
        import scala.collection.JavaConverters._
        val javaMap: util.Map[String, String] = eventobj.getInnerMap.asInstanceOf[util.Map[String, String]]
        val event: Map[String, String] = javaMap.asScala.toMap

        val userobj = jsonobj.getJSONObject("user")
        val uid = userobj.getString("uid")
        val sessionId = userobj.getString("sessionId")

        val phoneobj = userobj.getJSONObject("phone")
        val imei = phoneobj.getString("imei")
        val mac = phoneobj.getString("mac")
        val imsi = phoneobj.getString("imsi")
        val osName = phoneobj.getString("osName")
        val osVer = phoneobj.getString("osVer")
        val androidId = phoneobj.getString("androidId")
        val resolution = phoneobj.getString("resolution")
        val deviceType = phoneobj.getString("deviceType")
        val deviceId = phoneobj.getString("deviceId")
        val uuid = phoneobj.getString("uuid")

        val appobj = userobj.getJSONObject("app")
        val appid = appobj.getString("appid")
        val appVer = appobj.getString("appVer")
        val release_ch = appobj.getString("release_ch") // download channel
        val promotion_ch = appobj.getString("promotion_ch") // promotion channel

        val locobj = userobj.getJSONObject("loc")
        // Defaults mark "no valid coordinates"; a missing/null loc value throws
        // inside getDouble and leaves the defaults in place.
        var lng = 0.0
        var lat = -90.0
        try {
          lng = locobj.getDouble("longtitude")
          lat = locobj.getDouble("latitude")
        } catch {
          case e: Exception => e.printStackTrace()
        }
        val carrier = locobj.getString("carrier")
        val netType = locobj.getString("netType")
        val cid_sn = locobj.getString("cid_sn")
        val ip = locobj.getString("ip")

        // Validity rule: at least one identifying field must be non-blank
        // (literal "null" strings do not count), and the record must carry
        // an eventid and a sessionId.
        val tmp = (imei + imsi + mac + uid + uuid + androidId).replaceAll("null", "")
        if (StringUtils.isNotBlank(tmp) && event != null && StringUtils.isNotBlank(eventid) && StringUtils.isNotBlank(sessionId)) {
          // Wrap the extracted fields in the AppLogBean container.
          bean = AppLogBean(
            Long.MinValue, // guid resolved later from the idmp dictionary
            eventid,
            event,
            uid,
            imei,
            mac,
            imsi,
            osName,
            osVer,
            androidId,
            resolution,
            deviceType,
            deviceId,
            uuid,
            appid,
            appVer,
            release_ch,
            promotion_ch,
            lng,
            lat,
            carrier,
            netType,
            cid_sn,
            ip,
            sessionId,
            timestamp
          )
        }
      } catch {
        case e: Exception =>
          // Best-effort: log the bad line's error and drop the record.
          e.printStackTrace()
      }
      bean
    })

    // Drop records that failed to parse or failed the validity rule.
    val beans2 = beans.filter(_ != null)

    beans2.map(bean => {
      val geoDict = bc_geo.value
      val idmpDict = bc_id.value

      // ---- Look up the geo dictionary and fill in province/city/district ----
      val lat = bean.latitude
      val lng = bean.longtitude
      // Precision 5 geohash (~4.9km x 4.9km cell), matching the dictionary keys.
      val mygeo = GeoHash.geoHashStringWithCharacterPrecision(lat, lng, 5)
      val maybeTuple: Option[(String, String, String)] = geoDict.get(mygeo)
      if (maybeTuple.isDefined) {
        val areaNames = maybeTuple.get
        bean.province = areaNames._1
        bean.city = areaNames._2
        bean.district = areaNames._3
      }

      // ---- Look up the idmp mapping dictionary and fill in the guid ----
      // Use the first non-blank identifier; headOption/foreach avoids the
      // ArrayIndexOutOfBoundsException the original (0) access could throw
      // if every identifier were blank.
      val ids = Array(bean.imei, bean.imsi, bean.mac, bean.androidId, bean.uuid, bean.uid)
      ids.find(StringUtils.isNotBlank(_)).foreach { mouId =>
        val maybeLong = idmpDict.get(mouId.hashCode.toLong)
        if (maybeLong.isDefined) {
          bean.guid = maybeLong.get
        }
      }
      bean
    })
      // ---- Keep only records with a resolved guid and write the source layer ----
      .filter(bean => bean.guid != Long.MinValue)
      .toDF()
      .write
      .parquet(outPath)

    spark.close()
  }
}