/**
 * Daily Spark batch job over the 15-minute EDPI snapshot data of one day.
 *
 * Pipeline, as implemented below:
 *  1. Read the day's parquet partition.
 *  2. Fan every input row out to 30 aggregation keys
 *     (5 target tables x IPv4/IPv6 x province/city/BRAS level).
 *  3. Merge the entities of equal keys with `addEnttiy`.
 *  4. Flatten each merged entity into a row of [[EdpiTableTmp1Schema]] and
 *     register the result as temp view `edpi_table_tmp1`.
 *  5. Run the per-table SQL statements from `EdpiSql` against that view.
 *
 * Usage: `EdpiAnalyseFile <yyyyMMdd> [yarnQueue]`
 */
object EdpiAnalyseFile {
  @transient val logger = LoggerFactory.getLogger(EdpiAnalyseFile.getClass)

  /** YARN queue used when none is passed on the command line. */
  private val DefaultQueueName = "crflow"

  /** Literal placeholder inside the `EdpiSql` templates (a plain string, not an interpolation). */
  private val DayIdPlaceholder = "${day_id}"

  /**
   * Job entry point.
   *
   * @param args `args(0)` is the day to process (`yyyyMMdd`); the optional
   *             `args(1)` is the YARN queue, defaulting to `crflow`.
   * @throws IllegalArgumentException when the date argument is missing
   */
  def main(args: Array[String]): Unit = {
    require(args.nonEmpty, "usage: EdpiAnalyseFile <yyyyMMdd> [yarnQueue]")
    val date = args(0) // yyyyMMdd
    // The original `args(1) != null` test threw ArrayIndexOutOfBounds when the
    // queue was omitted, so the default could never apply; `lift` makes it optional.
    val queueName = args.lift(1).flatMap(Option(_)).getOrElse(DefaultQueueName)
    // For a local run, point this at a local directory and add .master("local[*]") below.
    val input = s"hdfs://hdfsunity/data/zb_zhw_gw_rt/hive/warehouse/dws_zb_zhw_gw_rt.db/dws_user_netquality_edpi_user_req_iptype_topn_domain_15minsnapshot_d_inc/day_id=${date}/*/"
    logger.warn("运行日期 =>" + date)
    logger.warn("日志目录 =>" + input)

    // Driver-side call of the model with fixed sample values; its result is only printed.
    println(ZhichaModel.getModel2IsZhicha(1, 4, 500, 0.05, 1024))

    val spark = SparkSession.builder()
      .appName("EdpiAnalyseFile")
      .config("spark.sql.parquet.binaryAsString", "true")
      .config("spark.yarn.queue", queueName)
      .enableHiveSupport()
      .getOrCreate()

    // try/finally: the session is closed even when a read or SQL statement fails.
    try {
      val beginTime = System.currentTimeMillis()
      val sc = spark.sparkContext
      val dateBC = sc.broadcast(date)
      // Per the original note: for municipalities the city column may be empty in
      // the table, so the province name is used as the city for entries of this list.
      val city4ListBC = sc.broadcast(EdpiConstant.CITY_LIST)
      val provMappingBC = sc.broadcast(EdpiConstant.PROV_MAPPING)

      val df = spark.read.parquet(input)

      // Stage 1: one input row -> 30 (key, entity) pairs, then merge per key.
      // Key layout: (tblFlag, logType, province, ipVersion, userType, city, brasIp,
      //              lev2Domain, pppoeAccount, dstIp, agg15minuteTime)
      val tmpRes = df.rdd.flatMap { row =>
        val entity = new EdpiDwsEntity15Min(row)
        val provinceName = entity.province_cn
        val city = if (city4ListBC.value.contains(provinceName)) provinceName else entity.city_cn
        val brasIp = entity.bras_ipv4_address
        val lev3Domain = entity.lev3_domain
        val pppoeAccount = entity.pppoe_account
        val dstIp = entity.dst_ip
        val agg15minuteTime = entity.agg_15minute_time
        // 1 = broadband account, 2 = dedicated-line account.
        // The interpolation is kept on purpose: it turns a null province into "null".
        val userType = if (Aaa_Clean.getACtype(s"${provinceName}", pppoeAccount) != 2) 1 else 2
        // Resource id (website_id) is mapped from the second-level domain, which is
        // what gets stored here; fall back to the third-level domain when the
        // mapping yields null. The lookup is now done once instead of twice.
        val lev2Domain = Option(RSDomainUtil.transformUrl(2, lev3Domain)).getOrElse(lev3Domain)

        // Per target table: (tblFlag, userType, lev2Domain, pppoeAccount, dstIp, agg15minuteTime);
        // dimensions a table does not group by are "" (or -1 for userType).
        val tables = Seq(
          ("edpi_desc_1st", userType, "", "", "", ""),                                // tbl_content_edpi_desc_info
          ("website_point_2nd", -1, lev2Domain, "", "", ""),                          // tbl_website_point_new_info
          ("alive_quality_3rd", userType, "", pppoeAccount, "", ""),                  // tbl_alive_quality_info
          ("website_analyse_4th", -1, lev2Domain, pppoeAccount, dstIp, ""),           // tbl_website_analyse_info_ip
          ("website_domain_point_7th", -1, lev2Domain, "", "", agg15minuteTime)       // tbl_website_domain_point_new_info
        )
        // Per aggregation level: (logType, city, brasIp); "-1" marks "not grouped by".
        val levels = Seq(
          (1, "-1", "-1"),  // province
          (2, city, "-1"),  // city
          (3, city, brasIp) // BRAS
        )

        for {
          (tblFlag, keyUserType, domain, account, ip, timeSlot) <- tables
          ipVersion <- Seq(4, 6)
          (logType, cityKey, brasKey) <- levels
        } yield (
          (tblFlag, logType, provinceName, ipVersion, keyUserType, cityKey, brasKey, domain, account, ip, timeSlot),
          // A fresh entity per key, as in the original: addEnttiy presumably merges
          // into its receiver, so sharing one instance across keys could mix
          // aggregates — TODO confirm before reusing `entity` here.
          new EdpiDwsEntity15Min(row)
        )
      }.reduceByKey((merged, next) => merged.addEnttiy(next))
        .cache()

      // Stage 2: flatten each merged entity into the column order of EdpiTableTmp1Schema.
      val edpiDesc1stdf = tmpRes.map {
        case ((tblFlag, logType, provinceName, ipVersion, userType, cityName, brasIp,
               lev2Domain, pppoeAccount, dstIp, agg15minuteTime), entity) =>
          val provinceId = provMappingBC.value.getOrDefault(provinceName, -1)
          // Select the IPv4 or IPv6 metric family matching the key.
          val (totalDelay, lossPackRate, maxDownloadSpeed, requestCnt, totalBytes,
               connDelay, firstPacketDelay, avgDownloadSpeed) =
            if (ipVersion == 4) (
              entity.domain_user_ipv4_http_total_delay_avg_ms_15min,
              entity.domain_user_ipv4_http_loss_packet_ratio_15min,
              entity.domain_user_ipv4_http_dl_flow_max_bps_15min,
              entity.domain_user_ipv4_http_cnt_15min,
              entity.domain_user_ipv4_http_trans_data_total_byte_15min,
              entity.domain_user_ipv4_http_conn_delay_avg_ms_15min,
              entity.domain_user_ipv4_http_1st_packet_delay_avg_ms_15min,
              entity.domain_user_ipv4_http_dl_flow_avg_bps_15min
            ) else (
              entity.domain_user_ipv6_http_total_delay_avg_ms_15min,
              entity.domain_user_ipv6_http_loss_packet_ratio_15min,
              entity.domain_user_ipv6_http_dl_flow_max_bps_15min,
              entity.domain_user_ipv6_http_cnt_15min,
              entity.domain_user_ipv6_http_trans_data_total_byte_15min,
              entity.domain_user_ipv6_http_conn_delay_avg_ms_15min,
              entity.domain_user_ipv6_http_1st_packet_delay_avg_ms_15min,
              entity.domain_user_ipv6_http_dl_flow_avg_bps_15min
            )
          val isQuality =
            if (ZhichaModel.getModel2IsZhicha(logType, ipVersion, totalDelay, lossPackRate, maxDownloadSpeed)) 1 else 0
          // NOTE(review): the score model is only applied to IPv4 keys; IPv6 rows
          // always carry 0.0, exactly as before — confirm this is intended.
          val qualityScore =
            if (ipVersion == 4)
              ZhichaModel.getModel1Score(logType, ipVersion, List((1, totalDelay), (2, lossPackRate), (3, maxDownloadSpeed)))
            else 0.0
          val dstIpArea = 0 // constant in this job
          Row(
            dateBC.value, logType, provinceName, ipVersion, userType, cityName, brasIp, isQuality,
            lev2Domain, tblFlag, pppoeAccount, dstIp, agg15minuteTime, dstIpArea,
            qualityScore,
            requestCnt,
            totalBytes,
            connDelay,
            firstPacketDelay,
            lossPackRate,
            avgDownloadSpeed,
            totalDelay, maxDownloadSpeed, provinceId
          )
      }
      spark.createDataFrame(edpiDesc1stdf, EdpiTableTmp1Schema).createOrReplaceTempView("edpi_table_tmp1")

      // Table 1
      runDailySql(spark, date,
        "edpiDesc1stProvSqlDay" -> EdpiSql.edpiDesc1stProvSql,
        "edpiDesc1stCitySqlDay" -> EdpiSql.edpiDesc1stCitySql,
        "edpiDesc1stBrasSqlDay" -> EdpiSql.edpiDesc1stBrasSql)
      // Table 2
      runDailySql(spark, date,
        "websitePoint2ndProvSqlDay" -> EdpiSql.websitePoint2ndProvSql,
        "websitePoint2ndCitySqlDay" -> EdpiSql.websitePoint2ndCitySql,
        "websitePoint2ndBrasSqlDay" -> EdpiSql.websitePoint2ndBrasSql)
      // Table 3
      runDailySql(spark, date,
        "aliveQuality3rdProvSqlDay" -> EdpiSql.aliveQuality3rdProvSql,
        "aliveQuality3rdCitySqlDay" -> EdpiSql.aliveQuality3rdCitySql,
        "aliveQuality3rdBrasSqlDay" -> EdpiSql.aliveQuality3rdBrasSql)
      // Table 4
      runDailySql(spark, date,
        "websiteAnalyse4thProvSqlDay" -> EdpiSql.websiteAnalyse4thProvSql,
        "websiteAnalyse4thCitySqlDay" -> EdpiSql.websiteAnalyse4thCitySql,
        "websiteAnalyse4thBrasSqlDay" -> EdpiSql.websiteAnalyse4thBrasSql)
      // Table 5: tbl_domain_top100_info, derived from table 2.
      // The log label used to read "domain100Top4thProvSqlDay"; corrected to 5th.
      runDailySql(spark, date,
        "domain100Top5thProvSqlDay" -> EdpiSql.domain100Top5thProvSql)
      // Table 6: tbl_domain_top100_info_ip, derived from table 4.
      runDailySql(spark, date,
        "domainIP100Top6thProvSqlDay" -> EdpiSql.domainIP100Top6thProvSql)
      // Table 7: tbl_website_domain_point_new_info
      runDailySql(spark, date,
        "websiteDomainPoint7thProvSqlDay" -> EdpiSql.websiteDomainPoint7thProvSql,
        "websiteDomainPoint7thCitySqlDay" -> EdpiSql.websiteDomainPoint7thCitySql,
        "websiteDomainPoint7thBrasSqlDay" -> EdpiSql.websiteDomainPoint7thBrasSql)
      // Table 8: tbl_website_analyse_info, which reads the website_tmp view.
      spark.sql(EdpiSql.WebsiteTmpSql).createOrReplaceTempView("website_tmp")
      runDailySql(spark, date,
        "websiteAnalyse8thProvSqlDay" -> EdpiSql.websiteAnalyse8thProvSql,
        "websiteAnalyse8thCitySqlDay" -> EdpiSql.websiteAnalyse8thCitySql,
        "websiteAnalyse8thBrasSqlDay" -> EdpiSql.websiteAnalyse8thBrasSql)

      val duration = (System.currentTimeMillis() - beginTime) / 1000 / 60 // whole minutes
      logger.warn(this.getClass.getName + "==> 程序运行时间(分): " + duration)
    } finally {
      spark.close()
    }
  }

  /**
   * Substitutes the run date into each SQL template, logs the resulting
   * statements on one line as `[label ==> sql] [label ==> sql] ...`, then
   * executes them in the given order. The returned DataFrames are discarded.
   *
   * @param spark     session to run the statements on
   * @param date      value replacing the `${day_id}` placeholder
   * @param templates `(log label, SQL template)` pairs
   */
  private def runDailySql(spark: SparkSession, date: String, templates: (String, String)*): Unit = {
    val statements = templates.map { case (label, template) =>
      (label, template.replace(DayIdPlaceholder, date))
    }
    logger.warn(statements.map { case (label, sql) => "[" + label + " ==> " + sql + "]" }.mkString(" "))
    statements.foreach { case (_, sql) => spark.sql(sql) }
  }

  /**
   * Schema of the `edpi_table_tmp1` temp view; the field order must match the
   * `Row` built in [[main]].
   */
  val EdpiTableTmp1Schema: StructType = StructType(
    Array(
      StructField("record_date", StringType, true),         // date
      StructField("log_type", IntegerType, true),           // log type (aggregation level 1/2/3)
      StructField("province_cn", StringType, true),         // province
      StructField("ip_version", IntegerType, true),         // IP version: 4 or 6
      StructField("user_type", IntegerType, true),          // 1 broadband account, 2 dedicated-line account
      StructField("city_cn", StringType, true),             // city / area
      StructField("bras_ip", StringType, true),             // BRAS IP
      StructField("zhicha_type", IntegerType, true),        // poor-quality flag
      StructField("lev2_domain", StringType, true),         // second-level domain
      StructField("tbl_flag", StringType, true),            // target-table discriminator
      StructField("pppoe_account", StringType, true),       // account
      StructField("dst_ip", StringType, true),              // resolved IP
      StructField("agg_15minute_time", StringType, true),   // 15-minute time slot, HH:MM
      StructField("dst_ip_area", IntegerType, true),        // destination IP area; main always writes 0
      StructField("quality_score", DoubleType, true),       // quality score
      StructField("url_sum", LongType, true),               // request count
      StructField("all_flow", LongType, true),              // total bytes transferred
      StructField("avg_conn_delay", DoubleType, true),      // average connection delay
      StructField("avg_first_delay", DoubleType, true),     // average first-packet delay
      StructField("avg_lost_rate", DoubleType, true),       // packet-loss rate
      StructField("avg_download_speed", DoubleType, true),  // average download speed
      StructField("avg_all_dalay", DoubleType, true),       // total delay (column name spelling kept as-is)
      StructField("max_download_speed", DoubleType, true),  // peak download speed
      StructField("province_id", IntegerType, true)         // province id
    )
  )
}
// [Untitled]
// Latest recommended article published 2024-01-18 23:25:36