#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @desc : Claims-activity tag job: computes each user's level-5 claims-activity tag and upserts it into ES
__author__ = "bytedance"

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType
from com.bytedance.tag.base.parse_rule import EsMeta
import pyspark.sql.functions as F
import os
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ["PYSPARK_PYTHON"] = '/root/anaconda3/envs/pyspark_env/bin/python3'
if __name__ == '__main__':
# 1. Create the SparkSession object
spark = SparkSession.builder \
.config("spark.sql.shuffle.partitions", 1) \
.appName("claims_tag_job") \
.master('local[*]') \
.getOrCreate()

# The level-4 tag ID for claims activity
four_tag_id = 33
# 2. Read the tag and business data
# 2.1 - Read all tag definitions from MySQL
tag_df = spark.read.jdbc(
url="jdbc:mysql://192.168.88.166:3306/tags_new?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false",
table="insurance_basic_tag",
properties={'user': 'root', 'password': '123456'}
)
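# insurance_basic_tag is assumed to hold the full tag hierarchy; only the
# columns used below matter here: id, pid (parent tag id), rule and name.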
# 2.2 - Filter out the level-4 tag row (id = four_tag_id) and keep its rule column
four_tag_df: DataFrame = tag_df.where(f"id={four_tag_id}").select("rule")
# 2.3 - Extract the rule string
rule_str = four_tag_df.first()['rule']
# 2.4 - Parse the rule string into an EsMeta object
rule_obj = EsMeta.parse_rule(rule_str)
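# Hedged sketch of the rule string (its exact format is defined by
# EsMeta.parse_rule, which is not shown here) -- assumed to be a ##-separated
# key=value list, with made-up values for illustration, e.g.:
#   inType=Elasticsearch##esNodes=192.168.88.166:9200##esIndex=claim_info##selectFields=user_id,pol_no,buy_datetime,claim_date,claim_mnt
# Only the attributes used below are relied on: esNodes, esIndex, selectFields.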
# 2.5 - Read the business data from ES according to the rule
business_df = spark.read.format("es") \
.option("es.nodes", rule_obj.esNodes) \
.option("es.resource", f"{rule_obj.esIndex}") \
.option("es.read.field.include", f"{rule_obj.selectFields}") \
.option("es.mapping.date.rich", False) \
.load()
'''
business_df.show()
+------------+----------+---------+-------------+-------+
|buy_datetime|claim_date|claim_mnt| pol_no|user_id|
+------------+----------+---------+-------------+-------+
| 2019-08-18|2021-02-08| 5386|P000000200945| 1-422|
| 2017-10-01|2019-03-16| 6589| P000054052| 1-423|
| 2020-09-15|2021-11-27| 6801| P000062516| 1-424|
| 2022-06-06|2022-06-18| 8736|P000000165361| 1-425|
'''
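# Column meanings (as read from ES): buy_datetime = policy purchase date,
# claim_date = claim settlement date, claim_mnt = claim amount (assumed),
# pol_no = policy number, user_id = user identifier. Only the purchase-to-claim
# gap and the policy count per user are used below.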
# 2.6 - Filter the level-5 tags (children of the level-4 tag) out of the tag data
five_tag_df: DataFrame = tag_df.where(f"pid={four_tag_id}").select("id", "rule", "name")
'''
five_tag_df.show()
+---+----------+--------+
| id| rule| name|
+---+----------+--------+
| 34| 0-365|非常活跃|
| 35| 366-999| 活跃|
| 36|1000-36500| 不活跃|
+---+----------+--------+
'''
# 3. Data processing: average number of days between purchase and claim per user
new_business_df = business_df.groupby('user_id').agg(
    F.round(
        F.sum(F.datediff('claim_date', 'buy_datetime')) / F.count('pol_no'),
        2
    ).alias('day')
)
'''
new_business_df.show()
+-------+------+
|user_id| day|
+-------+------+
| 1-422| 291.5|
| 1-423| 531.0|
| 1-424| 438.0|
| 1-425| 27.5|
| 1-426| 656.0|
| 1-427| 596.0|
'''
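# Worked example consistent with the sample above: user 1-425 bought a policy
# on 2022-06-06 and claimed on 2022-06-18, a 12-day gap; assuming their other
# policy (not shown in the truncated business_df sample) had a 43-day gap,
# day = round((12 + 43) / 2, 2) = 27.5.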
new_five_tag_df = five_tag_df.select(
'id',
F.split('rule', '-')[0].alias('start'),
F.split('rule', '-')[1].alias('end')
)
'''
new_five_tag_df.show()
+---+-----+-----+
| id|start| end|
+---+-----+-----+
| 34| 0| 365|
| 35| 366| 999|
| 36| 1000|36500|
+---+-----+-----+
'''
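# Match each user's average 'day' against the [start, end] range of every
# level-5 tag: an unconditioned join (effectively a cross join) followed by a
# BETWEEN filter. start/end stay strings here, so the comparison relies on
# Spark's implicit string-to-numeric casting; since the ranges do not overlap,
# each user ends up with exactly one level-5 tag id.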
result_df: DataFrame = new_business_df.join(new_five_tag_df) \
    .where('day between start and end') \
    .select(
        new_business_df['user_id'],
        new_five_tag_df['id'].cast(StringType()).alias('tag_ids')
    )
result_df.show()
# 3.4 - Merge the new tags with the old ones
# 3.4.1 - Read the old tag data
old_tag_df = spark.read.format("es") \
.option("es.nodes", "192.168.88.166:9200") \
.option("es.resource", "insurance_result") \
.load()
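# insurance_result is the same index this job writes to in step 4; it already
# holds tag_ids written by earlier runs (and, presumably, by other tag jobs),
# so it is read back here and merged instead of being blindly overwritten.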
# 3.4.2 - Full join of the new and old tag data
# 3.4.2.1 - Custom Python UDF that merges the new and old tags and removes duplicates
@F.udf(returnType=StringType())
def merge_old_and_new_tag(old_tag, new_tag):
# Robustness: check for None / empty values first; converting to str up front
# would turn a missing new tag into the literal string "None"
if new_tag is None or len(str(new_tag)) == 0:
    return old_tag
if old_tag is None or len(old_tag) == 0:
    return str(new_tag)
new_tag = str(new_tag)
# Convert the old and new tag strings into lists
old_tag_list = old_tag.split(",")
new_tag_list = new_tag.split(",")
# Merge the old and new tags, preserving order
new_list = []
new_list_sum = old_tag_list + new_tag_list
for i in new_list_sum:
if i not in new_list:
new_list.append(i)
# Return the deduplicated, comma-joined result
return ",".join(new_list)
merge_tag_df = result_df.join(old_tag_df, result_df["user_id"] == old_tag_df["user_id"], how="full").select(
F.coalesce(result_df["user_id"], old_tag_df["user_id"]).alias("user_id"),
merge_old_and_new_tag(old_tag_df["tag_ids"], result_df["tag_ids"]).alias("tag_ids")
)
'''
merge_tag_df.show()
+-------+-----------+
|user_id| tag_ids|
+-------+-----------+
| 1-1|6,16,64,124|
| 1-10|6,19,63,123|
| 1-100|6,16,64,125|
| 1-101|6,19,63,125|
'''
# 4. Write the merged result back to ES
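# es.mapping.id makes user_id the Elasticsearch document _id, and combined with
# es.write.operation=upsert an existing document is updated in place rather
# than duplicated; es.index.auto.create lets the connector create the index if
# it does not exist yet.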
merge_tag_df.write.format("es") \
.option("es.nodes", "192.168.88.166:9200") \
.option("es.resource", "insurance_result") \
.option("es.mapping.id", "user_id") \
.option("es.index.auto.create", "True") \
.option("es.write.operation", "upsert") \
.mode("append") \
.save()
print('Job finished', '-' * 100)
# Release resources
spark.stop()