from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
import json
"""
功能:1. 消费运Kafka里的数据
2. 解析Json数据格式
3. 将解析后的数据写入Hive表
"""
HIVE_TARGET_TABLE = 'test.data_log'
KAFKA_TOPICS = ["test"]
GROUP_ID = "TEST_KAFKA_GROUP"
METADATA_BROKER = "localhost1:9092,localhost2:9092,localhost3:9092"
KAFKA_PARAMS = {"metadata.broker.list": METADATA_BROKER,
"group.id": GROUP_ID,
"auto.offset.reset": "largest",
"enable.auto.commit": "false"}
schema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True)
])
def dict_to_tuple(data: dict):
    """Convert a parsed JSON dict into a tuple matching the Hive schema."""
    id = data.get("id")
    name = data.get("name")
    data_tuple = (id, name)
    return data_tuple
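# Illustrative sketch only (not part of the pipeline): each Kafka message value is
# assumed to be a flat JSON object whose keys match the schema above, e.g.
#   json.loads('{"id": "1001", "name": "sensor-a"}')  -> {"id": "1001", "name": "sensor-a"}
#   dict_to_tuple({"id": "1001", "name": "sensor-a"}) -> ("1001", "sensor-a")
# Keys missing from a message come back as None via dict.get().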
# On restart, initialise the Kafka offsets from the last values committed to MySQL
def read_offset():
    topic_name = KAFKA_TOPICS[0]
    sql = f"""SELECT partition_id, last_offset
              FROM confluent.platform_log_offset
              WHERE topic_name = '{topic_name}' AND group_id = '{GROUP_ID}' """
    # mysql_client is an external MySQL connection-pool client (defined elsewhere)
    res = mysql_client.pool_execute_query(sql)
    if res is not None and len(res) > 0:
        offset_dict = {}
        # Result rows are dicts keyed by column name: partition_id, last_offset
        cols = list(res[0].keys())
        for each in res:
            offset_dict[TopicAndPartition(topic_name, each.get(cols[0]))] = each.get(cols[1])
        return offset_dict
    return None
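# Illustrative shape of the value returned by read_offset(), assuming the MySQL table
# holds one row per partition for this topic/group:
#   {TopicAndPartition("test", 0): 12000, TopicAndPartition("test", 1): 11875}
# Returning None lets createDirectStream fall back to "auto.offset.reset" instead.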
# After each batch is consumed, persist the Kafka offsets to MySQL
def save_offset(rdd):
    offset_data = ""
    offset_ranges = rdd.offsetRanges()
    for offset in offset_ranges:
        offset_data += f"('{offset.topic}', '{GROUP_ID}', {offset.partition}, {offset.untilOffset}),"
    # Drop the trailing comma left by the loop above
    sql = f"""REPLACE INTO test.log_offset(topic_name, group_id, partition_id, last_offset)
              VALUES {offset_data.rstrip(',')}"""
    # Write the offsets to MySQL via the external client
    # mysql_client.pool_execute_query(sql)
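# Illustrative example of the statement built above for a two-partition batch
# (table and column names as defined in this script):
#   REPLACE INTO test.log_offset(topic_name, group_id, partition_id, last_offset)
#   VALUES ('test', 'TEST_KAFKA_GROUP', 0, 12100), ('test', 'TEST_KAFKA_GROUP', 1, 11990)
# REPLACE keeps a single row per partition, assuming (topic_name, group_id, partition_id)
# forms the table's unique key.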
# Transform each batch and write it into the Hive table
def transform_to_hive(batch_rdd):
    # Each Kafka record is a (key, value) pair; keep only the message value
    rdd = batch_rdd.map(lambda x: x[1])
    spark = SparkSession.builder \
        .enableHiveSupport() \
        .config("spark.debug.maxToStringFields", 100) \
        .getOrCreate()
    if not rdd.isEmpty():
        data_list = []
        for line in rdd.collect():
            data_dict: dict = json.loads(str(line))
            data_tuple = dict_to_tuple(data_dict)
            data_list.append(data_tuple)
        df = spark.createDataFrame(data_list, schema)
        df.write.format('hive').insertInto(HIVE_TARGET_TABLE)
    # Record the consumed offsets for this batch
    save_offset(batch_rdd)
if __name__ == '__main__':
    sc = SparkContext(appName="LogStreaming")
    # The second argument is the batch interval in seconds: 60s = one minute
    ssc = StreamingContext(sc, 60)
    dstream = KafkaUtils.createDirectStream(ssc=ssc, topics=KAFKA_TOPICS, kafkaParams=KAFKA_PARAMS,
                                            fromOffsets=read_offset())
    dstream.foreachRDD(transform_to_hive)
    ssc.start()
    ssc.awaitTermination()
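# How to run (a sketch, assuming Spark 2.x -- pyspark.streaming.kafka was removed in
# Spark 3.0 -- and that the Kafka 0.8 assembly jar is available locally; the jar
# version and the script name log_streaming.py are placeholders to adjust):
#   spark-submit \
#     --jars spark-streaming-kafka-0-8-assembly_2.11-2.4.4.jar \
#     log_streaming.py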