1. Producing messages
Connect to the Kafka broker and write data to the specified topic.
from kafka import KafkaProducer
import time
from json import dumps
from datetime import datetime

# Connect to Kafka.
# value_serializer serializes the business value: json.dumps turns the dict
# into a JSON string, which is then encoded as UTF-8 bytes.
producer = KafkaProducer(bootstrap_servers='127.0.0.1:9092',
                         value_serializer=lambda x: dumps(x).encode('utf-8'))

# No key_serializer is configured, so the key itself must already be bytes.
key = 'streaming'.encode('utf-8')

typeList = ['鞋子', '裤子', '袜子', '皮带', '化妆品', '背包', '书籍', '零食', '运动服', '乐器']

# Send one message per product type each second, with a globally unique bizId.
bizId = 0
for i in range(0, 1000):
    for t in typeList:
        bizId += 1
        vjson = {"bizId": str(bizId), "type": t, "value": 1}
        # vjson["datetime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print("Message to be sent: ", vjson)
        # Publish vjson as the value to the kf2pyspark topic.
        producer.send('kf2pyspark', value=vjson, key=key)
    time.sleep(1)

producer.flush()
producer.close()
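
To sanity-check the producer before Spark is involved, a plain kafka-python consumer can tail the topic. This is a minimal sketch assuming the same local broker and topic as above:

from kafka import KafkaConsumer
import json

consumer = KafkaConsumer(
    'kf2pyspark',
    bootstrap_servers='127.0.0.1:9092',
    auto_offset_reset='earliest',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')))
for msg in consumer:
    # e.g. b'streaming' {'bizId': '1', 'type': '鞋子', 'value': 1}
    print(msg.key, msg.value)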
2. Consuming messages
PySpark subscribes to the Kafka topic, processes the messages it receives, and publishes the results to another Kafka topic for downstream programs to consume.
import findspark
findspark.init()
##############################################
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
KAFKA_TOPIC_READ = "kf2pyspark"
KAFKA_WRITE_TOPIC = "pyspark2kf"
KAFKA_SERVERS = '127.0.0.1:9092'
JARS_PATH = "file:///E:/wmsoft/spark-sql-kafka-0-10_2.11-2.4.0.jar,file:///E:/wmsoft/kafka-clients-1.1.0.jar"
if __name__ == "__main__":
    print("PySpark Kafka Started ...")
    spark = SparkSession \
        .builder \
        .appName("PySpark Kafka Demo") \
        .master("local[*]") \
        .config("spark.jars", JARS_PATH) \
        .config("spark.driver.extraClassPath", JARS_PATH) \
        .config("spark.executor.extraClassPath", JARS_PATH) \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
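    # Note: instead of downloading the two jars by hand, the connector can be
    # resolved from Maven at startup (an alternative sketch; the coordinates
    # assume Spark 2.4.0 built against Scala 2.11):
    #   .config("spark.jars.packages",
    #           "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0")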
    # Read the Kafka stream from the kf2pyspark topic.
    sdf = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_SERVERS) \
        .option("subscribe", KAFKA_TOPIC_READ) \
        .option("startingOffsets", "latest") \
        .load()
    # Note the format of Kafka stream rows: the business payload is in the value column.
    sdf.printSchema()
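    # printSchema() prints the fixed schema that every Kafka source row carries;
    # the business payload is the binary value column:
    # root
    #  |-- key: binary (nullable = true)
    #  |-- value: binary (nullable = true)
    #  |-- topic: string (nullable = true)
    #  |-- partition: integer (nullable = true)
    #  |-- offset: long (nullable = true)
    #  |-- timestamp: timestamp (nullable = true)
    #  |-- timestampType: integer (nullable = true)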
    # Cast the binary value column to a string.
    sdf = sdf.selectExpr("CAST(value AS STRING)")
    # Declare only the fields needed downstream; extra fields in the JSON
    # (such as bizId) are simply ignored by from_json.
    bizSchema = StructType().add("type", StringType()).add("value", IntegerType())
    # Parse the string value column as JSON so individual business fields become accessible.
    bizDf = sdf.select(from_json(col("value"), bizSchema).alias("json"))
    # bizDf.printSchema()
    # Promote every field under json to a top-level column.
    bizDf = bizDf.select("json.*")
    # bizDf.printSchema()
    # Group by type and sum the values.
    bizDfCalc = bizDf.groupBy("type").agg({'value': 'sum'}) \
        .select("type", col("sum(value)").alias("amount"))
    # Rows written to a Kafka sink must carry a value column, so serialize the
    # amount and type columns into one JSON string aliased as value, then build
    # the write stream. outputMode("complete") is required here because the
    # query contains a streaming aggregation: the full result table is
    # re-emitted on every trigger.
    write_stream = bizDfCalc \
        .select(to_json(struct(col("amount"), col("type"))).alias("value")) \
        .writeStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", KAFKA_SERVERS) \
        .option("topic", KAFKA_WRITE_TOPIC) \
        .option("checkpointLocation", "file:///E:/wmsoft/checkpoints/") \
        .outputMode("complete") \
        .trigger(processingTime='2 seconds') \
        .queryName("sink2kafka")

    query = write_stream.start()
    query.awaitTermination()
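
For debugging, the same aggregation can be printed to the console instead of being published to Kafka. This sketch swaps only the sink; it assumes the bizDfCalc DataFrame from the script above and needs no topic or checkpoint options:

query = bizDfCalc.writeStream \
    .format("console") \
    .outputMode("complete") \
    .trigger(processingTime='2 seconds') \
    .start()
query.awaitTermination()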