运行代码:
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Local 2-core SparkSession for the streaming demo.
    spark = (
        SparkSession.builder
        .master("local[2]")
        .appName("PythonWordCount")
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel('INFO')

    # Structured Streaming source: subscribe to the Kafka topic
    # "tp_testabc" on broker 10.10.108.101:9092.
    data = (
        spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", "10.10.108.101:9092")
        .option("subscribe", "tp_testabc")
        .load()
    )

    # Kafka delivers the record value as binary; cast it to a string
    # column so the console sink prints readable text.
    tmp = data.selectExpr("CAST(value AS STRING)")

    # Console sink in append mode: each micro-batch prints only the
    # newly arrived rows.
    query = tmp.writeStream.format("console").outputMode("append").start()

    # Block the driver until any active streaming query terminates.
    spark.streams.awaitAnyTermination()
运行脚本:
spark-2.3.3-bin-hadoop2.7/bin/spark-submit --jars pyspark/lib/spark-sql-kafka-0-10_2.11-2.3.3.jar,pyspark/lib/kafka_2.11-0.10.0.1.jar,pyspark/lib/kafka-clients-0.10.2.0.jar pyspark/sparkStream.py
注意事项:使用spark-2.3.3版本spark,否则会报错。