1、安装pip install pyspark==2.4.4, 具体spark环境搭建如下:https://blog.csdn.net/sslfk/article/details/123899383
2、将spark-streaming-kafka包放入python环境的Lib\site-packages\pyspark\jars中,spark-streaming-kafka包可以从https://download.csdn.net/download/sslfk/85115900下载
3、代码调用方式:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
def save_by_spark_direct_stream():
    """Consume a Kafka topic via Spark Streaming's direct API and process each batch.

    Uses ``KafkaUtils.createDirectStream`` with no Kafka consumer group, no
    offset storage, and no ZooKeeper: ``fromOffsets={}`` means consumption
    starts from each partition's default position on every run.

    Relies on module-level configuration (defined elsewhere in this file):
      - ``TIMER_MAIN``: batch interval in seconds for the streaming context.
      - ``TOPIC_NAME_MAIN``: name of the Kafka topic to read.
      - ``BROKER_LIST``: iterable of ``host:port`` Kafka broker addresses.
      - ``deal_data``: callback invoked with every micro-batch RDD.

    Blocks on ``awaitTermination()`` until the streaming context is stopped.
    """
    sc = SparkContext(appName="IIP_Recommend_System")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, int(TIMER_MAIN))
    # The container cloud has SSL disabled, and pyspark's streaming-kafka
    # integration does not support Kafka authentication, so only the broker
    # list is passed in kafkaParams.
    kvs = KafkaUtils.createDirectStream(
        ssc=ssc,
        topics=[TOPIC_NAME_MAIN],
        fromOffsets={},
        kafkaParams={"metadata.broker.list": ",".join(BROKER_LIST)},
    )
    kvs.foreachRDD(lambda rec: deal_data(rec))
    # Start the pipeline and block until it terminates; stop() then releases
    # the streaming context's resources.
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()