Does exactly what the title says. Posted in the hope it helps someone; a bow to you all.
OS: CentOS 6.8
Python: 3.5.9
Hadoop: 2.7.2
Spark: 2.1.1
Kafka: 0.11.0.2
import json
import os

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
# Point PySpark at the Python 3 interpreter and pull in the Kafka connector package
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
# The connector version must match the installed Spark version (2.1.1 here),
# and the _2.11 suffix must match Spark's Scala build
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.1 ' \
    'pyspark-shell'
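# Shell equivalent of the env var above (the script name is hypothetical):
#   spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.1 \
#       kafka_streaming_wordcount.py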
# Callback for updateStateByKey: folds each batch's new counts into the running total
def updateFunc(new_values, running_count):
    # new_values: list of counts for one key from the current batch
    # running_count: accumulated state so far (None the first time a key appears)
    if running_count is None:
        running_count = 0
    return sum(new_values, running_count)  # sum(iterable, start)
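# Semantics, by example: if a key's state is 3 and the current batch
# contributed [1, 2] for it, updateStateByKey calls updateFunc([1, 2], 3)
# and the key's new state becomes 6; on a key's first appearance the state
# argument is None, hence the check above.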
conf = SparkConf().setMaster("spark://master:7077").setAppName("kafka")
# getOrCreate avoids the ValueError that re-running the cell in Jupyter
# raises when a SparkContext already exists
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 2)  # 2-second batch interval
# updateStateByKey requires a checkpoint directory; the job errors out without one
ssc.checkpoint("hdfs://master:9000/spark/checkpoint")
# Kafka connection settings
zookeeper = "192.168.3.55:2181"
# The receiver-based 0.8 API discovers brokers via ZooKeeper, so no extra
# kafkaParams are needed; keys like "bootstrap_servers" are not valid
# 0.8 high-level consumer settings and would only trigger warnings
topic = {"topic_event": 1}  # topic name -> number of consumer threads
group_id = "test"
lines = KafkaUtils.createStream(ssc, zookeeper, group_id, topic)
# From here on it is mostly familiar RDD-style operations; the operator to
# note is updateStateByKey (see the official docs for details)
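# The record value is assumed (inferred from the parsing below) to look like
# "<header>|<json>", e.g. 1596012345|{"cm": {"md": "mid_0"}}, so
# split("|")[1] grabs the JSON part and x["cm"]["md"] is the key being counted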
linesTmp = lines.map(lambda x: json.loads(x[1].split("|")[1])) \
    .map(lambda x: (x["cm"]["md"], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .updateStateByKey(updateFunc)
# DStream has no sortBy operator, so use transform to sort each batch's underlying RDD
sortedSSC = linesTmp.transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
sortedSSC.pprint(num=20)
ssc.start()
ssc.awaitTermination(10)  # return after ~10s (handy in a notebook); use awaitTermination() to run until stopped
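To smoke-test the job, push a few records in the same "<header>|<json>" shape into topic_event. Below is a minimal producer sketch, assuming the kafka-python package; the payload shape is inferred from the consumer code above, and the mid_N ids are invented test values:

import json
import time

from kafka import KafkaProducer  # pip install kafka-python

producer = KafkaProducer(bootstrap_servers="192.168.3.55:9092")
for i in range(100):
    # Payload shape is an assumption based on the consumer's x["cm"]["md"] access;
    # the mid_N ids are made-up test data
    payload = {"cm": {"md": "mid_%d" % (i % 5)}}
    record = "%d|%s" % (int(time.time() * 1000), json.dumps(payload))
    producer.send("topic_event", record.encode("utf-8"))
producer.flush()

With this running, the pprint output should show the five mid_N counters growing with every 2-second batch.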