"""
Spark Streaming word-count example.

Author: Yin Xiong
"""
import os
import sys

# Point PySpark at the local Spark installation and make its Python
# bindings importable before attempting the pyspark imports below.
os.environ['SPARK_HOME'] = "/opt/spark-2.2.0-bin-hadoop2.7"
sys.path.append("/opt/spark-2.2.0-bin-hadoop2.7/python")

try:
    from pyspark import SparkConf, SparkContext
    from pyspark.streaming import StreamingContext
    print("Successfully imported Spark Modules")
except ImportError as e:
    # PySpark is required for everything below; abort with a non-zero exit.
    print("Can not import Spark Modules", e)
    sys.exit(1)
def updateFunction(newValues, runningCount):
    """Fold a batch's new counts for a key into its running total.

    Intended for DStream.updateStateByKey: ``newValues`` is the list of
    counts that arrived in the current batch, ``runningCount`` is the
    previous state for the key (``None`` the first time a key is seen).
    Returns the new cumulative count.
    """
    previous = 0 if runningCount is None else runningCount
    return previous + sum(newValues)
# Connect to Spark locally with two worker threads and process the
# incoming socket stream in 10-second batches.
conf = SparkConf().setAppName('spark-streaming').setMaster('local[2]')
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)

# Split every line received on localhost:9988 into words and print the
# per-batch word counts to stdout.
lines = ssc.socketTextStream("localhost", 9988)
wordCounts = (lines
              .flatMap(lambda line: line.split(" "))
              .map(lambda word: (word, 1))
              .reduceByKey(lambda a, b: a + b))
wordCounts.pprint()

# Cumulative (stateful) counting alternative — needs a checkpoint directory:
#runningCounts = lines.flatMap(lambda x:x.split(" ")).map(lambda x:(x,1)).updateStateByKey(updateFunction)
#runningCounts.pprint()
#ssc.checkpoint('/spark/ssc')

ssc.start()              # start the computation
ssc.awaitTermination()   # block until the streaming job is stopped