Spark Streaming 本地测试
1. 先执行 `nc -lk 9999` 启动服务端，然后启动 network_wordcount.py（参数为主机名和端口，例如 `spark-submit network_wordcount.py localhost 9999`），终端会每秒输出一次 streaming 数据流的统计结果
2. 在服务端（运行 nc 的终端）输入以空格分隔的文本，客户端就能显示对应的单词计数结果
完整代码如下
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
if __name__ == "__main__":
    # Streaming word count: read text lines from a TCP socket and print the
    # per-batch count of each space-separated word.
    #
    # Command line: network_wordcount.py <hostname> <port>
    usage = "Usage: network_wordcount.py <hostname> <port>"

    if len(sys.argv) != 3:
        print(usage, file=sys.stderr)
        sys.exit(-1)

    # Parse the port before creating the SparkContext so that a malformed
    # value fails fast with the usage message instead of a traceback after
    # Spark has already started.
    try:
        port = int(sys.argv[2])
    except ValueError:
        print(usage, file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
    # Second argument is the batch duration in seconds.
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(sys.argv[1], port)
    # Split each line on single spaces, pair every word with 1, then sum the
    # ones per word within each batch.
    counts = lines.flatMap(lambda line: line.split(" "))\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a+b)

    # Plain driver-side print: runs once while the pipeline is being set up,
    # not once per batch.
    print("log test")
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()