# Create the directory that the Spark Streaming job will monitor.
# `mkdir -p` creates all intermediate directories in one call and is a
# no-op if they already exist (the original cd/mkdir chain fails on rerun).
mkdir -p /usr/local/spark/mycode/streaming/logfile
# End in the monitored directory, same as the original command sequence.
cd /usr/local/spark/mycode/streaming/logfile
启动 pyspark 交互式环境(SparkContext 会自动创建为变量 sc):
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Streaming word count over files dropped into a monitored directory.
# NOTE: `sc` (SparkContext) is pre-created by the pyspark interactive shell;
# this transcript is meant to be run inside pyspark, not as a standalone script.
# Batch interval: one micro-batch every 10 seconds.
ssc = StreamingContext(sc, 10)

# Monitor the directory for newly created files.
# BUGFIX: the original used curly "smart quotes" (‘…’), which is a SyntaxError
# in Python — replaced with plain ASCII quotes.
lines = ssc.textFileStream('file:///usr/local/spark/mycode/streaming/logfile')

# BUGFIX: the original delimiter `’ ‘’` had mismatched curly quotes (SyntaxError);
# split each line on a single ASCII space.
words = lines.flatMap(lambda line: line.split(' '))

# Classic map/reduce word count: (word, 1) pairs summed per key.
wordscount = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
wordscount.pprint()  # print each batch's counts to the console

ssc.start()             # enter the listening loop; results appear per batch
ssc.awaitTermination()  # block until the streaming computation is stopped
以上示例演示了如何在 pyspark 中创建文件流(textFileStream)并对新写入的文件内容进行词频统计(wordcount)。
最新推荐文章于 2024-02-12 02:15:38 发布