本地运行代码:
PYSPARK_DRIVER_PYTHON="jupyter" PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark
# Load the input file; the resulting RDD holds one element per line of text.
textFile = sc.textFile("file:/ipynotebook/test.txt")
# Split every line on single spaces and flatten, yielding an RDD of words.
stringRDD = textFile.flatMap(lambda ln: ln.split(" "))
# Classic map/reduce word count: emit a (word, 1) pair for each occurrence,
# then sum the 1s per distinct word with reduceByKey.
pairsRDD = stringRDD.map(lambda w: (w, 1))
countsRDD = pairsRDD.reduceByKey(lambda x, y: x + y)
# Write the (word, count) pairs out as text files in the output directory.
countsRDD.saveAsTextFile("file:/ipynotebook/output")
yarn运行代码:
PYSPARK_DRIVER_PYTHON="jupyter" PYSPARK_DRIVER_PYTHON_OPTS="notebook" MASTER=yarn-client pyspark
# 注：自 Spark 2.0 起 `yarn-client` 写法已弃用，建议改用 MASTER=yarn（client 部署模式为默认值）