Case 1: WordCount Program
from pyspark import SparkConf, SparkContext
import os

if __name__ == '__main__':
    # Use a raw string so the backslashes in the Windows path are not treated as escapes
    os.environ['SPARK_HOME'] = r'G:\myProgram\spark-2.3.0-bin-hadoop2.7'
    # Create SparkConf
    conf = SparkConf() \
        .setAppName("WordCount") \
        .setMaster("local")
    # Create SparkContext
    sc = SparkContext(conf=conf)
    # Simulate input data locally
    datas = ["you,jump", "i,jump"]
    # Create RDD
    rdd = sc.parallelize(datas)
    print(rdd.count())  # 2
    print(rdd.first())  # you,jump
    # WordCount: split each line into words, pair each word with 1,
    # then sum the counts per word
    wordcount = rdd.flatMap(lambda line: line.split(",")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    for wc in wordcount.collect():
        print(wc[0] + " " + str(wc[1]))
    # you 1
    # jump 2
    # i 1
    sc.stop()
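
In practice the input usually comes from a file rather than an in-memory list. Below is a minimal sketch of the same job reading a local text file with sc.textFile and writing the result with saveAsTextFile; the path input.txt, the output directory output, and the descending sort by count are assumptions added for illustration, not part of the original example.

from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    conf = SparkConf() \
        .setAppName("WordCountFile") \
        .setMaster("local")
    sc = SparkContext(conf=conf)
    # One RDD element per line of the file (input.txt is a placeholder path)
    lines = sc.textFile("input.txt")
    wordcount = lines.flatMap(lambda line: line.split(",")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda wc: wc[1], ascending=False)  # optional: most frequent words first
    # Write the (word, count) pairs as text files; the output directory must not already exist
    wordcount.saveAsTextFile("output")
    sc.stop()

Note that saveAsTextFile produces a directory of part files (one per partition) rather than a single file, which is why the output path is a directory name.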