1. 使用 Docker 快速部署 PySpark
前提：本机只需安装好 Docker 即可，无需单独安装 Spark。
1.1 下载镜像
docker pull jupyter/pyspark-notebook
1.2 启动容器
docker run --name pyspark --rm -p 8888:8888 jupyter/pyspark-notebook
也可以在启动时把 Windows 上的一个本地目录挂载到容器中：
docker run --name pyspark --rm -v E:\COURSE\spark:/home/jovyan/work -p 8888:8888 jupyter/pyspark-notebook
1.3 打开 Jupyter Notebook
在浏览器中访问 http://localhost:8888 ，并使用容器启动日志中打印的 token 登录。
2. 编写第一个WordCount程序
from random_words import LoremIpsum
from pyspark import SparkConf , SparkContext
# Spark configuration: master URL "local[8]" (Spark's local mode with 8 worker
# threads) and the application name "Word Count".
conf = SparkConf()
conf.setMaster("local[8]")
conf.setAppName("Word Count")

# Module-level SparkContext shared by the functions below.
sc = SparkContext(conf=conf)
# Randomly generate `nums` sentences.
def getSentences(nums):
    """Return ``nums`` sentences of random lorem-ipsum text.

    Creates a fresh ``LoremIpsum`` generator and passes the result of its
    ``get_sentences`` method through unchanged.

    NOTE(review): presumably the result is a single string holding all the
    sentences -- confirm against the random_words package.
    """
    generator = LoremIpsum()
    return generator.get_sentences(nums)
# Spark: WordCount application
def wordCountApp(data):
    """Count how often each whitespace-separated word occurs in ``data``.

    Uses the module-level SparkContext ``sc``.

    Args:
        data: A local collection of text lines (strings); it is distributed
            with ``sc.parallelize``.

    Returns:
        An RDD of ``(word, count)`` pairs, sorted by word via ``sortByKey``.
    """
    lines = sc.parallelize(data)
    # str.split() with no separator splits on any run of whitespace and never
    # yields empty strings. Blank lines therefore contribute nothing, and
    # repeated spaces or tabs no longer produce a bogus "" word the way
    # split(" ") did.
    words = lines.flatMap(lambda line: line.split())
    counts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    return counts.sortByKey()
if __name__ == "__main__":
    # Number of sentences to generate; also reused as the maximum number of
    # (word, count) pairs fetched from the result RDD.
    nums = 10000
    # The generated text is wrapped as the sole element of the input list.
    data = [getSentences(nums)]
    result = wordCountApp(data).take(nums)
    print(result)