# 常规操作 (common operations)
# Create: parallelize a sentence into an RDD of words (2 partitions),
# then key every word by its lowercase first character.
my_collection = "Spark The Definitive Guide : Big Data Processing Made Simple"
words = spark.sparkContext.parallelize(my_collection.split(" "), 2)
keyword = words.keyBy(lambda w: w.lower()[0])
# Uppercase each value while leaving the keys untouched.
keyword.mapValues(lambda w: w.upper()).collect()
# Output:
# [('s', 'SPARK'),
#  ('t', 'THE'),
#  ('d', 'DEFINITIVE'),
#  ('g', 'GUIDE'),
#  (':', ':'),
#  ('b', 'BIG'),
#  ('d', 'DATA'),
#  ('p', 'PROCESSING'),
#  ('m', 'MADE'),
#  ('s', 'SIMPLE')]
# Look up all values stored under one particular key:
# returns every word whose lowercase first letter is "s".
keyword.lookup("s")
# Output: ['Spark', 'Simple']
# sampleByKey: sample an RDD by a set of keys.
# Signature: RDD.sampleByKey(withReplacement, fractions, seed=None)
#   withReplacement: whether an element may be drawn more than once
#   fractions: dict mapping key -> sampling probability for that key
#   seed: random seed
# NOTE: the size of the returned subset is NOT guaranteed — the sampling
# is probabilistic, not an exact count per key.
import random

# Extract the distinct characters appearing across all words.
# BUG FIX: the original never closed the flatMap(...) call, so the
# continuation lines produced a SyntaxError; .distinct() must be chained
# on the RDD returned by flatMap, not inside its argument list.
distinctChars = words.flatMap(lambda word: list(word.lower()))\
    .distinct()\
    .collect()
# Give every character its own random sampling fraction in [0, 1).
sampleMap = dict(map(lambda c: (c, random.random()), distinctChars))
# Key each word by its first character, then sample with replacement
# using the per-key fractions and a fixed seed of 6.
words.map(lambda word: (word.lower()[0], word))\
    .sampleByKey(True, sampleMap, 6)\
    .collect()