1 SparkContext 执行环境入口
# Build the execution-environment entry point: a SparkContext configured to
# run locally using all available cores ('local[*]') under app name 'test'.
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName('test')\
.setMaster('local[*]')
sc = SparkContext(conf=conf)
2. 构建RDD对象
2.1 集合 -> RDD
# Build an RDD from a local collection, explicitly requesting 3 partitions.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
# glom() groups elements by partition so the partition layout is visible.
print(rdd.glom().collect(), rdd.getNumPartitions())
2.2 文件 -> RDD
# Build an RDD from a text file; each line becomes one element.
rdd = sc.textFile("./data.csv")
print(rdd.collect())
2.3 RDD -> 文件
rdd = sc.parallelize([1, 2, 3], 3)
# Writes an './output' directory containing one part-file per partition.
rdd.saveAsTextFile('./output')
'''
生成output文件夹
里面有按分区存储的多个文件
'''
3 RDD 算子
3.1 map、foreach、mapPartitions、foreachPartition
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
# map: transform each element into a (value, 1) pair.
rdd2 = rdd.map(lambda x: (x, 1))
print(rdd2.map(lambda x: x[0] + x[1]).collect())
rdd = sc.parallelize([1, 2, 3])
# foreach runs on the executors purely for side effects and returns no RDD.
rdd.foreach(lambda x: print(x))
# The -x result is discarded: this demonstrates that foreach does not
# modify the RDD, as the following collect() shows.
rdd.foreach(lambda x: -x)
rdd.collect()
'''
map 一次调出一个元素进行计算,io次数多
mapPartitions 一次将一个分区的所有元素调出计算
'''
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)

def func(partition):
    """Multiply every element of one partition by 10.

    mapPartitions calls this once per partition with an iterator over that
    partition's elements, so per-partition overhead is paid once instead of
    once per element (fewer IO round-trips than map).
    """
    # Yielding keeps the transformation lazy instead of materializing a list
    # for the whole partition. (The original parameter name `iter` also
    # shadowed the builtin.)
    for item in partition:
        yield item * 10

rdd.mapPartitions(func).collect()
3.2 flatMap 先map再解除嵌套
# flatMap: map each line to a list of words, then flatten one level of nesting.
rdd = sc.textFile("./data.csv")
print(rdd.collect())
rdd.flatMap(lambda x: x.split(' ')).collect()
3.3 reduceByKey、reduce、fold 分组聚合
# reduceByKey: aggregate values within each key group.
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
print(rdd.reduceByKey(lambda a, b: a + b).collect())
# reduce: aggregate the whole RDD to a single value (no grouping by key).
rdd = sc.parallelize(range(1, 3))
print(rdd.reduce(lambda a, b: a + b))
# Reducing tuples concatenates them: ('a', 1) + ('a', 1) -> ('a', 1, 'a', 1).
print(sc.parallelize([('a', 1), ('a', 1)]).reduce(lambda a, b: a + b))
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 3)
# fold: like reduce, but the initial value (10) is applied once per
# partition and once more in the final merge — see the trace below.
print(rdd.fold(10, lambda a, b: a + b))
'''
[[1, 2], [3, 4], [5, 6]]
10 + 1 + 2 = 13
10 + 3 + 4 = 17
10 + 5 + 6 = 21
10 + 13 + 17 + 21 = 61
> 61
'''
3.4 mapValues 二元组value进行map操作
# mapValues: apply the function to the value of each (key, value) pair only.
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
rdd.mapValues(lambda x: x * 10).collect()
3.5 groupBy、groupByKey
- groupBy、groupByKey、reduceByKey区别
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3), ('b', 4)])
# groupBy: group whole elements by an arbitrary key function.
rdd2 = rdd.groupBy(lambda x: x[0])
print(rdd2.collect())
'''
返回的是迭代器,需进一步转换
[('a', <pyspark.resultiterable.ResultIterable object at 0x106178370>),
('b', <pyspark.resultiterable.ResultIterable object at 0x1060abe50>)]
'''
# The grouped values are lazy ResultIterables; materialize them with list().
rdd3 = rdd2.map(lambda x: (x[0], list(x[1])))
print(rdd3.collect())
'''
[('a', [('a', 1), ('a', 2)]),
('b', [('b', 3), ('b', 4)])]
'''
rdd = sc.parallelize([('a', 1), ('a', 2), ('b', 3), ('b', 4)])
# groupByKey: group by the tuple key; the groups hold only the value part,
# unlike groupBy above, whose groups keep the whole (key, value) element.
rdd2 = rdd.groupByKey()
rdd2.map(lambda x: (x[0], list(x[1]))).collect()
3.6 filter、distinct 过滤筛选
rdd = sc.parallelize([1, 2, 3, 4, 5])
# filter: keep only elements for which the predicate returns True.
rdd.filter(lambda x: x > 3).collect()
rdd = sc.parallelize([1, 1, 1, 1, 2, 3, 'a', 'a'])
# distinct: remove duplicates (works across mixed element types).
rdd.distinct().collect()
3.7 union 合并
# union: concatenate two RDDs without deduplicating; element types may differ.
rdd_a = sc.parallelize([1, 1, 2, 3])
rdd_b = sc.parallelize([2, 3, ('a', 1), ('b', 2)])
rdd_a.union(rdd_b).collect()
3.8 join、leftOuterJoin、rightOuterJoin 连接
rdd_a = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
rdd_b = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
# join: inner join on the key; values are paired as (left_value, right_value).
print(rdd_a.join(rdd_b).collect())
'''
内连接 取交集
[('b', (3, 2)),
('a', (1, 1)),
('a', (2, 1))]
'''
# leftOuterJoin: every left key is kept; unmatched right values become None.
# (Here every left key matches, so the result equals the inner join.)
print(rdd_a.leftOuterJoin(rdd_b).collect())
'''
左连接 取交集和左边全部
[('b', (3, 2)),
('a', (1, 1)),
('a', (2, 1))]
'''
# rightOuterJoin: every right key is kept; unmatched left values become None,
# e.g. ('c', (None, 3)) below.
print(rdd_a.rightOuterJoin(rdd_b).collect())
'''
右连接 取交集和右边全部
[('b', (3, 2)),
('c', (None, 3)),
('a', (1, 1)),
('a', (2, 1))]
'''
3.9 intersection 交集
# intersection: elements present in both RDDs (compared as whole elements).
rdd_a = sc.parallelize([('a', 1), ('a', 2), ('b', 3)])
rdd_b = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
rdd_a.intersection(rdd_b).collect()
3.10 sortBy、sortByKey 排序
rdd = sc.parallelize([[1, 2, 3],
[7, 8, 9],
[4, 5, 6]])
# sortBy: order elements by the value the key function extracts (x[1] here).
rdd.sortBy(lambda x: x[1], ascending=True, numPartitions=3).collect()
'''
[[1, 2, 3],
[4, 5, 6],
[7, 8, 9]]
'''
'''
ascending True升序,False降序
numPartitions 全局有序要设为1,否则只能保证分区内有序
keyfunc 对key进行处理,再排序
'''
rdd = sc.parallelize([('a', 1), ('c', 2), ('B', 3)])
# Default key comparison is case-sensitive: uppercase sorts before lowercase.
print(rdd.sortByKey(ascending=True, numPartitions=1).collect())
'''
[('B', 3), ('a', 1), ('c', 2)]
'''
# keyfunc lower-cases keys for comparison only; the stored keys are unchanged.
print(rdd.sortByKey(ascending=True, numPartitions=1, keyfunc=lambda k: str(k).lower()).collect())
'''
[('a', 1), ('B', 3), ('c', 2)]
'''
3.11 countByKey 统计key出现次数
# countByKey: an action counting occurrences of each key (element[0]);
# it returns a local dict-like result, not an RDD.
# Fix: ('a') was the plain string 'a' — parentheses alone do not make a
# tuple; ('a',) is the one-element tuple the key-value example intends.
# The counted key is 'a' either way.
rdd = sc.parallelize([('a', 1, 2), ('a',), ('b', 1)])
rdd.countByKey()
3.12 first、take、top、count 取元素
# first/take/count/top: actions that pull results back to the driver.
# Fix: ('a') was the plain string 'a' — parentheses alone do not make a
# tuple; ('a',) is the intended one-element tuple.
rdd = sc.parallelize([('a', 1, 2), ('a',), ('b', 1)])
print(rdd.first())
print(rdd.take(2))
print(rdd.count())
rdd = sc.parallelize([2, 4, 1, 6])
# top(n): the n largest elements, in descending order.
print(rdd.top(2))
3.13 takeOrdered 排序取前n个
'''
param1: n
param2: func取数前更改元素,不更改元素本身,
不传func,默认升序(取前n最小值)
func = lambda x: -x 变为降序,取前n最大值,和top相同
'''
rdd = sc.parallelize([2, 4, 1, 6])
# Default ordering is ascending, so this returns the 2 smallest elements.
rdd.takeOrdered(2)
# Negating the sort key reverses the order: the 2 largest, like top(2).
rdd.takeOrdered(2, lambda x: -x)
3.14 takeSample 随机抽取
'''
param1: True随机有放回抽样,False不放回抽样
param2: 抽样个数
param3: 随机数种子
'''
rdd = sc.parallelize([1])
# Sampling with replacement (True) can return more elements than the RDD holds.
rdd.takeSample(True, 2)
4 RDD 缓存
'''
执行完rdd2_2后,rdd1、rdd2都消失;
执行到2_2后,需要重新执行rdd1、rdd2,再生成rdd2_2
'''
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = rdd1.map(lambda x: x + 1)
rdd2_1 = rdd2.map(lambda x: x * 10)
rdd2_2 = rdd2.map(lambda x: x * 20)
from pyspark.storagelevel import StorageLevel
'''
缓存rdd2,后续不用再算一遍rdd1、rdd2
'''
# cache() is shorthand for persist(StorageLevel.MEMORY_ONLY).
rdd2.cache()
rdd2.persist(StorageLevel.MEMORY_ONLY)
# The _2 suffix means two replicas are kept.
rdd2.persist(StorageLevel.MEMORY_ONLY_2)
rdd2.persist(StorageLevel.DISK_ONLY)
# MEMORY_AND_DISK spills to disk whatever does not fit in memory.
rdd2.persist(StorageLevel.MEMORY_AND_DISK)
rdd2.persist(StorageLevel.OFF_HEAP)
# NOTE(review): these repeated persist() calls illustrate the available
# levels; Spark normally rejects changing an RDD's storage level once set —
# unpersist() first if a different level is needed. Confirm before running.
rdd2.unpersist()