PySpark Transform Operators
I need to use Spark at work, and my Scala is still pretty weak, so I'm taking the chance to learn PySpark, which is a great tool.
I'm writing this down mainly so I can look things up later. It is largely based on the long-form article 3万字长文,PySpark入门级学习教程, plus a few operators I added myself.
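The snippets below are meant to run against a local Spark installation that findspark can locate; the expected outputs I note in comments follow from the inputs shown.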
# -*- coding: utf-8 -*-
# Author : ming
# time: 2022-03-08
# findspark locates the local Spark installation and adds pyspark to sys.path
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
# local[*] runs Spark locally, with as many worker threads as there are CPU cores
conf = SparkConf().setAppName("transform_function").setMaster("local[*]")
sc = SparkContext(conf=conf)
"""
Transform 算子解析
"""
# 1. map
rdd1 = sc.parallelize(range(1, 11), 4)
rdd1_map = rdd1.map(lambda x: x*x)
print(rdd1_map.collect())
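# map applies the function to every element; expected output:
# [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]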
# 2. flatMap
rdd2 = sc.parallelize(["hello ming", "hello spark", "welcome hadoop", "bye scala"])
rdd2_flat_map = rdd2.flatMap(lambda x: x.split(" "))
print(rdd2_flat_map.collect())
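# flatMap splits each string and flattens the results into a single RDD; expected output:
# ['hello', 'ming', 'hello', 'spark', 'welcome', 'hadoop', 'bye', 'scala']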
# 3. filter
rdd3 = sc.parallelize(range(1, 11), 4)
print(rdd3.collect())
rdd3_filter = rdd3.filter(lambda x: x % 2 == 0)
print(rdd3_filter.collect())
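# filter keeps only the elements for which the predicate is True; the first print
# shows the full range [1, 2, ..., 10], the second only the even values: [2, 4, 6, 8, 10]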
# 4. distinct
rdd4 = sc.parallelize([1, 2, 2, 3, 3, 3, 4, 5, 6, 6,