# PySpark code: RDD filter-transformation examples
from pyspark.sql import SparkSession
import os
# Point Spark at a local standalone install and pin both the driver and the
# worker Python to the same 3.7 interpreter (required: Spark workers must run
# the same Python version as the driver).
_PY37 = "/usr/local/Cellar/python/3.7.7/Frameworks/Python.framework/Versions/3.7/bin/python3.7"
os.environ.update({
    "SPARK_HOME": "/Users/XXXX/Downloads/spark-2.4.5-bin-hadoop2.7",
    "PYSPARK_PYTHON": _PY37,
    "PYSPARK_DRIVER_PYTHON": _PY37,
})
if __name__ == "__main__":
    # Demonstrates RDD filter() transformations on a local Spark cluster.
    # NOTE(review): the original paste had lost all indentation inside this
    # guard, which is a SyntaxError — restored here.
    print("PySpark 101")
    print("使用过滤器进行 RDD 转换")

    # Build (or reuse) a SparkSession; local[*] runs on all local cores.
    spark = SparkSession \
        .builder \
        .appName("使用过滤器进行 RDD 转换") \
        .master("local[*]") \
        .enableHiveSupport() \
        .getOrCreate()

    py_number_list = [1, 2, 3, 4, 5]
    print("打印 py_number_list")
    print(py_number_list)
    print(type(py_number_list))

    print("从 py_number_list 创建 RDD")
    # parallelize(collection[, numSlices]): first argument is the local
    # collection to distribute; optional second argument sets the number of
    # partitions (defaults to spark.default.parallelism when omitted).
    number_rdd = spark.sparkContext.parallelize(py_number_list)
    # filter() is a lazy transformation; collect() triggers the job and
    # brings the results back to the driver.
    number_even_rdd = number_rdd.filter(lambda n: n % 2 == 0)
    print(number_even_rdd.collect())

    py_str_list = ["Arun", "Arvind", "Arjun", "Anna"]
    print(py_str_list)
    # Explicitly spread the names over 2 partitions, then keep names
    # containing the letter "r".
    str_rdd = spark.sparkContext.parallelize(py_str_list, 2)
    str_rdd_result = str_rdd.filter(lambda name: "r" in name).collect()
    print(str_rdd_result)

    print("input_file_path")
    input_file_path = "file:///Users/slyrx/slyrxStudio/github_good_projects/Spark_test/data/tech.txt"
    # textFile() yields one RDD element per line; keep lines containing
    # 'park' (matches both "Spark" and "spark" regardless of case of the S).
    tech_rdd = spark.sparkContext.textFile(input_file_path)
    tech_park_rdd = tech_rdd.filter(lambda ele: 'park' in ele)
    for element in tech_park_rdd.collect():
        print(element)

    print("停止 PySpark SparkSession 对象")
    spark.stop()