# 1. Creating an RDD via parallelization (并行化创建RDD)
from pyspark import SparkContext,SparkConf
import os

# Point Spark at the local installation.
# NOTE(review): '/exportrver/spark' looks garbled — presumably
# '/export/server/spark'; confirm against the actual install path.
os.environ['SPARK_HOME'] = '/exportrver/spark'

# Use the conda-env interpreter for both the driver and the executors,
# so worker Python matches the driver Python (a version mismatch makes
# PySpark jobs fail at runtime).
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"
os.environ['PYSPARK_PYTHON'] = PYSPARK_PYTHON
os.environ['PYSPARK_DRIVER_PYTHON'] = PYSPARK_PYTHON
if __name__ == '__main__':
    # Minimal PySpark driver program: build a local SparkContext and
    # create an RDD from an in-memory Python list.
    print("PySpark First Program")

    # Sample data to distribute across partitions.
    data = ["hello", "world", "hello", "world"]

    # local[*] = run locally using all available CPU cores.
    conf = SparkConf().setAppName("miniProject").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    try:
        # parallelize() ships the local collection to the cluster
        # (here: local threads) as a distributed RDD.
        rdd = sc.parallelize(data)
        res_rdd = rdd
    finally:
        # Always release the SparkContext; leaving it open leaks the
        # driver JVM and blocks subsequent contexts in the same process.
        sc.stop()