from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
sparkConf = SparkConf()
# Set the amount of memory for the driver process
sparkConf.set("spark.driver.memory", "8G")
# Set the number of CPU cores for the driver process
sparkConf.set("spark.driver.cores", "2")
# Set the total number of executor processes the Spark job will use
sparkConf.set("spark.executor.instances", "3")
# Set the number of CPU cores for each executor process
sparkConf.set("spark.executor.cores", "2")
# Set the amount of memory for each executor process
sparkConf.set("spark.executor.memory", "4G")
# Set the name of the Spark application
sparkConf.set("spark.app.name", "pyspark-test")
# Set the CPU limit for each executor pod on Kubernetes
# Note: make sure the value of "spark.kubernetes.executor.limit.cores" is >= the value
# of "spark.executor.cores", otherwise the Spark executors will fail to start
sparkConf.set("spark.kubernetes.executor.limit.cores", "2")
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
sc = spark.sparkContext
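These settings only take effect if they are applied before the SparkSession is created. A quick sanity check (a minimal sketch) reads the effective values back from the live context:

print(sc.getConf().get("spark.executor.memory"))
print(sc.getConf().get("spark.executor.instances"))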
from pyspark.sql import HiveContext
# Note: HiveContext is deprecated since Spark 2.0; the SparkSession built above with
# enableHiveSupport() already provides the same functionality
hiveContext = HiveContext(sc)
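Because the session was created with enableHiveSupport(), Hive tables can also be queried directly through spark.sql. A minimal sketch, assuming a hypothetical Hive table named default.some_table exists:

df_hive = spark.sql("SELECT * FROM default.some_table LIMIT 10")  # hypothetical table name
df_hive.show()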
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
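The imports above cover a typical tree-based classification workflow. A minimal sketch wiring them together on toy, made-up data (all column names here are illustrative):

train_df = spark.createDataFrame(
    [(1.0, 0.0, 0.0), (0.0, 1.0, 1.0), (1.0, 1.0, 1.0), (0.0, 0.0, 0.0)],
    ["f1", "f2", "label"])
assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, dt])
# fit() returns a PipelineModel whose last stage is a DecisionTreeClassificationModel
model = pipeline.fit(train_df)
predictions = model.transform(train_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
print(evaluator.evaluate(predictions))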
Example:
from pyspark.sql import Row
rdd = sc.parallelize([Row(name='Alice', age=5, height=80),
                      Row(name='Alice', age=10, height=80),
                      Row(name='Alice11', age=10, height=80)])
# Inspect the RDD of Row objects
print(type(rdd))
print(rdd.take(5))
print("\n" * 2)
# Convert the RDD to a DataFrame (schema inferred from the Row fields)
df = rdd.toDF()
print(type(df))
print(df.take(5))
df.show()
print("\n" * 2)
# Convert the DataFrame back to an RDD of Row objects
re_rdd = df.rdd
print(type(re_rdd))
print(re_rdd.take(5))
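toDF() infers the schema from the Row objects. When an explicit schema is preferred, spark.createDataFrame can be used instead; a minimal sketch mirroring the data above:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("height", IntegerType(), True),
])
df2 = spark.createDataFrame(
    sc.parallelize([("Alice", 5, 80), ("Alice", 10, 80), ("Alice11", 10, 80)]), schema)
df2.printSchema()
df2.show()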