from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
sparkConf = SparkConf()
# Set the memory for the Driver process
sparkConf.set('spark.driver.memory', '8G')
# Set the number of CPU cores for the Driver
sparkConf.set('spark.driver.cores', '2')
# Set the total number of Executor processes for the Spark job
sparkConf.set("spark.executor.instances", "3")
# Set the number of CPU cores per Executor process
sparkConf.set("spark.executor.cores", "2")
# Set the memory per Executor process
sparkConf.set("spark.executor.memory", "4G")
# Set the Spark application name
sparkConf.set("spark.app.name", "pyspark-test")
# Set the CPU core limit for each Executor pod on Kubernetes
# Note: make sure "spark.kubernetes.executor.limit.cores" >= "spark.executor.cores",
# otherwise the Spark executors will fail to start
sparkConf.set("spark.kubernetes.executor.limit.cores", "2")
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
sc = spark.sparkContext
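Once the session is up, you can verify that the settings actually took effect; a minimal check using the standard SparkConf.getAll API:
# Print the effective configuration of the running application
for key, value in sc.getConf().getAll():
    print(key, value)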
# HiveContext has been deprecated since Spark 2.0; the SparkSession created above
# (with enableHiveSupport) already provides Hive access. It is kept here only for legacy code.
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)
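With Hive support enabled, a Hive table can be queried directly through the session; a minimal sketch, where the table name my_db.my_table is only a placeholder:
# Query a Hive table; "my_db.my_table" is a hypothetical table name for illustration
df_hive = spark.sql("SELECT * FROM my_db.my_table LIMIT 10")
df_hive.show()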
from pyspark.ml import Pipeline, PipelineModel
# Note: DecisionTreeModel lives in pyspark.mllib.tree, not pyspark.ml.classification
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
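These imports are the building blocks of a standard ML pipeline; a minimal sketch of how they fit together (the DataFrame train_df and its columns f1, f2, label are assumptions for illustration, not part of the original):
# Hypothetical toy training data: two numeric features and an integer label
train_df = spark.createDataFrame(
    [(1.0, 2.0, 0), (2.0, 1.0, 1), (3.0, 4.0, 0)],
    ["f1", "f2", "label"])
# Assemble the feature columns into a single vector column
assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
# Train a decision tree on the assembled features
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
pipeline = Pipeline(stages=[assembler, dt])
model = pipeline.fit(train_df)
predictions = model.transform(train_df)
# Evaluate prediction accuracy on the (toy) training set
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print(evaluator.evaluate(predictions))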
1. Join on a single column
from pyspark.sql import Row
rdd1 = sc.parallelize([Row(name='Alice', age=5, height=80),
                       Row(name='Alice', age=10, height=80),
                       Row(name='Alice11', age=10, height=80)])
df1 = rdd1.toDF()
rdd2 = sc.parallelize([Row(name='Alice', weight=45)])
df2 = rdd2.toDF()
df_join = df1.join(df2, "name", "left")
df_join.show()
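Passing the column name as a string keeps a single name column in the result. An equivalent join can also be written with an explicit condition, though that form keeps both key columns; a short sketch:
# Join via an explicit condition; unlike the string form, the result
# contains both df1.name and df2.name columns
df_join_expr = df1.join(df2, df1.name == df2.name, "left")
df_join_expr.show()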
2. Join on multiple columns
from pyspark.sql import Row
rdd1 = sc.parallelize([Row(name='Alice', age=5, height=80),
                       Row(name='Alice', age=10, height=80),
                       Row(name='Alice11', age=10, height=80)])
df1 = rdd1.toDF()
rdd2 = sc.parallelize([Row(name='Alice', age=5, weight=45)])
df2 = rdd2.toDF()
df_join = df1.join(df2, ["name", "age"], "left")
df_join.show()
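The third argument selects the join type: besides "left", PySpark also accepts values such as "inner", "right", "outer"/"full", "left_semi", and "left_anti". For example, a semi join keeps only matching rows from the left side:
# Keep only the rows of df1 that have a match in df2 on both keys
df_semi = df1.join(df2, ["name", "age"], "left_semi")
df_semi.show()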