PySpark DataFrame join examples

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession 

sparkConf = SparkConf()

# Memory for the Driver process
sparkConf.set('spark.driver.memory', '8G')
# Number of CPU cores for the Driver
sparkConf.set('spark.driver.cores', '2')
# Total number of Executor processes to run this Spark job
sparkConf.set("spark.executor.instances", "3")
# Number of CPU cores per Executor process
sparkConf.set("spark.executor.cores", "2")
# Memory per Executor process
sparkConf.set("spark.executor.memory", "4G")
# Name of the Spark application
sparkConf.set("spark.app.name", "pyspark-test")

# CPU core limit per Executor (Kubernetes resource limit).
# NOTE: "spark.kubernetes.executor.limit.cores" must be >= "spark.executor.cores",
# otherwise the Spark executors fail to start.
sparkConf.set("spark.kubernetes.executor.limit.cores", "2")

spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
sc = spark.sparkContext

# NOTE(review): HiveContext is deprecated since Spark 2.0 — the SparkSession
# above already has Hive support via enableHiveSupport(); confirm this is needed.
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)

# ML imports — not used by the join examples below.
# NOTE(review): DecisionTreeModel may not be importable from
# pyspark.ml.classification on recent Spark versions — verify against the
# installed pyspark before relying on it.
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeModel, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

 

1. Join on a single column

from pyspark.sql import Row

# Example 1: left join on a single column.
# Left-hand DataFrame: three rows, two of which share name='Alice'.
# (Backslash continuations are redundant inside brackets — PEP 8.)
rdd1 = sc.parallelize([
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=10, height=80),
    Row(name='Alice11', age=10, height=80),
])
df1 = rdd1.toDF()

# Right-hand DataFrame: one row keyed by name.
rdd2 = sc.parallelize([Row(name='Alice', weight=45)])
df2 = rdd2.toDF()

# Left join on the shared "name" column: both 'Alice' rows pick up
# weight=45; the unmatched 'Alice11' row keeps NULL for weight.
df_join = df1.join(df2, "name", "left")
df_join.show()

2. Join on multiple columns

from pyspark.sql import Row

# Example 2: left join on multiple columns.
# (Backslash continuations are redundant inside brackets — PEP 8.)
rdd1 = sc.parallelize([
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=10, height=80),
    Row(name='Alice11', age=10, height=80),
])
df1 = rdd1.toDF()

# Right-hand DataFrame keyed by the composite (name, age).
rdd2 = sc.parallelize([Row(name='Alice', age=5, weight=45)])
df2 = rdd2.toDF()

# Left join on both "name" and "age": only the exact
# (name='Alice', age=5) row gets weight=45; the others keep NULL.
df_join = df1.join(df2, ["name", "age"], "left")
# Display the result, consistent with example 1 (which calls show()).
df_join.show()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值