PySpark: converting between RDD and DataFrame

from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession 

sparkConf = SparkConf()

# Memory for the Driver process
sparkConf.set('spark.driver.memory', '8G')
# Number of CPU cores for the Driver
sparkConf.set('spark.driver.cores', '2')
# Total number of Executor processes for this Spark job
sparkConf.set("spark.executor.instances", "3")
# Number of CPU cores per Executor process
sparkConf.set("spark.executor.cores", "2")
# Memory per Executor process
sparkConf.set("spark.executor.memory", "4G")
# Application name shown in the Spark UI
sparkConf.set("spark.app.name", "pyspark-test")

# CPU core limit for each Executor pod on Kubernetes.
# NOTE: "spark.kubernetes.executor.limit.cores" must be >= "spark.executor.cores",
# otherwise the Spark executors fail to start.
sparkConf.set("spark.kubernetes.executor.limit.cores", "2")

# Build the session with Hive support and grab the underlying SparkContext.
spark = SparkSession.builder.config(conf=sparkConf).enableHiveSupport().getOrCreate()
sc = spark.sparkContext

# NOTE(review): HiveContext is deprecated since Spark 2.0 — the SparkSession
# created above with enableHiveSupport() already provides Hive access.
# Kept as-is in case code later in the file still references `hiveContext`.
from pyspark.sql import HiveContext
hiveContext = HiveContext(sc)

# ML imports — unused in the example below; presumably used later in the file.
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeModel, DecisionTreeClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

Example:

from pyspark.sql import Row

# Build a small RDD of Row objects to demonstrate RDD <-> DataFrame conversion.
rows = [
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=10, height=80),
    Row(name='Alice11', age=10, height=80),
]
rdd = sc.parallelize(rows)

# Inspect the raw RDD.
print(type(rdd))
print(rdd.take(5))
print("\n" * 2)

# RDD -> DataFrame: toDF() infers the schema from the Row fields.
df = rdd.toDF()
print(type(df))
print(df.take(5))
df.show()
print("\n" * 2)

# DataFrame -> RDD: the .rdd property yields an RDD of Row objects again.
re_rdd = df.rdd
print(type(re_rdd))
print(re_rdd.take(5))

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值