PySpark DataFrame 操作

SQL 获取 DF 操作
# coding:utf-8
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

if __name__ == '__main__':
    # Demo script: build a DataFrame from a SQL query, register it as
    # session-scoped and global temporary views, then round-trip it
    # through a JSON directory.
    spark = SparkSession.builder.enableHiveSupport().appName(name="AppName").getOrCreate()

    payDebtWholeDta_df = spark.sql(
        "SELECT xxxx ...")

    # To get an RDD back instead of a DataFrame, append:
    # .rdd.distinct()

    print(payDebtWholeDta_df)
    # DataFrame[omsOrderItemId: string, billType: string, ...]

    print(payDebtWholeDta_df.collect())
    # [Row(omsOrderItemId=u'20190122332201', billType=u'1', ...)]
    # An RDD of rows can be turned back into a DataFrame with:
    # payDebtWholeDta_df = spark.createDataFrame(payDebtWholeDta_rRdd)

    # A view made with createOrReplaceTempView lives as long as the
    # SparkSession that created it.
    payDebtWholeDta_df.createOrReplaceTempView("payDebtWholeDataTable")
    # Registering the same name again does not raise; the existing
    # temporary view is simply replaced.
    payDebtWholeDta_df.createOrReplaceTempView("payDebtWholeDataTable")

    df3 = spark.sql("select * from payDebtWholeDataTable ")
    print(df3)
    # Prints the same schema as payDebtWholeDta_df above.
    payDebtWholeDta_df.createGlobalTempView("payDebtWholeDataTable")

    # createGlobalTempView cannot be called twice with the same name.
    # Without the try/except below the second call fails with:
    # pyspark.sql.utils.AnalysisException: u"Temporary view 'paydebtwholedatatable' already exists;"

    try:
        payDebtWholeDta_df.createGlobalTempView("payDebtWholeDataTable")
    except AnalysisException as e:
        # err is u"Temporary view 'paydebtwholedatatable' already exists;"
        print("err is {}".format(e))

    # Fetch the first row once and reuse it; calling take(1) for the
    # emptiness check and again for printing would run the job twice.
    first_rows = payDebtWholeDta_df.take(1)
    if first_rows:
        # Print the first row of the DataFrame.
        print(first_rows)

        # Write the DataFrame out as JSON and read it back. A single path
        # variable keeps the write and read targets identical (the
        # original wrote to "xxx" but read from "xxxx").
        json_path = "xxxx"
        payDebtWholeDta_df.write.json(json_path)
        payDebtWholeDta_rdd00 = spark.read.json(json_path)

    else:
        print("无结果")


其他常用操作

DataFrame().crossJoin()
DataFrame().describe()
DataFrame().distinct()
DataFrame().drop()
DataFrame().toJSON()
DataFrame().toPandas()
DataFrame().union(other)
DataFrame().write
DataFrame().take(num)
DataFrame().sort(*cols, **kwargs)
DataFrame().select(*cols)
DataFrame().rdd
DataFrame().orderBy(*cols, **kwargs)
DataFrame().limit(num)
DataFrame().groupBy(*cols)
DataFrame().dtypes

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值