SQL 获取 DataFrame 操作 (obtaining a DataFrame via Spark SQL)
# coding:utf-8
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
if __name__ == '__main__':
    # Entry point: create a Hive-enabled SparkSession (the gateway to the
    # DataFrame / SQL APIs).
    spark = SparkSession.builder.enableHiveSupport().appName(name="AppName").getOrCreate()

    # Run SQL against Hive; the result comes back as a DataFrame.
    payDebtWholeDta_df = spark.sql(
        "SELECT xxxx ...")
    # To get an RDD instead, append: .rdd.distinct()

    # Printing a DataFrame shows only its schema, e.g.:
    # DataFrame[omsOrderItemId: string, billType: string, ...]
    print(payDebtWholeDta_df)
    # collect() pulls every row to the driver as a list of Row objects, e.g.:
    # [Row(omsOrderItemId=u'20190122332201', billType=u'1', ...)]
    print(payDebtWholeDta_df.collect())
    # payDebtWholeDta_df = spark.createDataFrame(payDebtWholeDta_rRdd)

    # A view made with createOrReplaceTempView lives as long as this
    # SparkSession.
    payDebtWholeDta_df.createOrReplaceTempView("payDebtWholeDataTable")
    # Re-creating is safe: the existing temp view is silently replaced,
    # no error is raised.
    payDebtWholeDta_df.createOrReplaceTempView("payDebtWholeDataTable")
    df3 = spark.sql("select * from payDebtWholeDataTable ")
    print(df3)

    # Same DataFrame registered as a *global* temp view (same schema as
    # payDebtWholeDta_df above).
    payDebtWholeDta_df.createGlobalTempView("payDebtWholeDataTable")
    # Unlike createOrReplaceTempView, createGlobalTempView raises if the view
    # already exists; without the try/except below you would see:
    # pyspark.sql.utils.AnalysisException: u"Temporary view 'paydebtwholedatatable' already exists;"
    try:
        payDebtWholeDta_df.createGlobalTempView("payDebtWholeDataTable")
    except AnalysisException as e:
        # e is u"Temporary view 'paydebtwholedatatable' already exists;"
        print("err is {}".format(e))

    # take(1) is a cheap emptiness check (avoids a full collect/count).
    if len(payDebtWholeDta_df.take(1)) > 0:
        # Print the first row of the DataFrame.
        print(payDebtWholeDta_df.take(1))
        # Persist as JSON, then read it back from the SAME path.
        # BUGFIX: the original wrote to "xxx" but read from "xxxx" -- the
        # round-trip could never see the data it had just written.
        payDebtWholeDta_df.write.json("xxxx")
        payDebtWholeDta_rdd00 = spark.read.json("xxxx")
    else:
        print("无结果")
其他常用操作 (other common DataFrame operations):
DataFrame().crossJoin(other)
DataFrame().describe(*cols)
DataFrame().distinct()
DataFrame().drop(*cols)
DataFrame().toJSON()                # method, returns an RDD of JSON strings
DataFrame().toPandas()              # method, not a property
DataFrame().union(other)
DataFrame().write                   # property (DataFrameWriter), not a method
DataFrame().take(num)
DataFrame().sort(*cols, **kwargs)
DataFrame().select(*cols)
DataFrame().rdd                     # property, not a method
DataFrame().orderBy(*cols, **kwargs)
DataFrame().limit(num)
DataFrame().groupBy(*cols)
DataFrame().dtypes                  # property: list of (name, dtype) pairs