import findspark
from pyspark.pandas import isnull
from pyspark.sql.functions import length
from pyspark.sql.types import StructType, StructField, StringType
findspark.init()
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
# Build (or reuse) a local SparkSession for the query examples below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Python Spark SQL Hive integration example")
    .getOrCreate()
)
# Sample rows: (C1=age-like int, C2=nullable int score, C3=name).
data = [[34, None, 'Mary'],
        [24, 89, 'Alice'],
        [30, 90, 'Mark']]
df = spark.createDataFrame(data, ['C1', 'C2', 'C3'])

# Condition: C2 is not null AND C1 > 25 (SQL-string form of where)
df.where('C2 is not null and C1>25').show()

# Condition: C2 is null.
# BUG FIX: the original called isnull imported from pyspark.pandas (a
# scalar/pandas-on-Spark helper) which does not produce a Column
# expression; use the Column method isNull() instead.
df.where(df['C2'].isNull()).show()

# Condition: C2 is not null AND length(C3) > 3
df.where(df['C2'].isNotNull()).where(length('C3') > 3).show()

# Condition: C2 is not null OR C1 > 25.
# BUG FIX: the original wrote data('C1') — `data` is a plain list and is
# not callable (TypeError); the column must come from the DataFrame.
df.filter(df['C2'].isNotNull() | (df['C1'] > 25)).show()
# Querying via SQL.
# Register the DataFrame as a temporary view so it can be referenced in SQL.
df.createOrReplaceTempView("people")
# Full-table query (schema obtained via reflection).
# BUG FIX: spark.sql() is lazy and returns a DataFrame; the original line
# discarded the result, so nothing happened — show() triggers the query.
spark.sql("select * from people").show()
# Filtered query: rows where C1 = 34.
personDf = spark.sql("select * from people where C1=34")
# Convert to an RDD and filter further: keep rows containing 'Alice'.
personRdd = personDf.rdd.filter(lambda line: 'Alice' in line)
# PySpark DataFrame query operations
# (blog metadata: latest recommended article published 2024-05-10 16:58:52)