- 空值处理
(1)删除空值
#删除所有包含空值的行
df = df.na.drop()
#删除‘excute_errormsg’列含有空值的行
df = df.na.drop(subset='excute_errormsg')
(2)空值填充
#所有空值填充‘888’
df = df.na.fill('888')
#'excute_errormsg'列的空值填充‘888’
df = df.na.fill('888', subset=['excute_errormsg'])
(3)选择空值所在行
df = data.filter(data.id.isNull())
- 列重命名
df = df.withColumnRenamed('old-name','new-name')
- 删除指定列
df = df.drop('a','b','c')
- 选择指定'a','b','c'列
df=df.select('a','b','c')
- 条件筛选
筛选‘execute_state’列值为‘4’或者‘excute_errormsg’列不等于‘-1’的行
data = data.filter((data.execute_state == '4')|(data.excute_errormsg != '-1'))
- 增加新列
(1)增加新列‘excute’,值为0
from pyspark.sql.functions import lit
data1 = data1.withColumn('excute',lit(0))
(2)增加新列‘time’,值为当前时间
from pyspark.sql.functions import current_timestamp
data1 = data1.withColumn('time',current_timestamp())
- dataframe左连接
df = data.join(data1, data.id == data1.id_1, 'left')
- dataframe union
#data1+data103+data109
data3=data1.union(data103).union(data109)
- 数据类型转换
from pyspark.sql.types import IntegerType,StringType,DateType,LongType,FloatType
data = data.withColumn('a',data['a'].cast(FloatType()))\
.withColumn('b',data['b'].cast(StringType()))\
.withColumn('c',data['c'].cast(IntegerType()))