df = spark.createDataFrame([('Natalie', 'F', 16.0,160.0),
('Logan', None,18.0,182.0),
('Joseph', 'M',None,None),
('Mason', 'F',17.0,167.0),
(None, 'M',15.0,180.0),
('Anthony', 'F',float('nan'),170.0)],['name','gender','age','height'])
df.show(truncate=False,vertical=False)
+-------+------+----+------+
|name |gender|age |height|
+-------+------+----+------+
|Natalie|F |16.0|160.0 |
|Logan |null |18.0|182.0 |
|Joseph |M |null|null |
|Mason |F |17.0|167.0 |
|null |M |15.0|180.0 |
|Anthony|F |NaN |170.0 |
+-------+------+----+------+
空值处理
删除有空值的行
dropna(how='any', thresh=None, subset=None)
和 na.drop() 一样
how – ‘any’ or ‘all’. If ‘any’, drop a row if it contains any nulls. If ‘all’, drop a row only if all its values are null.
thresh – int, default None If specified, drop rows that have less than thresh non-null values. This overwrites the how parameter.
subset – optional list of column names to consider.
df.dropna(how='any').show() ## 删除所有有空值的行
df.na.drop(how='all').show() ## 删除全为空值的行
df.dropna(subset=['age','gender']).show() ## 删除 'age','gender' 两列中有空值的行
df.dropna(thresh=3).show() ## 删除少于 3 个非空值的行
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Mason| F|17.0| 167.0|
+-------+------+----+------+
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Logan| null|18.0| 182.0|
| Joseph| M|null| null|
| Mason| F|17.0| 167.0|
| null| M|15.0| 180.0|
|Anthony| F| NaN| 170.0|
+-------+------+----+------+
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Mason| F|17.0| 167.0|
| null| M|15.0| 180.0|
+-------+------+----+------+
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Logan| null|18.0| 182.0|
| Mason| F|17.0| 167.0|
| null| M|15.0| 180.0|
|Anthony| F| NaN| 170.0|
+-------+------+----+------+
空值填充
fillna(value, subset=None)
replace(to_replace, value=<no value>, subset=None)
to_replace – bool, int, long, float, string, list or dict. Value to be replaced. If the value is a dict, then value is ignored or can be omitted, and to_replace must be a mapping between a value and a replacement.
value – bool, int, long, float, string, list or None. The replacement value must be a bool, int, long, float, string or None. If value is a list, value should be of the same length and type as to_replace. If value is a scalar and to_replace is a sequence, then value is used as a replacement for each item in to_replace.
subset – optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. For example, if value is a string, and subset contains a non-string column, then the non-string column is simply ignored.
df.fillna(16.0,subset='age').fillna(160.0,subset='height').fillna('unknown','name').show()
df.replace(float('nan'), 20).show()
df.replace(['Natalie', 'Mason'],['A','B'],['name']).show()
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Logan| null|18.0| 182.0|
| Joseph| M|16.0| 160.0|
| Mason| F|17.0| 167.0|
|unknown| M|15.0| 180.0|
|Anthony| F|16.0| 170.0|
+-------+------+----+------+
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Logan| null|18.0| 182.0|
| Joseph| M|null| null|
| Mason| F|17.0| 167.0|
| null| M|15.0| 180.0|
|Anthony| F|20.0| 170.0|
+-------+------+----+------+
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
| A| F|16.0| 160.0|
| Logan| null|18.0| 182.0|
| Joseph| M|null| null|
| B| F|17.0| 167.0|
| null| M|15.0| 180.0|
|Anthony| F| NaN| 170.0|
+-------+------+----+------+
过滤
df.filter( ~ isnan('age') & (col('age')>16)).show() ## 过滤 gae 列 值不为 nan 的且 大于 16 的数据
# df.where(~ isnan('age') & (col('age')>16)).show() ##等价于上一行
# df[~ isnan('age') & (col('age')>16)].show() ##等价于上一行
df.filter(df.height.isNotNull() & df.age.isNotNull()).show() ## 过滤 height age 不为空的行
+-----+------+----+------+
| name|gender| age|height|
+-----+------+----+------+
|Logan| null|18.0| 182.0|
|Mason| F|17.0| 167.0|
+-----+------+----+------+
+-------+------+----+------+
| name|gender| age|height|
+-------+------+----+------+
|Natalie| F|16.0| 160.0|
| Logan| null|18.0| 182.0|
| Mason| F|17.0| 167.0|
| null| M|15.0| 180.0|
|Anthony| F| NaN| 170.0|
+-------+------+----+------+
please use ‘&’ for ‘and’, ‘|’ for ‘or’, ‘~’ for ‘not’ when building DataFrame boolean expressions.
转换列的数据类型
astype()
和 cast()
通用
printSchema()
以树形结构 打印 schema
withColumn()
增加一列或替换已有的列
withColumnRenamed()
重新命名列明
df.printSchema()
df.withColumn('age',col('age').astype(IntegerType())).printSchema() ## 将 age 列转换为整型
df.withColumn('height',col('height').cast(StringType())).printSchema() ## 将 height列转换为字符串
df.withColumnRenamed('age','age_1').show() ## 将 列名 age 替换为 age_1
root
|-- name: string (nullable = true)
|-- gender: string (nullable = true)
|-- age: double (nullable = true)
|-- height: double (nullable = true)
root
|-- name: string (nullable = true)
|-- gender: string (nullable = true)
|-- age: integer (nullable = true)
|-- height: double (nullable = true)
root
|-- name: string (nullable = true)
|-- gender: string (nullable = true)
|-- age: double (nullable = true)
|-- height: string (nullable = true)
+-------+------+-----+------+
| name|gender|age_1|height|
+-------+------+-----+------+
|Natalie| F| 16.0| 160.0|
| Logan| null| 18.0| 182.0|
| Joseph| M| null| null|
| Mason| F| 17.0| 167.0|
| null| M| 15.0| 180.0|
|Anthony| F| NaN| 170.0|
+-------+------+-----+------+
根据条件进行数值转换
df = spark.createDataFrame([('Natalie', 'F', 16,'80,56,72','English,Math,Chinese'),
('Logan', 'M',18,'82,88,91','English,Math,Chinese'),
('Joseph', 'M',16,'75,69,80','English,Math,Chinese'),
('Mason', 'F',15,'50,58,70','English,Math,Chinese'),
('Harper', 'M',10,'65,70,65','English,Math,Chinese'),
('Anthony', 'F',14,'66,71,75','English,Math,Chinese')],['name','gender','age','score','subject'])
df.show(truncate=False,vertical=False)
+-------+------+---+--------+--------------------+
|name |gender|age|score |subject |
+-------+------+---+--------+--------------------+
|Natalie|F |16 |80,56,72|English,Math,Chinese|
|Logan |M |18 |82,88,91|English,Math,Chinese|
|Joseph |M |16 |75,69,80|English,Math,Chinese|
|Mason |F |15 |50,58,70|English,Math,Chinese|
|Harper |M |10 |65,70,65|English,Math,Chinese|
|Anthony|F |14 |66,71,75|English,Math,Chinese|
+-------+------+---+--------+--------------------+
when(condition_1, value_1).when(condition_2, value_2)....otherwish(value_n)
当满足condition_1 时 值为 value_1;当满足condition_2 时 值为 value_2… 否则值为 value_n
select(*cols)
选取某些列或进行转换形成新的dataframe
substr(startPos, length)
对字符串进行操作,从指定位置选取一定长度的子串,类似于切片
alias()
给列指定别名 同name()
df.select("*", when(df.age < 12, 'child').when((df.age < 16) & (df.age >=12), 'minor').otherwise('adult').alias('isadult')).show()
df.select(df.name.substr(1, 3).name("brief"),'*').show()
+-------+------+---+--------+--------------------+-------+
| name|gender|age| score| subject|isadult|
+-------+------+---+--------+--------------------+-------+
|Natalie| F| 16|80,56,72|English,Math,Chinese| adult|
| Logan| M| 18|82,88,91|English,Math,Chinese| adult|
| Joseph| M| 16|75,69,80|English,Math,Chinese| adult|
| Mason| F| 15|50,58,70|English,Math,Chinese| minor|
| Harper| M| 10|65,70,65|English,Math,Chinese| child|
|Anthony| F| 14|66,71,75|English,Math,Chinese| minor|
+-------+------+---+--------+--------------------+-------+
+-----+-------+------+---+--------+--------------------+
|brief| name|gender|age| score| subject|
+-----+-------+------+---+--------+--------------------+
| Nat|Natalie| F| 16|80,56,72|English,Math,Chinese|
| Log| Logan| M| 18|82,88,91|English,Math,Chinese|
| Jos| Joseph| M| 16|75,69,80|English,Math,Chinese|
| Mas| Mason| F| 15|50,58,70|English,Math,Chinese|
| Har| Harper| M| 10|65,70,65|English,Math,Chinese|
| Ant|Anthony| F| 14|66,71,75|English,Math,Chinese|