使用 split 和 explode 把 DataFrame 中一行的内容拆分成多行(每个单词一行)。
脚本:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@author:
@contact:
@time:
"""
from __future__ import print_function
from pyspark.sql import SparkSession
import os, time
from pyspark.sql import Row
from pyspark.sql.functions import split, explode
if __name__ == "__main__":
    # Demo: turn one DataFrame row holding a sentence into one row per word.
    # SPARK_HOME must be set before the session is created.
    os.environ['SPARK_HOME'] = "E:/data_page/spark-2.0.2-bin-hadoop2.7"
    spark = SparkSession.builder.appName("test").master("local[2]").getOrCreate()
    try:
        datas = ["hi I love you", "hello ni hao", "ni hao"]
        sc = spark.sparkContext
        rdd = sc.parallelize(datas)
        # Wrap each sentence in a Row so toDF() produces a DataFrame with a
        # single column named "text".
        df = rdd.map(lambda x: Row(text=x)).toDF()
        df.show()
        # split() breaks each sentence on single spaces into an array and
        # explode() emits one output row per array element in "textNew".
        # The filter is applied to the source column "text", so it removes
        # rows whose original sentence is the empty string.
        df_split = df.withColumn('textNew', explode(split('text', ' '))).where('text != ""')
        df_split.show()
        # Short pause before the session is torn down.
        time.sleep(1)
    finally:
        # Always release the Spark session, even if a step above raised.
        spark.stop()
结果如下:
+-------------+
|         text|
+-------------+
|hi I love you|
| hello ni hao|
|       ni hao|
+-------------+
+-------------+-------+
|         text|textNew|
+-------------+-------+
|hi I love you|     hi|
|hi I love you|      I|
|hi I love you|   love|
|hi I love you|    you|
| hello ni hao|  hello|
| hello ni hao|     ni|
| hello ni hao|    hao|
|       ni hao|     ni|
|       ni hao|    hao|
+-------------+-------+