不定期分享代码干货。
本篇：Spark 列转行（将多个列转换为行，类似 pandas 的 melt 操作）。
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext, Row, functions as F
from pyspark.sql.functions import array, col, explode, struct, lit
# Build a local Spark environment for the example below.
conf = SparkConf().setAppName("test").setMaster("local[*]")
# NOTE: SQLContext is deprecated since Spark 2.0 -- use the unified
# SparkSession entry point (already imported above) instead. It exposes
# the same DataFrame API (`spark.sql`, `spark.createDataFrame`, ...),
# so downstream code keeps working unchanged.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` bound for any code that needs the low-level RDD API.
sc = spark.sparkContext
# df: the source DataFrame; by: iterable of column names to exclude from the melt
def df_columns_to_line(df, by):
# Filter dtypes and split into column names and type description
df_a = df.select([col(c).cast("string") for c in df.columns])
cols, dtypes = zip(*((c, t) for (c, t) in df_a.dtypes if c not in by))
# Spark SQL supports only homogeneous columns
assert len(set(dtypes)) == 1, "All columns have to be of the same type"
# Create and explode an array of (column_name, column_value) str