创建数据框
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
val builder = SparkSession
.builder()
.appName("learningScala")
.config("spark.executor.heartbeatInterval","60s")
.config("spark.network.timeout","120s")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.config("spark.kryoserializer.buffer.max","512m")
.config("spark.dynamicAllocation.enabled", false)
.config("spark.sql.inMemoryColumnarStorage.compressed", true)
.config("spark.sql.inMemoryColumnarStorage.batchSize", 10000)
.config("spark.sql.broadcastTimeout", 600)
.config("spark.sql.autoBroadcastJoinThreshold", -1)
.config("spark.sql.crossJoin.enabled", true)
.master("local[*]")
val spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@4bed240d
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7ccd8b62
val df = Seq(
(1,"male", 5),
(2,"male", 6),
(3,"female", 7),
(4,"female", 8),
(5,"female", 9),
(5,"female", 10)
).toDF("id","sex","age" )
df.show()
+---+------+---+
| id| sex|age|
+---+------+---+
| 1| male| 5|
| 2| male| 6|
| 3|female| 7|
| 4|female| 8|
| 5|female| 9|
| 5|female| 10|
+---+------+---+
df: org.apache.spark.sql.DataFrame = [id: int, sex: string ... 1 more field]
agg直接求中位数
df.groupBy("sex").agg(count("age").as("cnt"),
expr("percentile(age, array(0.5))[0]").alias("median")
).show()
+------+---+------+
| sex|cnt|median|
+------+---+------+
|female| 4| 8.5|
| male| 2| 5.5|
+------+---+------+
spark.sql求解
df.createOrReplaceTempView("tmp")
spark.sql("select sex, percentile_approx(age, 0.5) as median_age from tmp group by sex").show()
+------+----------+
| sex|median_age|
+------+----------+
|female| 8|
| male| 5|
+------+----------+
spark.sql的percentile_approx函数算出来的中位数似乎不是很准确,具体原因,暂不清楚
2021-01-27 雨后初晴于南京江宁区九龙湖