1. Using Scala
- Build the DataFrame
import spark.implicits._
val data1 = Seq(
  ("0", "2002", "196", "1", "bai"),
  ("1", "4004", "192", "2", "wang"),
  ("0", "7007", "95", "3", "wang"),
  ("0", "4004", "4", "4", "wang"),
  ("0", "7007", "15", "5", "wang"),
  ("1", "0", "14", "6", "zhu"),
  ("0", "9009", "96", "7", "bai"),
  ("1", "9009", "126", "8", "bai"),
  ("0", "10010", "19219", "9", "wei")
).toDF("label", "AMOUNT", "Pclass", "MAC_id", "PAYER_CODE")
data1.show()
+-----+------+------+------+----------+
|label|AMOUNT|Pclass|MAC_id|PAYER_CODE|
+-----+------+------+------+----------+
| 0| 2002| 196| 1| bai|
| 1| 4004| 192| 2| wang|
| 0| 7007| 95| 3| wang|
| 0| 4004| 4| 4| wang|
| 0| 7007| 15| 5| wang|
| 1| 0| 14| 6| zhu|
| 0| 9009| 96| 7| bai|
| 1| 9009| 126| 8| bai|
| 0| 10010| 19219| 9| wei|
+-----+------+------+------+----------+
// Convert the numeric columns from string to double (col needs the functions import)
import org.apache.spark.sql.functions.col
val colname = "label,AMOUNT,Pclass,MAC_id"
val colName_list = colname.split(",")
var result_data = data1
for (i <- 0 until colName_list.size) {
  result_data = result_data.withColumn(colName_list(i), col(colName_list(i)).cast("double"))
}
val coltype = result_data.dtypes.toMap
Result:
coltype: scala.collection.immutable.Map[String,String] = Map(label -> DoubleType, Pclass -> DoubleType, AMOUNT -> DoubleType, MAC_id -> DoubleType, PAYER_CODE -> StringType)
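As an aside, the same per-column cast can be written without the mutable var by folding over the column names; a minimal sketch (result_data2 is just an illustrative name), assuming the data1 and colName_list defined above:

import org.apache.spark.sql.functions.col

// Thread the DataFrame through each column name, casting as we go;
// behaviorally identical to the for-loop above.
val result_data2 = colName_list.foldLeft(data1) { (df, c) =>
  df.withColumn(c, col(c).cast("double"))
}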
There are three ways to compute the max/min of a column (df below is the converted DataFrame from above):
val df = result_data
// Method 1: Use describe(); its summary values come back as strings, hence toString.toDouble
var dfmax1 = df.describe("label").filter("summary = 'max'").select("label").collect().map(_(0)).toList(0).toString.toDouble
var dfmin1 = df.describe("label").filter("summary = 'min'").select("label").collect().map(_(0)).toList(0).toString.toDouble
dfmax1: Double = 1.0
dfmin1: Double = 0.0
// Method 2: Use SQL
df.createOrReplaceTempView("df_table")
spark.sql("SELECT MAX(label) AS maxval FROM df_table").collect().map(_(0)).toList(0).toString.toDouble
spark.sql("SELECT MIN(label) AS minval FROM df_table").collect().map(_(0)).toList(0).toString.toDouble
// Method 3: Use agg() (max/min need the functions import)
import org.apache.spark.sql.functions.{max, min}
df.agg(max("label")).collect().map(_(0)).toList(0).toString.toDouble
df.agg(min("label")).collect().map(_(0)).toList(0).toString.toDouble
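When both extremes are wanted, one agg call can compute them in a single job, and the typed Row getters avoid the toString.toDouble round-trip; a sketch, assuming label was already cast to double as above:

import org.apache.spark.sql.functions.{max, min}

// One pass over the data; getDouble reads the typed values directly.
val row = df.agg(max("label"), min("label")).first()
val dfmax = row.getDouble(0)  // 1.0
val dfmin = row.getDouble(1)  // 0.0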
Computing sums
import org.apache.spark.sql.functions._
// 1. Use agg() to sum a single column
val sums1 = df.agg(sum("label")).collect().map(_(0)).toList(0).toString
Result:
sums1: String = 3.0
// 2. Return a DataFrame: sum several columns at once
val sums1: DataFrame = df.agg(sum("label").as("sum_col1"), sum("AMOUNT").as("sum_col2"))
sums1.show()
Result:
+--------+--------+
|sum_col1|sum_col2|
+--------+--------+
| 3.0| 52052.0|
+--------+--------+
// 3. Return a Row: sum several columns, then take the single result row with first()
val sums1 = df.agg(sum("label").as("sum_col1"), sum("AMOUNT").as("sum_col2")).first()
Result:
sums1: org.apache.spark.sql.Row = [3.0,52052.0]
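Since first() returns a Row, the individual sums can then be read back by the aliases given in agg(); a short usage sketch:

// Extract the aggregated values by column alias.
val sumLabel  = sums1.getAs[Double]("sum_col1")   // 3.0
val sumAmount = sums1.getAs[Double]("sum_col2")   // 52052.0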
Reference: https://www.runexception.com/q/1034
2. Using Python
See: https://cloud.tencent.com/developer/ask/70161