import org.apache.spark.sql.functions._
import sqlContext.implicits._ // needed for toDF on a Range

// minusHours is assumed to be defined elsewhere, e.g. val minusHours = 24
(1 to minusHours).toDF("hour_num").withColumn("a", datediff(current_date(), current_date())).show()
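Note that datediff(current_date(), current_date()) is always 0; a more typical use of this pattern (a sketch, assuming Spark 1.5+ where expr, unix_timestamp and from_unixtime exist, with a hypothetical 24-hour range) derives one timestamp per row:

import org.apache.spark.sql.functions._
import sqlContext.implicits._

// One row per hour, with a timestamp hour_num hours in the past (SQL expression
// form, since the Scala helpers for this arithmetic vary across versions).
(1 to 24).toDF("hour_num")
  .withColumn("ts", expr("from_unixtime(unix_timestamp() - hour_num * 3600)"))
  .show()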
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

val schema =
  StructType(
    StructField("imps", IntegerType, true) ::
    StructField("clks", LongType, true) :: Nil)
// clks is declared LongType, so each Row must carry a Long (2L, 3L);
// plain Ints in that position fail at runtime with a ClassCastException.
val rows = List(
  Row(1, 2L), Row(2, 3L), Row(2, 2L), Row(3, 3L),
  Row(3, 2L), Row(3, 3L), Row(4, 2L), Row(5, 3L))
val df = sqlContext.createDataFrame(sc.parallelize(rows), schema)
df.groupBy("imps").agg(count("imps")).show
+----+-----------+
|imps|COUNT(imps)|
+----+-----------+
| 1| 1|
| 2| 2|
| 3| 3|
| 4| 1|
| 5| 1|
+----+-----------+
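The COUNT(imps) header is the auto-generated 1.x column name; aliasing the aggregate gives a cleaner header, and several aggregates can run in one pass (a minimal sketch over the same df):

// Alias the count and aggregate clks in the same groupBy.
df.groupBy("imps")
  .agg(count("imps").as("n"), sum("clks").as("total_clks"))
  .show()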
==============================================
import org.apache.spark.sql.functions._

val myDF = sqlContext.read.parquet("hdfs:/to/my/file.parquet") // parquetFile(...) is the deprecated pre-1.4 form
val coder: (Int => String) = (arg: Int) => if (arg < 100) "little" else "big"
val sqlfunc = udf(coder)
val coded = myDF.withColumn("Code", sqlfunc(col("Amt"))) // withColumn returns a new DataFrame; myDF is unchanged
I think withColumn is the right way to add a column, but this fails on v1.4.1.
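If withColumn misbehaves on 1.4.1, the same result can be projected with select instead (a sketch under the same assumptions about myDF and its Amt column):

// Project every existing column plus the derived one; equivalent to withColumn.
val coded2 = myDF.select(col("*"), sqlfunc(col("Amt")).as("Code"))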
======
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

val schema =
  StructType(
    StructField("imps", LongType, true) ::
    StructField("clks", LongType, true) ::
    StructField("cost", DoubleType, true) ::
    StructField("spending", DoubleType, true) :: Nil)
// val people =
//   sc.textFile("examples/src/main/resources/people.txt").map(
//     _.split(",")).map(p => Row(p(0), p(1).trim.toInt))
val row = Row(null, null, null, null) // nullable fields accept null in every position
val d1 = sqlContext.createDataFrame(sc.parallelize(List(row)), schema)
d1.write.parquet(savePath) // savePath must be defined as a writable output path

val schema2 =
  StructType(
    StructField("id", StringType, true) :: Nil)
val row2 = Row("1")
val d2 = sqlContext.createDataFrame(sc.parallelize(List(row2)), schema2) // schema2, not schema

d1.withColumn("c2", d2.col("id")) ==> that's no use! withColumn only accepts a Column derived from the DataFrame it is called on, so a column of d2 cannot be attached to d1 this way.
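One workaround (a sketch, assuming both frames have the same number of rows and that pairing rows by position is what you want) is to attach a row index through the RDD API and join on it:

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{LongType, StructField, StructType}

// Append a synthetic row_idx column so the two frames can be joined positionally.
def withRowIndex(df: DataFrame): DataFrame = {
  val indexed = df.rdd.zipWithIndex.map { case (r, i) => Row.fromSeq(r.toSeq :+ i) }
  val schemaWithIdx = StructType(df.schema.fields :+ StructField("row_idx", LongType, false))
  sqlContext.createDataFrame(indexed, schemaWithIdx)
}

val merged = withRowIndex(d1).join(withRowIndex(d2), "row_idx").drop("row_idx")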
case class UserDefinedFunction(f: AnyRef, dataType: DataType) extends Product with Serializable
Experimental
A user-defined function. To create one, use the udf functions in functions. As an example:
import sqlContext._
// Define a UDF that returns true or false based on some numeric score.
val predict = udf((score: Double) => if (score > 0.5) true else false)
// Projects a column that adds a prediction column based on the score column.
df.select( predict(df("score")) )
That succeeds!
import sqlContext.implicits._ // required so the $"..." column syntax below is recognized
dfOffline.select(
    $"member_id",
    formatDate($"create_time") as "costDate",
    memberCastOneDay($"goods_price", $"goods_num") as "cost")
  .registerTempTable("tmpMiddleData1")
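The registered table can then be queried with plain SQL; a minimal sketch (the aggregation itself is hypothetical):

sqlContext.sql(
  """SELECT member_id, costDate, SUM(cost) AS total_cost
    |FROM tmpMiddleData1
    |GROUP BY member_id, costDate""".stripMargin).show()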