1. Basic DataFrame operations

import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
  val spark = SparkSession.builder()
    .appName("test")
    .master("local[*]")
    .getOrCreate()
  import spark.implicits._

  val people = spark.read.format("json").load("people.json")

  people.show()
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+

  people.printSchema()
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  people.select($"name").show()
  // +-------+
  // |   name|
  // +-------+
  // |Michael|
  // |   Andy|
  // | Justin|
  // +-------+

  // cast a column and keep its original name
  people.select($"name", $"age".cast("string").as("age")).printSchema()
  // root
  //  |-- name: string (nullable = true)
  //  |-- age: string (nullable = true)

  // column arithmetic; null propagates through the expression
  people.select($"name", ($"age" + 1).as("age")).show()
  // +-------+----+
  // |   name| age|
  // +-------+----+
  // |Michael|null|
  // |   Andy|  31|
  // | Justin|  20|
  // +-------+----+

  people.filter($"age" > 21).show()
  // +---+----+
  // |age|name|
  // +---+----+
  // | 30|Andy|
  // +---+----+

  people.groupBy("age").count().show()
  // +----+-----+
  // | age|count|
  // +----+-----+
  // |  19|    1|
  // |null|    1|
  // |  30|    1|
  // +----+-----+

  spark.stop()
}
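A few more everyday column operations on the same people DataFrame, as a minimal sketch (na.drop, sort, and selectExpr are standard DataFrame methods, not part of the original example):

people.na.drop().show()                             // drop rows that contain a null in any column
people.sort($"age".desc).show()                     // sort by age, largest first
people.selectExpr("name", "age + 1 as age").show()  // SQL expression syntax inside select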
2. Querying a DataFrame with SQL

val people = spark.read.format("json").load("people.json")
people.createOrReplaceTempView("tb")
spark.sql("select name, age from tb").show()
// +-------+----+
// |   name| age|
// +-------+----+
// |Michael|null|
// |   Andy|  30|
// | Justin|  19|
// +-------+----+
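A temp view created this way is scoped to the current SparkSession. If the view has to be shared across sessions, a global temp view can be used instead; it lives in the reserved global_temp database (a minimal sketch on the same DataFrame):

people.createGlobalTempView("tb_global")
spark.sql("select name, age from global_temp.tb_global").show()
// global temp views remain visible from a new session
spark.newSession().sql("select name, age from global_temp.tb_global").show()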
3. Creating a Dataset

// The case class must be declared outside the method (e.g. at top level)
// so that Spark can derive an encoder for it.
case class Person(name: String, age: Int)

val ccDs = Seq(Person("jason", 28), Person("dong", 27)).toDS()
ccDs.select("name").show()

val pDs = Seq(1, 2, 3).toDS()
pDs.map(_ + 1).show()
pDs.printSchema()
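An existing DataFrame can also be turned into a typed Dataset with as[...], which matches columns to case class fields by name. A sketch, assuming the people.json from section 1: age is cast to int to fit the Person field, and null ages are dropped first because Int cannot hold null.

val peopleDS = spark.read.format("json").load("people.json")
  .na.drop()                                      // Michael has a null age
  .select($"name", $"age".cast("int").as("age"))  // json infers age as long; case class wants Int
  .as[Person]
peopleDS.map(p => p.name + " is " + p.age).show()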
4. Inferring the schema via reflection

val spark = SparkSession.builder()
  .appName("test")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._
val sc = spark.sparkContext

// createOrReplaceTempView returns Unit, so keep the DataFrame in its own val
val peopleDF = sc.textFile("people.txt")
  .map(_.split(",", -1))
  .map(arr => Person(arr(0).trim, arr(1).trim.toInt))
  .toDF()
peopleDF.cache()
peopleDF.createOrReplaceTempView("people")

val teenagerDF = spark.sql("select * from people where age between 13 and 15").cache()

// access a Row column by index ...
teenagerDF.map(t => "name :" + t(0)).show()
// +-------------+
// |        value|
// +-------------+
// |name :Michael|
// +-------------+

// ... or by field name
teenagerDF.map(t => "name:" + t.getAs[String]("name")).show()
// +------------+
// |       value|
// +------------+
// |name:Michael|
// +------------+

// No built-in encoder exists for Map[String, Any], so register a Kryo one
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
teenagerDF.map(t => t.getValuesMap[Any](Seq("name", "age"))).collect().foreach(println)
// prints one Map per row, e.g. Map(name -> Michael, age -> ...)

spark.stop()
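Rather than pulling fields out of Row by position or name, the SQL result can be converted back into a typed Dataset with the same Person case class (a small sketch):

val teenagers = teenagerDF.as[Person]
teenagers.map(p => "name:" + p.name).show()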
5. Creating a DataFrame by programmatically specifying the schema

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val peopleRDD = sc.textFile("people.txt")
  .map(_.split(",", -1))
  .map(arr => Row(arr(0).trim, arr(1).trim))

val schemaString = "name age"
val structFields = schemaString.split("\\s+")
  .map(name => StructField(name, StringType, nullable = true))
val schema = StructType(structFields)

val peopleDF = spark.createDataFrame(peopleRDD, schema)
peopleDF.show()
// +-------+---+
// |   name|age|
// +-------+---+
// |Michael| 15|
// |   Andy| 30|
// | Justin| 19|
// +-------+---+
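Each StructField can also carry its own data type instead of treating every column as a string; the row values then need to be converted to match (a sketch with a typed age column):

import org.apache.spark.sql.types.IntegerType

val typedSchema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)
))
val typedRDD = sc.textFile("people.txt")
  .map(_.split(",", -1))
  .map(arr => Row(arr(0).trim, arr(1).trim.toInt))  // age converted to Int to match the schema
spark.createDataFrame(typedRDD, typedSchema).printSchema()
// root
//  |-- name: string (nullable = true)
//  |-- age: integer (nullable = true)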
6. Running SQL directly against a file

spark.sql("select name, age from json.`people.json`").show()
// +-------+----+
// |   name| age|
// +-------+----+
// |Michael|null|
// |   Andy|  30|
// | Justin|  19|
// +-------+----+
7. Schema merging

val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
squaresDF.write.parquet("data/test_table/key=1")

val cubesDF = spark.sparkContext.makeRDD(6 to 10).map(i => (i, i * i * i)).toDF("value", "cube")
cubesDF.write.parquet("data/test_table/key=2")

// key=1/key=2 are picked up as a partition column; mergeSchema unions the two schemas
val mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
mergedDF.printSchema()
// root
//  |-- value: integer (nullable = true)
//  |-- square: integer (nullable = true)
//  |-- cube: integer (nullable = true)
//  |-- key: integer (nullable = true)

mergedDF.show()
// +-----+------+----+---+
// |value|square|cube|key|
// +-----+------+----+---+
// |    4|    16|null|  1|
// |    5|    25|null|  1|
// |    9|  null| 729|  2|
// |   10|  null|1000|  2|
// |    1|     1|null|  1|
// |    2|     4|null|  1|
// |    3|     9|null|  1|
// |    6|  null| 216|  2|
// |    7|  null| 343|  2|
// |    8|  null| 512|  2|
// +-----+------+----+---+
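Because merging schemas is relatively expensive, it is off by default; besides the per-read option, it can be enabled globally through the spark.sql.parquet.mergeSchema setting:

spark.conf.set("spark.sql.parquet.mergeSchema", "true")
val mergedDF2 = spark.read.parquet("data/test_table")  // merges without the per-read option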
8. String concatenation on a DataFrame

import org.apache.spark.sql.Row

val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
squaresDF.createOrReplaceTempView("vs")
squaresDF.show()

// concatenate by pattern-matching each Row ...
squaresDF.map { case Row(value: Int, square: Int) => s"$value$square" }.toDF("vv").show()

// ... or with SQL's concat function
spark.sql("select concat(value, square) as vv from vs").show()
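The same thing can be written with the concat function from org.apache.spark.sql.functions, avoiding both the Row pattern match and the SQL string:

import org.apache.spark.sql.functions.concat

// int columns are implicitly cast to string, as in the SQL version above
squaresDF.select(concat($"value", $"square").as("vv")).show()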