// Build (or reuse) a SparkSession and display selected columns of a tiny DataFrame.
val spark = SparkSession.builder().config(conf).getOrCreate();
// Two (name, age) tuples; column headers are assigned through toDF.
val people = List(("Jason", 34), ("Tom", 20))
val df = spark.createDataFrame(people).toDF("Name", "age");
// Project both columns explicitly and print them.
df.select("Name", "age").show();
// Derive a birth-year column from the age using a SQL expression via selectExpr.
val spark = SparkSession.builder().config(conf).getOrCreate();
val df = spark.createDataFrame(List(("Jason",34),("Tom",20))).toDF("Name","age");
// FIX: the output column alias was misspelled "Brith_Year"; corrected to "Birth_Year".
// NOTE(review): the reference year 2019 is hard-coded — year(current_date()) would
// keep the demo current, but would also change the printed output.
df.selectExpr("*","(2019-age) as Birth_Year").show();
// Count the distinct names with a SQL aggregate expression.
val spark = SparkSession.builder().config(conf).getOrCreate();
val rows = List(("Jason", 34), ("Tom", 20))
val df = spark.createDataFrame(rows).toDF("Name", "age");
// count(distinct ...) collapses the frame to a single aggregated row.
df.selectExpr("count(distinct(Name)) as count").show();
输出结果
+-----+
|count|
+-----+
|    2|
+-----+
filter(condition), where(condition)
// Keep only the rows whose age is below 30, using a SQL-string predicate.
val spark = SparkSession.builder().config(conf).getOrCreate();
val source = List(("Jason", 34), ("Tom", 20))
val df = spark.createDataFrame(source).toDF("Name", "age");
df.filter("age < 30").show();
输出结果
+----+---+
|Name|age|
+----+---+
| Tom| 20|
+----+---+
// Combine two predicates in one SQL-string filter ("and" of an equality on each column).
val spark = SparkSession.builder().config(conf).getOrCreate();
val df = spark.createDataFrame(List(("Jason", 34), ("Tom", 20))).toDF("Name", "age");
// Both conditions must hold, so only the Jason row survives.
df.filter("age == 34 and Name == 'Jason'").show();
// Find the row with the largest ID: sort descending, then keep just the first row.
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._
val rdd = spark.sparkContext.parallelize(List((1, "Jason"), (3, "Mike"), (4, "Tom")))
val df = rdd.toDF("ID", "Name");
// sort is an alias of orderBy; limit(1) keeps only the top row.
df.sort(col("ID").desc).limit(1).show();
输出结果
+---+----+
| ID|Name|
+---+----+
|  4| Tom|
+---+----+
union(otherDataFrame)
// Stack two frames with identical schemas, then print them ordered by ID descending.
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._
val first = spark.sparkContext.parallelize(List((1, "Jason"), (3, "Tom"))).toDF("ID", "Name");
val second = spark.sparkContext.parallelize(List((2, "Mike"), (5, "James"))).toDF("ID", "Name");
val combined = first.union(second)
combined.sort(col("ID").desc).show();
// Union two frames, order by ID descending, and append a derived column (ID + ID).
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._
val left = spark.sparkContext.parallelize(List((1, "Jason"), (3, "Tom"))).toDF("ID", "Name");
val right = spark.sparkContext.parallelize(List((2, "Mike"), (5, "James"))).toDF("ID", "Name");
val merged = left.union(right).sort(col("ID").desc)
// NOTE(review): the column name "New Name" contains a space, which will require
// backtick quoting in later SQL expressions — confirm this is intended.
merged.withColumn("New Name", col("ID") + col("ID")).show()
// Rename both columns; withColumnRenamed is a no-op if the old name does not exist.
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._
val numbers = List(("one", 1), ("Two", 2), ("Three", 3))
val df1 = spark.sparkContext.parallelize(numbers).toDF("Num", "Number");
// "Num" becomes "Number1" and "Number" becomes "Num1" (the renames effectively swap roles).
val renamed = df1.withColumnRenamed("Num", "Number1").withColumnRenamed("Number", "Num1")
renamed.show();
// Drop a column and print the remaining one sorted in descending order.
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._;
val jobs = List(("Jason", "DBA"), ("Jason", "BigData"), ("Jason", "Dev"))
val df = spark.sparkContext.parallelize(jobs).toDF("Name", "Job");
df.drop("Name").sort(col("Job").desc).show();
// Take a random sample of roughly 20% of the rows (without replacement by default).
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._;
val records = List(("Jason", "DBA"), ("Jason", "BigData"), ("Jason", "Dev"))
val df = spark.sparkContext.parallelize(records).toDF("Name", "Job");
// The fraction is a probability per row, not an exact count — output size varies.
df.sample(0.2).show();
// Print summary statistics (count, mean, stddev, min, max) for a numeric column.
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._;
val df = spark.sparkContext.parallelize(1 to 10).toDF("Num");
df.describe("Num").show();
// Demonstrate show's truncate flag on a long string value.
val spark = SparkSession.builder().config(conf).getOrCreate();
import spark.implicits._;
val df = spark.sparkContext.parallelize(List("abcdefghijklmn123123132")).toDF("Num");
// show(true) == show(20, truncate = true): cells longer than 20 chars are cut off.
df.show(20, true);