import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
object taxi_csv {
  def main(args: Array[String]): Unit = {
    val csv_path = "datas/taxi.csv"
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("csv")
      .getOrCreate()
    val schema = StructType(Array(
      StructField("tid", LongType),
      StructField("lan", StringType),
      StructField("len", StringType),
      StructField("time", StringType)
    ))
    // Register the CSV as a temp view for the SQL-based approach
    // (spark.read replaces the deprecated spark.sqlContext.read)
    spark.read
      .format("csv")
      .option("header", false)
      .schema(schema)
      .load(csv_path)
      .createOrReplaceTempView("taxi")
    // Load the same CSV again as a DataFrame for the API-based approach
    val taxiDF = spark.read
      .format("csv")
      .option("header", false)
      .schema(schema)
      .load(csv_path)
    // Extract the hour as the first two characters of the time column
    val hourDF = taxiDF.select(taxiDF.col("tid"), taxiDF.col("time").substr(0, 2).as("hour"))
    spark.sql(
      """
        |SELECT tid, SUBSTR(time, 0, 2) AS hour
        |FROM taxi
      """.stripMargin).createOrReplaceTempView("taxi_hour")
    val hour_groupDF = hourDF.groupBy("tid", "hour").count()
    // Equivalent to the SQL below
    spark.sql(
      """
        |SELECT tid, hour, COUNT(1) AS count
        |FROM taxi_hour
        |GROUP BY tid, hour
      """.stripMargin).createOrReplaceTempView("taxi_hour_count")
    // Note: this replaces the taxi_hour_count view registered just above
    // with the DataFrame version of the same aggregation
    hour_groupDF.createOrReplaceTempView("taxi_hour_count")
    // The same ranking query, once registered as a view and once captured as a DataFrame
    spark.sql(
      """
        |SELECT tid, hour, count,
        |ROW_NUMBER() OVER (PARTITION BY hour ORDER BY count DESC) AS rnk
        |FROM taxi_hour_count
      """.stripMargin).createOrReplaceTempView("taxi_hour_count_rnk")
    val taxi_hour_count_rnkDF = spark.sql(
      """
        |SELECT tid, hour, count,
        |ROW_NUMBER() OVER (PARTITION BY hour ORDER BY count DESC) AS rnk
        |FROM taxi_hour_count
      """.stripMargin)
    // Keep the top 5 taxis per hour
    val result = taxi_hour_count_rnkDF.filter(taxi_hour_count_rnkDF.col("rnk") <= 5)
    result.coalesce(3).write
      .format("csv")
      .option("header", true)
      // If this line is not commented out, the final output always ends up as
      // 12 partition files, regardless of how the data was partitioned upstream
      .partitionBy("hour")
      .mode(SaveMode.Overwrite)
      .save("/out_datas/taxi_groupbyhour")
  }
}
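For comparison, the same top-5-per-hour ranking can be expressed with the DataFrame API instead of SQL, using a window spec. A minimal sketch, assuming the hour_groupDF computed above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

// Rank taxis within each hour by trip count, descending
val byHour = Window.partitionBy("hour").orderBy(hour_groupDF.col("count").desc)
val top5PerHour = hour_groupDF
  .withColumn("rnk", row_number().over(byHour))
  .filter("rnk <= 5")
top5PerHour.show()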
Usage of the related APIs is shown below, along with converting an RDD to a DataFrame.
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Interoperating between DataFrames and RDDs
 */
object DataFrameRDDApp {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()
    inferReflection(spark)
    // program(spark)
    spark.stop()
  }
  def program(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, the programmatic approach: build Rows plus an explicit schema
    val rdd = spark.sparkContext.textFile("datas/student.txt")
    val infoRDD = rdd.map(_.split("\\|")).map(line => Row(line(0).toInt, line(1), line(2), line(3)))
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("phone", StringType, true),
      StructField("email", StringType, true)))
    val infoDF = spark.createDataFrame(infoRDD, structType)
    infoDF.printSchema()
    infoDF.show()
    // Query via the DataFrame API
    infoDF.filter(infoDF.col("id") > 5).show()
    infoDF.filter(infoDF.col("name").startsWith("a")).show()
    // Query via SQL
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where id > 30").show()
  }
  def inferReflection(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, the reflection approach: map rows to a case class
    val rdd = spark.sparkContext.textFile("datas/student.txt")
    // Note: the implicit conversions must be imported for toDF() to be available
    import spark.implicits._
    val studentDF = rdd.map(_.split("\\|")).map(x => Student(x(0).toInt, x(1), x(2), x(3))).toDF()
    studentDF.show()
    studentDF.filter("substr(name,0,1) = 'a'").show()
    studentDF.filter(studentDF.col("id") > 6).show()
    // select (not filter) is needed to expose the boolean as a named column
    studentDF.select(studentDF.col("name").startsWith("a").as("re_name")).show()
    studentDF.createOrReplaceTempView("student")
    spark.sql("select * from student where id > 18").show()
  }
  case class Student(id: Int, name: String, phone: String, email: String)
}
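The conversion also runs in the other direction. A minimal sketch of going from a DataFrame back to an RDD, assuming the studentDF built in inferReflection above:

// DataFrame ==> RDD: each element comes back as an org.apache.spark.sql.Row
val studentRDD = studentDF.rdd
studentRDD.map(row => (row.getAs[Int]("id"), row.getAs[String]("name")))
  .collect()
  .foreach(println)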
Problem: when saving data to the local machine on Windows, the job fails with the error HADOOP_HOME and hadoop.home.dir are unset.
Solution: unpack a Hadoop distribution to a local folder, put winutils.exe into its bin directory, set the HADOOP_HOME environment variable to that folder, then restart IDEA and carry on coding.
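As an alternative to the environment variable, hadoop.home.dir can be set programmatically before the SparkSession is created. A minimal sketch, assuming Hadoop was unpacked to the hypothetical path C:\hadoop:

// Assumption: bin\winutils.exe lives under C:\hadoop (hypothetical path)
System.setProperty("hadoop.home.dir", "C:\\hadoop")
val spark = SparkSession.builder().master("local[*]").appName("csv").getOrCreate()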
val mysqlDF = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://master:3306")
  .option("dbtable", "employees.departments")
  .option("user", "root")
  .option("password", "123456")
  // Put the MySQL connector jar into the jars folder first, otherwise a
  // "class not found" error is thrown; for Connector/J 8.x the driver
  // class is com.mysql.cj.jdbc.Driver
  .option("driver", "com.mysql.jdbc.Driver")
  .load()
The snippet above reads an external MySQL data source through Spark; the options carry the connection parameters.
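Writing back to MySQL goes through the same jdbc format on the writer side. A minimal sketch, assuming a hypothetical target table employees.departments_copy and the same connection details:

mysqlDF.write.format("jdbc")
  .option("url", "jdbc:mysql://master:3306")
  .option("dbtable", "employees.departments_copy") // hypothetical target table
  .option("user", "root")
  .option("password", "123456")
  .option("driver", "com.mysql.jdbc.Driver")
  .mode(SaveMode.Overwrite)
  .save()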