Common ways of writing Spark SQL

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object taxi_csv {
  def main(args: Array[String]): Unit = {
    val csv_path = "datas/taxi.csv"

    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("csv")
      .getOrCreate()

    // Schema for the headerless taxi CSV file
    val schema = StructType(Array(
      StructField("tid", LongType),
      StructField("lan", StringType),
      StructField("len", StringType),
      StructField("time", StringType)
    ))

    // Register the raw data as a temp view for the SQL-style queries below
    spark.read
      .format("csv")
      .option("header", false)
      .schema(schema)
      .load(csv_path)
      .createOrReplaceTempView("taxi")

    // The same data loaded as a DataFrame for the API-style queries
    val taxiDF = spark.read
      .format("csv")
      .option("header", false)
      .schema(schema)
      .load(csv_path)

    // DataFrame API: extract the hour from the time column
    val hourDF = taxiDF.select(taxiDF.col("tid"), taxiDF.col("time").substr(0, 2).as("hour"))

    // Equivalent SQL
    spark.sql(
      """
        |SELECT tid, SUBSTR(time, 0, 2) AS hour
        |FROM taxi
      """.stripMargin).createOrReplaceTempView("taxi_hour")

    // DataFrame API: count trips per taxi per hour
    val hour_groupDF = hourDF.groupBy("tid", "hour").count()

    // Corresponds to the following SQL
    spark.sql(
      """
        |SELECT tid, hour, COUNT(1) AS count
        |FROM taxi_hour
        |GROUP BY tid, hour
      """.stripMargin).createOrReplaceTempView("taxi_hour_count")

    // Overwrite the view with the DataFrame result so both paths stay in sync
    hour_groupDF.createOrReplaceTempView("taxi_hour_count")

    // Rank taxis within each hour by trip count
    spark.sql(
      """
        |SELECT tid, hour, count,
        |ROW_NUMBER() OVER (PARTITION BY hour ORDER BY count DESC) AS rnk
        |FROM taxi_hour_count
      """.stripMargin).createOrReplaceTempView("taxi_hour_count_rnk")

    val taxi_hour_count_rnkDF = spark.sql(
      """
        |SELECT tid, hour, count,
        |ROW_NUMBER() OVER (PARTITION BY hour ORDER BY count DESC) AS rnk
        |FROM taxi_hour_count
      """.stripMargin)

    // Keep only the top 5 taxis per hour
    val result = taxi_hour_count_rnkDF.filter(taxi_hour_count_rnkDF.col("rnk") <= 5)

    result.coalesce(3).write
      .format("csv")
      .option("header", true)
      // If this line is not commented out, the output always ends up as 12 partition
      // files (one per hour value), no matter how the data was partitioned beforehand
      .partitionBy("hour")
      .mode(SaveMode.Overwrite)
      .save("/out_datas/taxi_groupbyhour")

    spark.stop()
  }
}

The usage of the related APIs is shown below, along with converting an RDD into a DataFrame.

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

/**
 * Interoperation between DataFrame and RDD
 */
object DataFrameRDDApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate()

    inferReflection(spark)

    // program(spark)

    spark.stop()
  }

  def program(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, the programmatic (explicit schema) way
    val rdd = spark.sparkContext.textFile("datas/student.txt")

    val infoRDD = rdd.map(_.split("\\|")).map(line => Row(line(0).toInt, line(1), line(2), line(3)))

    val structType = StructType(Array(StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("phone", StringType, true),
      StructField("email", StringType, true)))

    val infoDF = spark.createDataFrame(infoRDD, structType)
    infoDF.printSchema()
    infoDF.show()

    // Query with the DataFrame API
    infoDF.filter(infoDF.col("id") > 5).show
    infoDF.filter(infoDF.col("name").startsWith("a")).show

    // Query with SQL
    infoDF.createOrReplaceTempView("infos")
    spark.sql("select * from infos where id > 30").show()
  }

  def inferReflection(spark: SparkSession): Unit = {
    // RDD ==> DataFrame, the reflection (case class) way
    val rdd = spark.sparkContext.textFile("datas/student.txt")

    // Note: the implicit conversions must be imported for .toDF()
    import spark.implicits._
    val studentDF = rdd.map(_.split("\\|")).map(x => Student(x(0).toInt, x(1), x(2), x(3))).toDF()

    studentDF.show()
    studentDF.filter("substr(name,0,1) = 'a'").show()

    studentDF.filter(studentDF.col("id") > 6).show

    studentDF.filter(studentDF.col("name").startsWith("a")).show()

    studentDF.createOrReplaceTempView("student")

    spark.sql("select * from student where id > 18").show()
  }

  case class Student(id: Int, name: String, phone: String, email: String)

}

Problem: when saving data to the local (Windows) machine, the job fails with the error: HADOOP_HOME and hadoop.home.dir are unset

Solution: extract a Hadoop distribution to a local folder, put winutils.exe into its bin directory, set the HADOOP_HOME environment variable to that folder, then restart IDEA and carry on coding.
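If changing the environment variable is inconvenient, the same effect can usually be achieved from code by setting the hadoop.home.dir system property before the SparkSession is created. A minimal sketch, assuming the extracted Hadoop directory (containing bin\winutils.exe) sits at the hypothetical path C:\hadoop:

import org.apache.spark.sql.SparkSession

// Sketch only: point Hadoop at the local directory that contains bin\winutils.exe.
// "C:\\hadoop" is an assumed path on this machine, not a fixed requirement.
System.setProperty("hadoop.home.dir", "C:\\hadoop")

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("csv")
  .getOrCreate()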


Reading an external MySQL data source through Spark; the parameter configuration is as follows:

val mysqlDF = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://master:3306")
  .option("dbtable", "employees.departments")
  .option("user", "root")
  .option("password", "123456")
  // Put the MySQL connector JAR into the jars folder first,
  // otherwise a "class not found" error is thrown
  .option("driver", "com.mysql.jdbc.Driver")
  .load()
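As a short usage sketch (assuming the standard MySQL employees sample database, whose departments table has the columns dept_no and dept_name), the loaded DataFrame can then be used with either the DataFrame API or SQL:

// Inspect the JDBC-backed DataFrame like any other DataFrame
mysqlDF.printSchema()
mysqlDF.show()

// Or register it as a temp view and query it with SQL
// (dept_no / dept_name are assumed from the sample employees.departments table)
mysqlDF.createOrReplaceTempView("departments")
spark.sql("SELECT dept_no, dept_name FROM departments").show()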

