Creating DataFrames from Structured Files

1. CSV

import org.apache.spark.sql.types.{DataTypes, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromCSV {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    //To infer the schema, every row is read and the resulting schema metadata is returned to the Driver

//    val schema = new StructType()
//      .add("name", DataTypes.StringType)
//      .add("age", DataTypes.IntegerType)
//      .add("fv", DataTypes.DoubleType)
//
//
//    val df: DataFrame = spark
//      .read
//      .schema(schema)
//      //.option("inferSchema", "true") //推断数据类型
//      .csv("data/user.csv")
//
//    //val df2 = df.toDF("name", "age", "fv")


    val df = spark.read
        .option("header", true)   //读取头信息当做列
        .option("inferSchema", "true")
        .csv("data/user.csv")

    df.printSchema()

    df.write.csv("out/csv")

    spark.stop()
  }
}
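If the CSV has no header row, inferSchema can work out the types but not the column names, so the usual alternative is to declare a StructType up front, as the commented-out block above hints at. A minimal sketch, assuming a headerless data/user.csv with name, age and fv columns:

import org.apache.spark.sql.types.{DataTypes, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromCSVWithSchema {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Declaring the schema avoids the extra pass over the data that inferSchema needs
    val schema = new StructType()
      .add("name", DataTypes.StringType)
      .add("age", DataTypes.IntegerType)
      .add("fv", DataTypes.DoubleType)

    val df: DataFrame = spark.read
      .schema(schema)
      .csv("data/user.csv")   // assumed headerless file

    df.printSchema()

    // Write back with a header line so the output is self-describing
    df.write.option("header", "true").csv("out/csv_with_header")

    spark.stop()
  }
}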

2. JDBC

import java.util.Properties

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

/**
 * Create a DataFrame via JDBC
 */
object CreateDataFrameFromJDBC {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()


    val props = new Properties()
    props.setProperty("driver", "com.mysql.jdbc.Driver")
    props.setProperty("user", "root")
    props.setProperty("password", "123456")

    val url = "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8"

    //The database table already has a schema;
    //the Driver connects to the database and fetches that schema metadata
    val df: DataFrame = spark.read.jdbc(url, "t_boy", props)

    df.printSchema()

    //df.show()
    //Write the filtered data back to MySQL
    df.where("fv >= 100").write.mode(SaveMode.Append).jdbc(url, "t_person", props)

    //Thread.sleep(100000000)

    spark.stop()
  }
}
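The read above pulls the whole t_boy table through a single JDBC connection. For larger tables, DataFrameReader.jdbc has an overload that takes a numeric partition column and bounds so the read is split across several connections. A minimal sketch, assuming t_boy has a numeric id column (that column, its bounds, and the partition count are assumptions, not taken from the original):

import java.util.Properties

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromJDBCPartitioned {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val props = new Properties()
    props.setProperty("driver", "com.mysql.jdbc.Driver")
    props.setProperty("user", "root")
    props.setProperty("password", "123456")

    val url = "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8"

    // Rows are split into 4 partitions by ranges of the assumed id column;
    // rows with id outside [1, 10000] still land in the first or last partition
    val df: DataFrame = spark.read.jdbc(
      url,
      "t_boy",
      "id",     // partition column (assumed to exist)
      1L,       // lowerBound
      10000L,   // upperBound
      4,        // numPartitions
      props
    )

    df.printSchema()
    println(df.rdd.getNumPartitions)   // 4

    spark.stop()
  }
}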

3. JSON

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object CreateDataFrameFromJSON {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    //To infer the schema, every line is read and the resulting schema metadata is returned to the Driver
    //val df: DataFrame = spark.read.json("data/user.json")

    val df: DataFrame = spark.read.format("json").load("data/user.json")

    df.printSchema()

    import spark.implicits._
    //df.where($"_corrupt_record".isNull).show()

    val df2: DataFrame = df.filter(row => {
      row.getString(0) == null
    }).select("name", "age", "fv", "gender")

    //SaveMode.ErrorIfExists: throw an error if the output path already exists (the default)
    //SaveMode.Ignore: if the output path already exists, skip the write without raising an error
    //SaveMode.Overwrite: delete the existing data, then write
    //SaveMode.Append: append the new data to the existing data
    //df2.write.mode(SaveMode.Append).json("out/boy")
    df2.write.mode(SaveMode.Append).format("json").save("out2/boy")


    spark.stop()
  }
}
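Filtering out corrupt rows by hand, as above, is one option; the JSON reader's mode option can also discard malformed lines during the read itself. A minimal sketch against the same data/user.json:

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromJSONDropMalformed {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // DROPMALFORMED silently drops lines that are not valid JSON,
    // so no _corrupt_record column appears and no manual filter is needed
    val df: DataFrame = spark.read
      .option("mode", "DROPMALFORMED")
      .json("data/user.json")

    df.printSchema()
    df.show()

    spark.stop()
  }
}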

4. Parquet

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromParquet {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df: DataFrame = spark.read.parquet("out/par")

    //df.printSchema()

    df.select("age").show()

    spark.stop()
  }
}
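This example assumes out/par already holds Parquet files written by an earlier job. A minimal sketch of producing that directory from the CSV data used earlier (the input path and columns are assumptions):

import org.apache.spark.sql.SparkSession

object WriteParquetFromCSV {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Read the CSV with inferred types, then persist it as Parquet.
    // Parquet stores the schema with the data, so the later read
    // needs no schema inference at all.
    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("data/user.csv")

    df.write.parquet("out/par")

    spark.stop()
  }
}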

5. ORC

import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDataFrameFromOrc {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df: DataFrame = spark.read.orc("out/orc")

    df.printSchema()

    df.select("age").show()

    spark.stop()
  }
}
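As with Parquet, out/orc has to exist before this example can read it. A minimal sketch of writing it, this time through the generic format(...).save(...) API (the paths are assumptions):

import org.apache.spark.sql.{SaveMode, SparkSession}

object WriteOrcFromCSV {

  def main(args: Array[String]): Unit = {

    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("data/user.csv")

    // Equivalent to df.write.orc("out/orc")
    df.write.mode(SaveMode.Overwrite).format("orc").save("out/orc")

    spark.stop()
  }
}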

 
