Spark: Reading Various File Formats into a DataFrame and Writing It Back Out

package com.spark.sql
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{SaveMode, SparkSession}

object DataSource {
  System.setProperty("hadoop.home.dir","D:\\soft\\hadoop\\hadoop-2.7.3")
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("datasource").getOrCreate()

//    val spark = SparkSession.builder().master("local").appName("datasource")
//      .enableHiveSupport().getOrCreate()

//    val spark = SparkSession.builder().master("local").appName("datasource")
//        .config("fs.defaultFS", "hdfs://star.com")
//        .config("spark.sql.warehouse.dir", "hdfs://star.com:8020/test")
//        .enableHiveSupport().getOrCreate()

    // To use the Hive data source (step 5 below), add this configuration to the SparkSession:
//        val spark = SparkSession.builder().master("local").appName("datasource")
//            .config("fs.defaultFS", "hdfs://star.com")
//          .config("spark.sql.warehouse.dir", "hdfs://star.com:8020/test")
//            .config("javax.jdo.option.ConnectionURL", "jdbc:mysql://star.com:3306/hive?createDatabaseIfNotExist=true")
//            .config("javax.jdo.option.ConnectionDriverName","com.mysql.jdbc.Driver")
//            .config("javax.jdo.option.ConnectionUserName", "root")
//            .config("javax.jdo.option.ConnectionPassword", "root")
//          .enableHiveSupport().getOrCreate()
    //1. Read/write a text file
    //    txtFile(spark)
    //2. Read/write a CSV file
    //    csvFile(spark)
    //3. Read/write a JSON file
    //    jsonFile(spark)
    //4. Read/write Parquet files
    //    generateParquet(spark)
    //    parquetFile(spark)
    //5. Read/write the Hive data source
    //    hiveFile(spark)
    //6. Read/write a JDBC data source
    jdbcFile(spark)

    spark.close()
  }

  def txtFile(spark:SparkSession): Unit = {
    val file = "G:\\data\\people.txt"
    // Plain text file
    //    val df = spark.read.text(file)
    // or
    val df = spark.read.format("text").load(file)
    df.show()
    df.printSchema()
    df.write.text("G://result/txt")
    // or
    //    df.write.format("text").save("G://result/txt")
  }
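
  // Added sketch: spark.read.text yields a DataFrame with a single StringType
  // column named "value", while spark.read.textFile yields a Dataset[String],
  // which is more convenient for line-oriented transformations. The path
  // reuses the people.txt file above.
  def txtAsDataset(spark: SparkSession): Unit = {
    val ds = spark.read.textFile("G:\\data\\people.txt") // Dataset[String]
    ds.filter(_.nonEmpty).show()                         // drop blank lines, then display
  }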

  def csvFile(spark:SparkSession): Unit = {
    val file = "G:\\data\\people.csv"
    // Default options: "header" -> "false", "sep" -> ","
    // Set "inferSchema" -> "true" to infer column types from the data,
    // or supply an explicit schema via spark.read.schema(schema):
    val structType = StructType(List(StructField("name1", StringType), StructField("age1", StringType)))
    val df = spark.read.option("header", "true")/*.schema(structType)*/.csv(file)
    //    val df = spark.read.format("csv").load(file)
    df.show()
    df.printSchema()
    df.write.csv("G://result/csv")
    //    df.write.format("csv").save("d://result/csv")
  }
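
  // Added sketch: schema inference vs. the all-strings default. inferSchema
  // makes an extra pass over the file to guess column types; the path reuses
  // the people.csv file above.
  def csvWithInferredSchema(spark: SparkSession): Unit = {
    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true") // e.g. age becomes IntegerType instead of StringType
      .csv("G:\\data\\people.csv")
    df.printSchema()
  }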

  def jsonFile(spark:SparkSession): Unit = {
    val file = "G:\\data\\people.json"
    val df = spark.read.json(file)
    //    val df = spark.read.format("json").load(file)
    df.show()
    df.printSchema()
    //df.write.json("G://result/json")
    //    df.write.format("json").save("G://result/json")
  }
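
  // Note: spark.read.json expects JSON Lines input (one complete JSON object
  // per line) by default. For a file containing a single pretty-printed JSON
  // document, enable multiLine (available since Spark 2.2). A sketch reusing
  // the path above:
  def multiLineJson(spark: SparkSession): Unit = {
    val df = spark.read.option("multiLine", "true").json("G:\\data\\people.json")
    df.show()
  }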

  def generateParquet(spark:SparkSession): Unit = {
    // Generate a Parquet file: read a space-separated text file as CSV,
    // then save it (Parquet is the default output format)
    val file = "G:\\data\\users.txt"
    val df = spark.read.option("header", "true").option("sep"," ").csv(file)
    df.printSchema()
    print(df.count())
    df.show(5)
    df.write.save("G://result/parquet1")
  }

  def parquetFile(spark:SparkSession): Unit = {
    val file = "G:\\data\\users.parquet"
    val df = spark.read.load(file)
    //    val df = spark.read.parquet(file)
    //    val df = spark.read.format("parquet").load(file)
    df.show()
    df.printSchema()
    // SaveMode has four values: ErrorIfExists (the default, fails if the
    // target exists), Append, Overwrite, and Ignore
    df.write.mode(SaveMode.Append).save("G://result/parquet")
    //    df.write.parquet("G://result/parquet")
    //    df.write.format("parquet").save("G://result/parquet")
  }
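
  // Added sketch: partitioned Parquet output. Assumes users.parquet contains a
  // favorite_color column, as in the stock Spark example file; adjust the
  // column name for your data.
  def partitionedParquet(spark: SparkSession): Unit = {
    val df = spark.read.parquet("G:\\data\\users.parquet")
    df.write.mode(SaveMode.Overwrite)
      .partitionBy("favorite_color") // one subdirectory per distinct value
      .parquet("G://result/parquet_partitioned")
  }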

  def hiveFile(spark: SparkSession): Unit = {
    // 1. Using the default Derby-backed metastore
    // 1.1 The default database
    // If spark.sql.warehouse.dir is not set, a spark-warehouse directory is created
    // under the current working directory to hold the peoples table;
    // if it is set, the peoples table directory is created under /test on HDFS.
    val file = "G:/data/people.json"
//        spark.sql("drop table if exists peoples")
//        spark.sql("create table if not exists peoples(name string, age int) row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'")
//        spark.sql("load data local inpath '" + file + "' INTO TABLE peoples")
//
//        val df = spark.sql("select * from peoples where age < 20")
//        df.show()
//        df.write.saveAsTable("teenager")
//        val df1 = spark.table("teenager")
//        df1.show()
    // 1.2 Using a named database (test1)
    // If spark.sql.warehouse.dir is not set, a test1.db directory is created under
    // spark-warehouse in the current working directory;
    // if it is set, the test1.db directory is created under /test on HDFS.
//        spark.sql("create database if not exists test1")
//        spark.sql("use test1")
//        spark.sql("drop table if exists peoples")
//        spark.sql("create table if not exists peoples(name string, age int) row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'")
//        spark.sql("load data local inpath '" + file + "' INTO TABLE peoples")
//
//        val df = spark.sql("select * from peoples where age < 20")
//        df.show()
//    // saveAsTable: write the DataFrame out as a new table (created automatically)
//        df.write.saveAsTable("teenager")
//    // table: read a Hive table back into a DataFrame
//        val df1 = spark.table("teenager")
//        df1.show()
    // 2. Using a MySQL-backed metastore (requires the JDO config commented out in main)
    spark.sql("create database if not exists test1")
    spark.sql("use test1")
    spark.sql("drop table if exists peoples")
    spark.sql("create table if not exists peoples(name string, age int) row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'")
    spark.sql("load data local inpath '" + file + "' INTO TABLE peoples")

    val df = spark.sql("select * from peoples where age < 20")
    df.show()
    df.write.saveAsTable("teenager1")
    val df1 = spark.table("teenager1")
    df1.show()
  }
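
  // Note: saveAsTable uses SaveMode.ErrorIfExists by default, so re-running
  // hiveFile fails once teenager1 exists. A re-runnable variant:
  //   df.write.mode(SaveMode.Overwrite).saveAsTable("teenager1")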

  def jdbcFile(spark: SparkSession): Unit = {
    val peoplesDF = spark.read.format("jdbc")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("url", "jdbc:mysql://star.com:3306/cdadb?createDatabaseIfNotExist=true")
      .option("dbtable", "peoples")
      .option("user", "root")
      .option("password","root").load()

    peoplesDF.printSchema()
    peoplesDF.show()

    peoplesDF.write.format("jdbc")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("url", "jdbc:mysql://star.com:3306/cdadb?createDatabaseIfNotExist=true")
      .option("dbtable", "peoples1")
      .option("user", "root")
      .option("password","root")
      .save()
  }
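
  // Added sketch: the equivalent read via the (url, table, properties)
  // overload of DataFrameReader.jdbc, assuming the same MySQL instance and
  // credentials as above.
  def jdbcWithProperties(spark: SparkSession): Unit = {
    val props = new java.util.Properties()
    props.setProperty("driver", "com.mysql.jdbc.Driver")
    props.setProperty("user", "root")
    props.setProperty("password", "root")
    val df = spark.read.jdbc("jdbc:mysql://star.com:3306/cdadb", "peoples", props)
    df.show()
  }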
}