Reading from different data sources in Spark

DataFrame

DataFrame creation

  1. A DataFrame can read data from sources such as JSON, CSV, Parquet, HDFS, Hive tables, and JDBC. (Note: a distributed file system like HDFS works best; otherwise the same resource must exist at the same path on every machine, or be mounted via a network file system.)

    • Data read from JSON defaults to StringType/LongType, and the inferred column order is not the order in the file (Spark sorts the inferred fields alphabetically).
    • There are two ways to read a CSV file:
      • A file with a header is read via option("header", "true") (not great in practice): every column comes back as StringType.
      • A headerless file (really just ordinary text, very common and handy) is read with a user-defined schema: the custom schema gives each column a proper data type, but you must manually make sure the declared types are correct.
    • Reading from a table preserves the original data types.
    • Reading a Parquet file preserves the original data types as well; this is the recommended approach!
    • Types when reading from a database via JDBC: todo (a hedged sketch follows the code block below).
    • To build a DataFrame from an RDD, note that each record must first be mapped into a Row.
      import org.apache.spark.sql.{Row, SparkSession}
      import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

      private def generateDF(spark: SparkSession) = {
        // read a DataFrame from a JSON file; types are inferred, so row.getLong works
        // note: Spark sorts inferred JSON fields alphabetically, so the positional getters below rely on that order
        import spark.implicits._
        val df1 = spark.read.json("src/main/resources/data/people.json")
        df1.printSchema()
        df1.map(row => (row.getAs[String]("name"), row.getLong(0), row.getString(1), row.getLong(2), row.getString(3)))
          .foreach(row => println(row._1 + "|" + row._2 + "|" + row._3 + "|" + row._4 + "|" + row._5))
    
    
        // read a CSV file with a header via option("header", "true"); every column is inferred as StringType
        val df2 = spark.read.option("header", "true").csv("src/main/resources/data/people.csv")
        df2.printSchema()
        df2.map(row =>
          row.getAs[String](0) + "|" + row.getAs[String](1).toInt + "|" +
            row.getAs[String]("gender") + "|" + row.getAs[String](3).toInt)
          .foreach(str => println(str))
    
    
        // read a headerless CSV with a user-defined schema to control the field types
        val schema = StructType(Seq(
          StructField("name", StringType, true),
          StructField("age", IntegerType, true),
          StructField("gender", StringType, true),
          StructField("index", IntegerType, true)
        ))
        val df3 = spark.read.schema(schema).csv("src/main/resources/data/people1.csv")
        df3.printSchema()
        df3.map(row => (row.getString(0), row.getInt(1), row.getInt(3)))
          .foreach(tup => println(tup))
    
        // write and then read back a table (assumes the SparkSession has a usable catalog, e.g. Hive support)
        df3.write.saveAsTable("people")
        val df4 = spark.read.table("people")
        df4.printSchema()
    
        // read data from HDFS; load() uses the default format (Parquet) -- the path was left blank in the original
        val df5 = spark.read.load("")
    
        // read Parquet (preferably from HDFS); the original column types are preserved
        // (assumes a Parquet file already exists at this path, e.g. written earlier with df.write.parquet)
        val df6 = spark.read.parquet("src/main/resources/data/people")
        df6.printSchema()
        df6.map(row => (row.getInt(1), row.getString(0)))
          .foreach(tuple => println(tuple))
    
        // build a DataFrame from an RDD: map each line into a Row, then apply a schema
        val rdd = spark.sparkContext.textFile("src/main/resources/data/people1.csv")
        val rowRdd = rdd.map(line => line.split(",")).map(arr => Row(arr(0), arr(1).toInt, arr(2), arr(3).toInt))
        val schema2 = StructType( // renamed: `schema` is already defined above in this scope
          StructField("name", StringType, true) ::
            StructField("index", IntegerType, true) ::
            StructField("gender", StringType, true) ::
            StructField("age", IntegerType, true) :: Nil
        )
        val df7 = spark.createDataFrame(rowRdd, schema2)
        df7.printSchema()
        df7.show()
      }
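
The database/JDBC case is left as a todo in the list above. Below is a minimal sketch, not from the original post: it assumes a reachable MySQL database `test` with a `people` table and the MySQL JDBC driver on the classpath (the URL, table name, and credentials are all illustrative placeholders). Like Parquet and tables, JDBC reads preserve types: SQL column types are mapped to the corresponding Spark types.

      // read data from a database via JDBC (sketch; all connection details are placeholders)
      private def readFromJdbc(spark: SparkSession) = {
        val df8 = spark.read
          .format("jdbc")
          .option("url", "jdbc:mysql://localhost:3306/test") // hypothetical database
          .option("dbtable", "people")                       // hypothetical table
          .option("user", "root")                            // placeholder credentials
          .option("password", "secret")
          .load()
        df8.printSchema() // SQL column types map to Spark types (e.g. INT -> IntegerType)
      }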
    
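All of the snippets above assume an existing SparkSession. A minimal local driver might look like the following sketch (master("local[*]") is for local testing only; enableHiveSupport is optional and only matters if you want saveAsTable / read.table to go through a Hive metastore):

      val spark = SparkSession.builder()
        .appName("datasource-examples")
        .master("local[*]")  // local testing only
        .enableHiveSupport() // optional: back saveAsTable / read.table with a Hive metastore
        .getOrCreate()
      generateDF(spark)
      spark.stop()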

Reposted from: https://my.oschina.net/xd03122049/blog/1827369
