Spark Read/Write files

Read CSV
# cat people-comma.csv
name,age,job
Jorge,30,Developer
Bob,32,Developer
Start spark-shell
$SPARK_HOME/bin/spark-shell --conf spark.hadoop.fs.defaultFS=file:///
val df = spark.read.option("header", "true").option("inferSchema", "true").csv("people-comma.csv")
scala> df.show
+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
+-----+---+---------+


scala> df.printSchema
root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
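A minimal variant (not from the original session): read the same CSV with an explicit schema instead of inferSchema, which avoids the extra pass over the file.

import org.apache.spark.sql.types._

// assumed column types matching the inferred schema above
val csvSchema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("job", StringType, nullable = true)
))

val dfTyped = spark.read
  .option("header", "true")
  .schema(csvSchema)
  .csv("people-comma.csv")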
Write ORC
# no compression
df.write.option("compression", "none").orc("orcdir")
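To sanity-check the write, the ORC directory can be read back (a quick sketch, assuming the same working directory as above):

val orcDf = spark.read.orc("orcdir")
orcDf.show()
orcDf.printSchema()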
Read Parquet
scala> val df = spark.read.parquet("struct_example.parquet")
df: org.apache.spark.sql.DataFrame = [_1: struct<_1: bigint, _2: array<string>>]

scala> df.show
+--------------+
|            _1|
+--------------+
|  {0, [val_0]}|
|  {1, [val_1]}|
|  {2, [val_2]}|
|  {3, [val_3]}|
|  {4, [val_4]}|
|  {5, [val_5]}|
|  {6, [val_6]}|
|  {7, [val_7]}|
|  {8, [val_8]}|
|  {9, [val_9]}|
+--------------+

scala> df.printSchema
root
 |-- _1: struct (nullable = true)
 |    |-- _1: long (nullable = true)
 |    |-- _2: array (nullable = true)
 |    |    |-- element: string (containsNull = true)

Write GZIP-compressed Parquet
val df = spark.read.parquet("/mnt/DP_disk1/part-00095-356249a2-c285-42b9-8a18-5b10be61e0c4-c000.snappy.parquet")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/lineitem.parquet")
Gen Data
Expression
/**
   * Selects a set of columns. This is a variant of `select` that can only select
   * existing columns using column names (i.e. cannot construct expressions).
   *
   * {{{
   *   // The following two are equivalent:
   *   ds.select("colA", "colB")
   *   ds.select($"colA", $"colB")
   * }}}
   *
   * @group untypedrel
   * @since 2.0.0
   */
  @scala.annotation.varargs
  def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) : _*)
/**
   * Selects a set of SQL expressions. This is a variant of `select` that accepts
   * SQL expressions.
   *
   * {{{
   *   // The following are equivalent:
   *   ds.selectExpr("colA", "colB as newName", "abs(colC)")
   *   ds.select(expr("colA"), expr("colB as newName"), expr("abs(colC)"))
   * }}}
   *
   * @group untypedrel
   * @since 2.0.0
   */
  @scala.annotation.varargs
  def selectExpr(exprs: String*): DataFrame = {
    select(exprs.map { expr =>
      Column(sparkSession.sessionState.sqlParser.parseExpression(expr))
    }: _*)
  }

The following two are equivalent:

df.selectExpr("shuffle(c1)")
df.select(shuffle($"c1"))
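Note that the select form assumes the shuffle function from org.apache.spark.sql.functions is in scope and that c1 is an array column; a self-contained sketch:

import org.apache.spark.sql.functions.shuffle
val arrDf = Seq(Seq("a", "b", "c")).toDF("c1")
arrDf.selectExpr("shuffle(c1)").show()
arrDf.select(shuffle($"c1")).show()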
val df = spark.range(2).selectExpr("CAST(id AS string) c1", "CAST(id AS string) c2")
val df = spark.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s")
scala> df.show
+------+
|     s|
+------+
|{0, 1}|
+------+
write data
val df = Seq("AB", "AB", "AB", "AB", "AB").toDF("col0_str")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_same_value")

val df = Seq("AB", "CD", "EF", "GH", "IJ").toDF("col0_str")

df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_variance_value")

# disable dictionary encoding
df.coalesce(1).write.option("compression","gzip").option("parquet.enable.dictionary","false").parquet("/mnt/DP_disk1/string_same_noDic")
Int64 data
val df = spark.range(10000000).select($"id" % 1000 as "key")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/Int64_1000dist_value")



val df = spark.range(10000000).select($"id" as "key")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/Int64_vary_value")
Array type data
import org.apache.spark.sql.types._
val df = spark.range(10).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("array('hello', concat('Hello_',key)) as col_10dist")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/array_vary")
val df = spark.read.format("parquet").load("/mnt/DP_disk1/array_vary.gz.parquet")
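Once loaded, the array column can be queried directly; a small example of my own (assuming the col_10dist column written above):

df.selectExpr("col_10dist[0]", "size(col_10dist)").show()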
Struct data type
import org.apache.spark.sql.types._
val df = spark.range(10000000).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("struct('hello', concat('Hello_',key)) as col_struct")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/struct_1000dist")

val df = spark.read.format("parquet").load("/mnt/DP_disk1/struct_1000dist_10M.gz.parquet")
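Since struct() was called without field names, Spark names the fields col1, col2, ...; they can be accessed with dot syntax (my example):

df.selectExpr("col_struct.col1", "col_struct.col2").show()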
Map data type
import org.apache.spark.sql.types._
val df = spark.range(10000000).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("map('hello', concat('Hello_',key)) as col_map")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/map_1000dist")

val df = spark.read.format("parquet").load("/mnt/DP_disk1/map_1000dist_10M.gz.parquet")
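Map values can be looked up by key once the file is loaded (my example, using the 'hello' key written above):

df.selectExpr("col_map['hello']", "map_keys(col_map)").show()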
10M-row datasets
import org.apache.spark.sql.types._
val df = spark.range(10000000).select($"id".cast(StringType)).selectExpr("concat('Hello_', id) as col_vary")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_vary_value")
val df = spark.range(10000000).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("concat('Hello_',key) as col_10dist")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_10dist_value")
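An optional sanity check (my addition): read one of the generated directories back and confirm row count and cardinality.

val check = spark.read.parquet("/mnt/DP_disk1/string_10dist_value")
check.selectExpr("count(*)", "count(distinct col_10dist)").show()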
Script for reading SQL files
$SPARK_HOME/bin/spark-shell --master local[*]
spark.sql("show tables;").show
import scala.io.Source
val fileContents = Source.fromFile("queries/query24.sql").getLines.filter(!_.startsWith("--")).filter(!_.startsWith("LIMIT")).mkString(" ")
spark.sql(fileContents).show
val queryContent: String = Source.fromFile(s"${log_location}/tpch-queries/${q}.sql").mkString
val df = spark.sql(s"$queryContent")
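The same pattern can be looped over a whole benchmark; a hedged sketch for the 22 TPC-H queries (log_location and the q1.sql ... q22.sql file naming are assumptions carried over from the snippet above):

(1 to 22).foreach { i =>
  // read each query file and run it; .show() just forces execution here
  val queryContent = Source.fromFile(s"${log_location}/tpch-queries/q${i}.sql").mkString
  spark.sql(queryContent).show()
}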
Run SQL query files with beeline
for query in 1 2 3 4 5 6 7 8 9 10 11 12 13 '14a' '14b' 15 16 17 18 19 20 21 22 '23a' '23b' '24a' '24b' 25 26 27 28 29 30 31 32 33 34 35 36 37 38 '39a' '39b' 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99; do
  ${SPARK_HOME}/bin/beeline -u jdbc:hive2://{IP}:10001 -i ./init -f /home/sparkuser/tpcds/tpcds_2_4.orig/q${query}.sql
done

Hadoop copy data
hadoop distcp hdfs://{IP}:9000/part_tpcds_decimal_10 hdfs://{IP}:9000/
Create an empty Parquet file with no compression
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

val schema = StructType(Array(
      StructField("a", LongType, true),
      StructField("b", DoubleType, true)
    ))

val dfParquet = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)

dfParquet.coalesce(1)
  .write
  .format("parquet")
  .mode("overwrite")
  .option("compression", "none")
  .parquet("/mnt/DP_disk2/testingdata/empty")
UT createOrReplaceTempView example
  test("Union show integer null") {
    withSQLConf("spark.sql.adaptive.enabled" -> "false",
      SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {

      val schema10 = StructType(Array(
        StructField("a", IntegerType, true)
      ))
      val rowData2 = Seq(Row(1))
      val df1 = spark.createDataFrame(rowData2.asJava, schema10)
      df1.createOrReplaceTempView("IntegerTab")


      val df = sql("SELECT a from IntegerTab;")
      df.show()
      df.explain(false)
      df.printSchema()
    }
  }

  test("Test show") {
    val lfile = Files.createTempFile("", ".parquet").toFile
    lfile.deleteOnExit()
    val parquetPath = lfile.getAbsolutePath
    val fileFormat: String = "parquet"

    val schema = StructType(Array(
      StructField("double_field1", DoubleType, true),
      StructField("int_field1", IntegerType, true),
      StructField("string_field1", StringType, true)
    ))
    val rowData = Seq(
      Row(1.025, 1, "{\"a\":\"b\"}"),
      Row(1.035, 2, null),
      Row(1.045, 3, null)
    )

    var dfParquet = spark.createDataFrame(rowData.asJava, schema)
    dfParquet.coalesce(1)
      .write
      .format("parquet")
      .mode("overwrite")
      .parquet(parquetPath)

    spark.catalog.createTable("datatab", parquetPath, fileFormat)
    val df = sql("SELECT double_field1 from datatab;")
    df.show()
    df.explain(false)
    df.printSchema()
  }
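Both tests rely on suite-level imports and a test base class (for example QueryTest with SharedSparkSession, which provides spark, sql and withSQLConf); the imports they need are roughly:

// assumed imports; the exact set depends on the suite these tests live in
import java.nio.file.Files
import scala.collection.JavaConverters._
import org.apache.spark.sql.Row
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._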
Create tables with spark-sql

https://spark.apache.org/docs/3.1.1/sql-ref-syntax-ddl-create-table-datasource.html

CREATE TABLE student (id INT, name STRING, age INT) 
USING CSV
LOCATION 'file:///xxx/data/student';
INSERT INTO student VALUES (1, 'Json', 22), (1, 'Mali', 23);
CREATE TABLE student USING CSV AS SELECT * FROM {catalog}.{database}.student;
INSERT INTO student SELECT * FROM {catalog}.{database}.student;
Create a table on a bucket (object storage) location

CREATE TABLE cosnstable(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquet");
CREATE TABLE cosnstableoverwrite(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquetoverwrite");


INSERT OVERWRITE TABLE cosnstableoverwrite SELECT * FROM cosnstable;



CREATE TABLE metacosnstable(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquet");
CREATE TABLE metacosnstableoverwrite(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquetoverwrite");
