Read CSV
# cat people-comma.csv
name,age,job
Jorge,30,Developer
Bob,32,Developer
Start spark-shell
$SPARK_HOME/bin/spark-shell --conf spark.hadoop.fs.defaultFS=file:///
val df = spark.read.option("header", "true").option("inferSchema", "true").csv("people-comma.csv")
scala> df.show
+-----+---+---------+
| name|age| job|
+-----+---+---------+
|Jorge| 30|Developer|
| Bob| 32|Developer|
+-----+---+---------+
scala> df.printSchema
root
|-- name: string (nullable = true)
|-- age: integer (nullable = true)
|-- job: string (nullable = true)
Write ORC
# without compression
df.write.option("compression", "none").orc("orcdir")
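As a quick optional check, the ORC output directory can be read back in the same shell:
val orcDf = spark.read.orc("orcdir")
orcDf.show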
Read Parquet
scala> val df = spark.read.parquet("struct_example.parquet")
df: org.apache.spark.sql.DataFrame = [_1: struct<_1: bigint, _2: array<string>>]
scala> df.show
+--------------+
| _1|
+--------------+
| {0, [val_0]}|
| {1, [val_1]}|
| {2, [val_2]}|
| {3, [val_3]}|
| {4, [val_4]}|
| {5, [val_5]}|
| {6, [val_6]}|
| {7, [val_7]}|
| {8, [val_8]}|
| {9, [val_9]}|
+--------------+
scala> df.printSchema
root
|-- _1: struct (nullable = true)
| |-- _1: long (nullable = true)
| |-- _2: array (nullable = true)
| | |-- element: string (containsNull = true)
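The nested fields can be pulled out with dotted column names, for example:
df.select("_1._1", "_1._2").show(false)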
Write GZIP file
val df = spark.read.parquet("/mnt/DP_disk1/part-00095-356249a2-c285-42b9-8a18-5b10be61e0c4-c000.snappy.parquet")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/lineitem.parquet")
Gen Data
Expression
/**
* Selects a set of columns. This is a variant of `select` that can only select
* existing columns using column names (i.e. cannot construct expressions).
*
* {{{
* // The following two are equivalent:
* ds.select("colA", "colB")
* ds.select($"colA", $"colB")
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) : _*)
/**
* Selects a set of SQL expressions. This is a variant of `select` that accepts
* SQL expressions.
*
* {{{
* // The following are equivalent:
* ds.selectExpr("colA", "colB as newName", "abs(colC)")
* ds.select(expr("colA"), expr("colB as newName"), expr("abs(colC)"))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def selectExpr(exprs: String*): DataFrame = {
  select(exprs.map { expr =>
    Column(sparkSession.sessionState.sqlParser.parseExpression(expr))
  }: _*)
}
The following two are equivalent:
df.selectExpr("shuffle(c1)")
df.select(shuffle($"c1"))
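The DataFrame form uses the shuffle function from org.apache.spark.sql.functions; if it is not already in scope (spark-shell usually imports the functions object), add:
import org.apache.spark.sql.functions.shuffle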
val df = spark.range(2).selectExpr("CAST(id AS string) c1", "CAST(id AS string) c2")
val df = spark.range(1).selectExpr("NAMED_STRUCT('a', id, 'b', id + 1) AS s")
scala> df.show
+------+
| s|
+------+
|{0, 1}|
+------+
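Individual struct fields can then be selected with dotted names, for example:
df.select($"s.a", $"s.b").show()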
Write Data
val df = Seq(
  ("AB"),
  ("AB"),
  ("AB"),
  ("AB"),
  ("AB")
).toDF("col0_str")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_same_value")
val df = Seq(
  ("AB"),
  ("CD"),
  ("EF"),
  ("GH"),
  ("IJ")
).toDF("col0_str")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_variance_value")
# disable dictionary encoding
df.coalesce(1).write.option("compression","gzip").option("parquet.enable.dictionary","false").parquet("/mnt/DP_disk1/string_same_noDic")
Int64 data
val df = spark.range(10000000).select($"id" % 1000 as "key")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/Int64_1000dist_value")
val df = spark.range(10000000).select($"id" as "key")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/Int64_vary_value")
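As an optional sanity check, the cardinality of the generated key column can be confirmed after reading the file back:
import org.apache.spark.sql.functions.countDistinct
spark.read.parquet("/mnt/DP_disk1/Int64_1000dist_value").agg(countDistinct($"key")).show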
Array type data
import org.apache.spark.sql.types._
val df = spark.range(10).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("array('hello', concat('Hello_',key)) as col_10dist")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/array_vary")
val df= spark.read.format("parquet").load("/mnt/DP_disk1/array_vary.gz.parquet")
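To inspect the array contents, the column can be flattened, for example:
import org.apache.spark.sql.functions.explode
df.select(explode($"col_10dist")).show(false)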
Struct data type
import org.apache.spark.sql.types._
val df = spark.range(10000000).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("struct('hello', concat('Hello_',key)) as col_struct")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/struct_1000dist")
val df= spark.read.format("parquet").load("/mnt/DP_disk1/struct_1000dist_10M.gz.parquet")
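The struct fields can be expanded into top-level columns for inspection, for example:
df.select("col_struct.*").show(false)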
Map data type
import org.apache.spark.sql.types._
val df = spark.range(10000000).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("map('hello', concat('Hello_',key)) as col_map")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/map_1000dist")
val df= spark.read.format("parquet").load("/mnt/DP_disk1/map_1000dist_10M.gz.parquet")
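Map entries can be inspected with map_keys and element access, for example:
df.selectExpr("map_keys(col_map)", "col_map['hello']").show(false)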
10M-row dataset
import org.apache.spark.sql.types._
val df = spark.range(10000000).select($"id".cast(StringType)).selectExpr("concat('Hello_', id) as col_vary")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_vary_value")
val df = spark.range(10000000).select($"id" % 1000 as "key").select($"key".cast(StringType)).selectExpr("concat('Hello_',key) as col_10dist")
df.coalesce(1).write.option("compression","gzip").parquet("/mnt/DP_disk1/string_10dist_value")
Script to read SQL files
$SPARK_HOME/bin/spark-shell --master local[*]
spark.sql("show tables;").show
import scala.io.Source
val fileContents = Source.fromFile("queries/query24.sql").getLines.filter(!_.startsWith("--")).filter(!_.startsWith("LIMIT")).mkString(" ")
spark.sql(fileContents).show
val queryContent: String = Source.fromFile(s"${log_location}/tpch-queries/${q}.sql").mkString
val df = spark.sql(s"$queryContent")
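A rough sketch that extends this to several query files and times each run; the query names and the queries/ directory are assumptions:
import scala.io.Source
for (q <- Seq("query1", "query24")) {
  // strip comment lines, same as above, then run and time the query
  val sqlText = Source.fromFile(s"queries/$q.sql").getLines.filter(!_.startsWith("--")).mkString(" ")
  val start = System.nanoTime()
  spark.sql(sqlText).collect()
  println(s"$q finished in ${(System.nanoTime() - start) / 1e9} s")
}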
Spark-SQL: run query files via beeline
for query in 1 2 3 4 5 6 7 8 9 10 11 12 13 '14a' '14b' 15 16 17 18 19 20 21 22 '23a' '23b' '24a' '24b' 25 26 27 28 29 30 31 32 33 34 35 36 37 38 '39a' '39b' 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99; do ${SPARK_HOME}/bin/beeline -u jdbc:hive2://{IP}:10001 -i ./init -f /home/sparkuser/tpcds/tpcds_2_4.orig/q${query}.sql; done
Hadoop copy data
hadoop distcp hdfs://{IP}:9000/part_tpcds_decimal_10 hdfs://{IP}:9000/
Create empty Parquet, no compression
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
val schema = StructType(Array(
  StructField("a", LongType, true),
  StructField("b", DoubleType, true)
))
val dfParquet = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
  .write
  .format("parquet")
  .mode("overwrite")
  .option("compression", "none")
  .parquet("/mnt/DP_disk2/testingdata/empty")
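Reading the directory back should still show the schema, since recent Spark versions write a metadata-only Parquet file for empty output:
spark.read.parquet("/mnt/DP_disk2/testingdata/empty").printSchema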
UT createOrReplaceTempView example
test("Union show integer null") {
withSQLConf("spark.sql.adaptive.enabled" -> "false",
SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
val schema10 = StructType(Array(
StructField("a", IntegerType, true)
))
val rowData2 = Seq(Row(1))
val df1 = spark.createDataFrame(rowData2.asJava, schema10)
df1.createOrReplaceTempView("IntegerTab")
val df = sql("SELECT a from IntegerTab;")
df.show()
df.explain(false)
df.printSchema()
}
}
test("Test show") {
val lfile = Files.createTempFile("", ".parquet").toFile
lfile.deleteOnExit()
val parquetPath = lfile.getAbsolutePath
val fileFormat: String = "parquet"
val schema = StructType(Array(
StructField("double_field1", DoubleType, true),
StructField("int_field1", IntegerType, true),
StructField("string_field1", StringType, true)
))
val rowData = Seq(
Row(1.025, 1, "{\"a\":\"b\"}"),
Row(1.035, 2, null),
Row(1.045, 3, null)
)
var dfParquet = spark.createDataFrame(rowData.asJava, schema)
dfParquet.coalesce(1)
.write
.format("parquet")
.mode("overwrite")
.parquet(parquetPath)
spark.catalog.createTable("datatab", parquetPath, fileFormat)
val df = sql("SELECT double_field1 from datatab;")
df.show()
df.explain(false)
df.printSchema()
}
Create tables with spark-sql
https://spark.apache.org/docs/3.1.1/sql-ref-syntax-ddl-create-table-datasource.html
CREATE TABLE student (id INT, name STRING, age INT)
USING CSV
LOCATION 'file:///xxx/data/student';
INSERT INTO student VALUES (1, 'Json', 22), (2, 'Mali', 23);
CREATE TABLE student USING CSV AS SELECT * FROM {catalog}.{database}.student;
INSERT INTO student SELECT * FROM {catalog}.{database}.student;
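An optional quick check of the inserted rows, assuming the same warehouse/metastore is visible from spark-shell:
spark.sql("SELECT * FROM student").show()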
Create a table based on a bucket location
CREATE TABLE cosnstable(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquet");
CREATE TABLE cosnstableoverwrite(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquetoverwrite");
INSERT OVERWRITE cosnstableoverwrite SELECT * FROM cosnstable;
CREATE TABLE metacosnstable(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquet");
CREATE TABLE metacosnstableoverwrite(col0_str string) using parquet options (path "cosn://xxx/testdata/testparquetoverwrite");
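After the overwrite, an optional row-count check from spark-shell (assuming the same metastore) can confirm that the data landed:
spark.sql("SELECT count(*) FROM cosnstableoverwrite").show()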