20240801 大数据批处理-SPARK 3.5(高效率)

SPARK SQL

通过API方式可以灵活地实现业务需求,复用现有代码,搭建自定义的应用框架,有其不可替代的应用场景。

在API的基础上,日常的简单的业务需求应该通过更加高效的方式实现, SPARK SQL。(*各大厂商通过SQL已经成功改变了大数据的开发范式)

数据接入:

数据源数据接入
JDBC
//JDBC 数据源没有现成的 Source,需要自定义一个 DefaultSource
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions;
import org.apache.spark.sql.sources.BaseRelation;
import org.apache.spark.sql.sources.SchemaRelationProvider;
import org.apache.spark.sql.types.StructType;
import scala.collection.immutable.Map;

/**
 * Custom data-source entry point that lets {@code USING org.apache.spark.examples.poc.source.jdbc}
 * resolve to Spark's built-in JDBC relation provider.
 *
 * <p>The built-in JDBC provider implements only {@code RelationProvider}; this subclass also
 * implements {@link SchemaRelationProvider} so a {@code CREATE TABLE ... (cols)} statement with an
 * explicit column list is accepted by the resolver.
 */
public class DefaultSource extends org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider implements SchemaRelationProvider {
    @Override
    public BaseRelation createRelation(SQLContext sqlContext, Map<String, String> parameters, StructType schema) {
        // The user-supplied schema is intentionally discarded: the delegated JDBC provider
        // infers the schema from the database metadata itself.
        // NOTE(review): if the declared columns diverge from the JDBC-inferred schema, the
        // declared ones silently lose — confirm this is acceptable for the PoC.
        return super.createRelation(sqlContext, parameters);
    }
}

// Register a JDBC-backed temporary table through the custom source defined above.
// The parenthesized subquery is pushed down to MySQL as the JDBC "dbtable".
// WARNING(review): credentials are hard-coded for the PoC — do not ship real secrets this way.
spark.sql("CREATE TEMPORARY TABLE LINE_ITEM_TBL " +
        "USING org.apache.spark.examples.poc.source.jdbc " +
        "OPTIONS (" +
        "  url 'jdbc:mysql://localhost:33306/spark' " +
        ", dbtable '(select * from poc_1m.lineitem limit 10) as t' " +
        ", user 'root' " +
        ", password 'root' " +
        // partitionColumn + bounds + numPartitions enable parallel reads: Spark issues 8
        // range-predicated queries over l_orderkey in [0, 1000000).
        ", partitionColumn 'l_orderkey' " +
        ", lowerBound 0" +
        ", upperBound 1000000 " +
        ", numPartitions 8 " +
        " ) ");
REDIS
// Register a Redis-backed temporary table via the spark-redis connector.
// 'table' is the Redis key prefix, 'key.column' the column stored as the key,
// 'dbNum' the Redis logical database index.
// NOTE(review): requires the spark-redis package on the classpath — confirm the
// version matches Spark 3.5.
spark.sql(
  """
    |CREATE TEMPORARY TABLE lineitem_info (l_orderkey STRING, ext_info STRING)
    |    USING org.apache.spark.sql.redis OPTIONS (
    |    table 'lineitem_info',
    |    key.column 'l_orderkey',
    |    host 'localhost',
    |    port '6379',
    |    dbNum 2
    |    )
  """.stripMargin)

// Read back a small sample from the Redis-backed table and report its size.
val redisSample = spark.sql(
  """
    |select * from lineitem_info limit 10
  """.stripMargin)
redisSample.show()
println(redisSample.count())
CSV
// Register a CSV file as a temporary table with an explicit schema.
// header 'true'    — skip the first line as a header row.
// multiLine 'true' — quoted fields may span multiple physical lines.
// FIX: LOCATION now uses single quotes, matching every other snippet in this file.
// NOTE(review): the LOCATION is an absolute Windows path — adjust per environment.
spark.sql(
  """
    |CREATE TEMPORARY TABLE people_tbl (name STRING, age STRING, job STRING)
    | USING CSV
    | OPTIONS (
    | header 'true',
    | multiLine 'true'
    | )
    | LOCATION 'file:///D:/GitRepoSpark/spark/examples/src/main/resources/people2.csv'
    |""".stripMargin)
spark.sql("select * from people_tbl").show()
PARQUET
// Query the Parquet file directly by path — no table registration needed.
val parquetDf = spark.sql(
  """
    |SELECT * FROM parquet.`examples/src/main/resources/users.parquet`
    |""".stripMargin)
parquetDf.show()
parquetDf.schema.printTreeString()

// Register the Parquet file as a temporary table with an explicit schema.
// 'parquet.bloom.filter.enabled' is forwarded to the Parquet reader/writer as a
// Hadoop configuration option.
spark.sql(
  """
    |CREATE TEMPORARY TABLE user_parquet_tbl (name STRING, favorite_color STRING, favorite_numbers ARRAY<INT>)
    | USING parquet
    | OPTIONS (
    | 'parquet.bloom.filter.enabled' 'true'
    | )
    | LOCATION 'examples/src/main/resources/users.parquet'
    |""".stripMargin)
spark.sql("select * from user_parquet_tbl").show()
JSON
// Query the JSON file directly by path — no table registration needed.
val jsonDf = spark.sql(
  """
    |SELECT * FROM json.`examples/src/main/resources/people.json`
    |""".stripMargin)
jsonDf.show()
jsonDf.schema.printTreeString()

// Register the JSON file as a temporary table with an explicit schema,
// overriding schema inference (age is declared INT here).
spark.sql(
  """
    |CREATE TEMPORARY TABLE user_json_tbl (name STRING, age INT)
    | USING json
    | LOCATION 'examples/src/main/resources/people.json'
    |""".stripMargin)
spark.sql("select * from user_json_tbl").show()
ORC
// Query the ORC file directly by path — no table registration needed.
val orcDf = spark.sql(
  """
    |SELECT * FROM orc.`examples/src/main/resources/users.orc`
    |""".stripMargin)
orcDf.show()
orcDf.schema.printTreeString()

// Register the ORC file as a temporary table with an explicit schema.
// 'orc.bloom.filter.columns' asks the ORC writer to build a bloom filter for
// favorite_color; it affects writes, not this read-only example.
spark.sql(
  """
    |CREATE TEMPORARY TABLE user_orc_tbl (name STRING, favorite_color STRING, favorite_numbers ARRAY<INT>)
    | USING orc
    | OPTIONS (
    | 'orc.bloom.filter.columns' 'favorite_color'
    | )
    | LOCATION 'examples/src/main/resources/users.orc'
    |""".stripMargin)
spark.sql("select * from user_orc_tbl").show()
AVRO
// Query the Avro file directly by path — no table registration needed.
val avroDf = spark.sql(
  """
    |SELECT * FROM avro.`examples/src/main/resources/users.avro`
    |""".stripMargin)
avroDf.show()
avroDf.schema.printTreeString()

// Register the Avro file as a temporary table with an explicit schema.
// FIX: removed the copy-pasted 'orc.bloom.filter.columns' option — it is an ORC
// writer option and has no effect on the Avro data source.
spark.sql(
  """
    |CREATE TEMPORARY TABLE user_avro_tbl (name STRING, favorite_color STRING, favorite_numbers ARRAY<INT>)
    | USING avro
    | LOCATION 'examples/src/main/resources/users.avro'
    |""".stripMargin)
spark.sql("select * from user_avro_tbl").show()

数据计算:

SPARK SQL 兼容 HIVE SQL(参考:SQL Reference - Spark 3.5.1 Documentation)

(*和大的平台产品不同,主要是数据类型和BUILD-IN函数,需要参考平台的官方文档)

数据输出:

    // OVERWRITE MODE: replaces the whole contents of the sink table with the query result.
    spark.sql(
      """
        |INSERT OVERWRITE TABLE LINE_ITEM_TBL_SINK select * from LINE_ITEM_TBL_SRC
        |""".stripMargin
      ).show()

    // APPEND MODE (throws error if primary key conflicts )
    // NOTE(review): the duplicate-key error comes from the JDBC backend enforcing its
    // primary key, not from Spark — confirm against the target database.
    spark.sql(
      """
        |INSERT INTO TABLE LINE_ITEM_TBL_SINK select * from LINE_ITEM_TBL_SRC
        |""".stripMargin
    ).show()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值