0. Preface
| Parameter | Default value / note |
|---|---|
| spark.default.parallelism | Default number of partitions in RDDs |
| spark.executor.cores | 1 in YARN mode (the usual default) |
| spark.files.maxPartitionBytes | 134217728 (128 MiB) |
| spark.files.openCostInBytes | 4194304 (4 MiB) |
| spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version | 1 (version of the algorithm tasks use to commit output data) |
[Key point] Spark SQL has its own counterparts to these parameters:
spark.sql.files.maxPartitionBytes=134217728 (128 MiB), the focus of this source-code analysis
spark.sql.files.openCostInBytes=4194304 (4 MiB), the focus of this source-code analysis
spark.default.parallelism = math.max(totalCoreCount.get(), 2)
The corresponding source locations are:
org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend#defaultParallelism
org.apache.spark.sql.internal.SQLConf#FILES_MAX_PARTITION_BYTES
org.apache.spark.sql.internal.SQLConf#FILES_OPEN_COST_IN_BYTES
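As a preview of the analysis, the sketch below shows how these three values combine when Spark SQL sizes file-scan partitions. It is modeled on org.apache.spark.sql.execution.datasources.FilePartition#maxSplitBytes in Spark 3.x; the helper name and the standalone object are illustrative, not the exact Spark code.

// Sketch of Spark SQL's file-scan partition sizing, modeled on
// org.apache.spark.sql.execution.datasources.FilePartition#maxSplitBytes.
// Illustrative only: the real code reads these values from SQLConf.
object MaxSplitBytesSketch {
  def maxSplitBytes(totalBytes: Long,                     // sum of all file sizes in the scan
                    fileCount: Long,
                    maxPartitionBytes: Long = 134217728L, // spark.sql.files.maxPartitionBytes
                    openCostInBytes: Long = 4194304L,     // spark.sql.files.openCostInBytes
                    defaultParallelism: Int = 2): Long = {
    // each file is padded with one "open cost" before dividing by the parallelism
    val totalWithOpenCost = totalBytes + fileCount * openCostInBytes
    val bytesPerCore = totalWithOpenCost / defaultParallelism
    // a split is capped at maxPartitionBytes, never smaller than openCostInBytes,
    // and shrinks when there are more cores than data
    math.min(maxPartitionBytes, math.max(openCostInBytes, bytesPerCore))
  }

  def main(args: Array[String]): Unit = {
    // e.g. 1 GiB across 8 files with parallelism 16 => splits of about 66 MiB
    println(maxSplitBytes(totalBytes = 1L << 30, fileCount = 8, defaultParallelism = 16))
  }
}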
1. Environment Setup
create database bicoredata;
CREATE TABLE bicoredata.dwd_start_log_dm(
`device_id` string,
`area` string,
`uid` string,
`app_v` string,
`event_type` string,
`os_type` string,
`channel` string,
`language` string,
`brand` string,
`entry` string,
`action` string,
`error_code` string
)
comment 'DWD user start-up log'
partitioned by (`dt` string)
stored as orc
tblproperties("orc.compress"="ZLIB")
location '/bicoredata/dwd_start_log_dm';
-- parse ODS logs into the DWD table
insert overwrite table bicoredata.dwd_start_log_dm
partition(dt='20220721')
select get_json_object(line, '$.attr.device_id'),
get_json_object(line, '$.attr.area'),
get_json_object(line, '$.attr.uid'),
get_json_object(line, '$.attr.app_v'),
get_json_object(line, '$.attr.event_type'),
get_json_object(line, '$.attr.os_type'),
get_json_object(line, '$.attr.channel'),
get_json_object(line, '$.attr.language'),
get_json_object(line, '$.attr.brand'),
get_json_object(line, '$.app_active.json.entry'),
get_json_object(line, '$.app_active.json.action'),
get_json_object(line, '$.app_active.json.error_code')
from
(
select split(str, ' ')[7] as line
from biods.ods_start_log
where dt='20220721'
) t;
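As a quick sanity check on the load, counting rows in the freshly written partition should return a non-zero figure (table and partition names as created above):

-- verify the freshly written partition
select count(1) from bicoredata.dwd_start_log_dm where dt='20220721';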
2. Code Setup
package org.example.sparksql

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SparkSqlHive {
  def main(args: Array[String]): Unit = {
    System.setProperty("HADOOP_USER_NAME", "root")
    // dynamic-allocation parameters only take effect on YARN (client or cluster mode)
    val ss = SparkSession.builder().master("yarn").appName("the test of SparkSession")
      .config("spark.submit.deployMode", "cluster") // the valid key is spark.submit.deployMode, not spark.deploy.mode
      .config("spark.hadoop.yarn.resourcemanager.hostname", "hadoop2") // the spark.hadoop. prefix forwards the value to the Hadoop Configuration