Spark 参数 | Notion

也可以在 $SPARK_HOME/conf/spark-defaults.conf 中配置
--conf spark.yarn.maxAppAttempts=4 # AM 尝试重新启动的重试次数

--conf spark.yarn.am.attemptFailuresValidityInterval=1h # AM 重置重试次数的周期

--conf spark.yarn.max.executor.failures=8 # executor 的 最大失败数量

--conf spark.yarn.executor.failuresValidityInterval=1h # executor 的失败重置周期

--queue realtime_queue # 提交到一个单独队列里, 避免任务阻塞

--conf spark.speculation=true # 开启推测执行: 当某个 task 运行明显慢于同 stage 的其他 task 时, 会另外启动一个副本 task 并行执行, 取先完成的结果

--executor-memory 2g # 指定单个 executor 的内存. 
# 如果在 yarn 中, 则会申请 2048 + max(2048 * 0.1, 384) = 2432 MB, 然后向上
# 取整为 YARN 最小分配单位 (如 yarn.scheduler.minimum-allocation-mb, 默认 1024 MB) 的整数倍, 即实际分配 3GB 的内存

--driver-memory 1g # 指定单个 driver 的内存. 计算方式同上

--conf spark.streaming.concurrentJobs=10 # 提高 job 并发数, 指定一个线程池的核心线程数

--conf spark.streaming.kafka.maxRetries=50 # 获取topic分区 leaders 和最新的 offsets 时的最大重试次数

--conf spark.shuffle.service.enabled=true # 启用External shuffle Service服务
	

--conf spark.shuffle.service.port=7337 # Shuffle Service默认服务端口，必须和yarn-site中的一致
	
--conf spark.dynamicAllocation.enabled=true # 开启动态资源分配
	

--conf spark.dynamicAllocation.minExecutors=0 # 每个Application最小分配的executor数
	
--conf spark.dynamicAllocation.maxExecutors=3 # 每个Application最大并发分配的executor数

--conf spark.dynamicAllocation.schedulerBacklogTimeout=1s

--conf spark.dynamicAllocation.sustainedSchedulerBacklogTimeout=5s

--conf spark.dynamicAllocation.executorIdleTimeout=60s # executor 空闲超过60s 则释放

# 如果配置了动态分配.则其值需要在 spark.dynamicAllocation.minExecutors 和 spark.dynamicAllocation.maxExecutors 之间
--conf spark.executor.instances=3

#  如果启用动态分配，则要运行executor的初始数量。如果设置了“--num-executors”（或“spark.executor.instances”）并且大于这个值，则会使用 --num-executors 的值作为初始 executor 数量。 如： max(initialExecutors = 3, --num-executors = 10) 取最大, 即 10
--conf spark.dynamicAllocation.initialExecutors=1

# 如果启用了动态分配，并且缓存数据块的executor已经空闲了超过这个时间，executor将被释放
--conf spark.dynamicAllocation.cachedExecutorIdleTimeout=60s

# 设置每秒从每个 Kafka 分区最多拉取的消息(记录)数，控制每个批次处理的数据量，保证数据均匀处理。
--conf spark.streaming.kafka.maxRatePerPartition=2000

# 对老年代GC 采用标记清除的方式
--conf spark.executor.extraJavaOptions=-XX:+UseConcMarkSweepGC

# 指定 spark 的 jars 包路径, 这样就无需每次都打包 jars 上传到 hdfs 上
--conf spark.yarn.archive=hdfs://txz-data0:9820/share/lib/spark2/jars

# 允许使用序列化buffer的最大值
--conf spark.kryoserializer.buffer.max=256m