Skip to content
Permalink
Browse files

Add additional Spark conf options:

- spark.rdd.compress
- spark.shuffle.compress
- spark.serializer
  • Loading branch information...
ruebot committed Apr 5, 2019
1 parent afa8361 commit 55b8aedce1e69b535c26373a0984831f6455f393
Showing with 7 additions and 1 deletion.
  1. +4 −1 app/jobs/spark_job.rb
  2. +3 −0 config/application.yml.example
@@ -38,6 +38,9 @@ def perform(user_id, collection_id)
spark_threads = ENV['SPARK_THREADS']
spark_heartbeat_interval = ENV['SPARK_HEARTBEAT_INTERVAL']
spark_driver_max_result_size = ENV['SPARK_DRIVER_MAXRESULTSIZE']
spark_rdd_compress = ENV['SPARK_RDD_COMPRESS']
spark_serializer = ENV['SPARK_SERIALIZER']
spark_shuffle_compress = ENV['SPARK_SHUFFLE_COMPRESS']
spark_job = %(
import io.archivesunleashed._
import io.archivesunleashed.app._
@@ -70,7 +73,7 @@ def perform(user_id, collection_id)
sys.exit
)
File.open(collection_spark_job_file, 'w') { |file| file.write(spark_job) }
- spark_job_cmd = spark_shell + ' --master local[' + spark_threads + '] --driver-memory ' + spark_memory_driver + ' --conf spark.network.timeout=' + spark_network_timeout + ' --conf spark.executor.heartbeatInterval=' + spark_heartbeat_interval + ' --conf spark.driver.maxResultSize=' + spark_driver_max_result_size + ' --packages "io.archivesunleashed:aut:' + aut_version + '" -i ' + collection_spark_job_file + ' | tee ' + collection_spark_job_file + '.log'
+ spark_job_cmd = spark_shell + ' --master local[' + spark_threads + '] --driver-memory ' + spark_memory_driver + ' --conf spark.network.timeout=' + spark_network_timeout + ' --conf spark.executor.heartbeatInterval=' + spark_heartbeat_interval + ' --conf spark.driver.maxResultSize=' + spark_driver_max_result_size + ' --conf spark.rdd.compress=' + spark_rdd_compress + ' --conf spark.serializer=' + spark_serializer + ' --conf spark.shuffle.compress=' + spark_shuffle_compress + ' --packages "io.archivesunleashed:aut:' + aut_version + '" -i ' + collection_spark_job_file + ' | tee ' + collection_spark_job_file + '.log'
logger.info 'Executing: ' + spark_job_cmd
system(spark_job_cmd)
domain_success = collection_derivatives + '/all-domains/output/_SUCCESS'
@@ -21,6 +21,9 @@ SPARK_MEMORY_DRIVER: "5G"
SPARK_NETWORK_TIMEOUT: "10000000"
SPARK_HEARTBEAT_INTERVAL: "600s"
SPARK_THREADS: "*"
SPARK_RDD_COMPRESS: "true"
SPARK_SHUFFLE_COMPRESS: "true"
SPARK_SERIALIZER: "org.apache.spark.serializer.KryoSerializer"
AUT_VERSION: "0.17.0"
EMAIL_SERVER_NAME: "smtp.something.ca"
EMAIL_DOMAIN: "something.ca"

0 comments on commit 55b8aed

Please sign in to comment.
You can’t perform that action at this time.