Skip to content
Permalink
Browse files

Add spark.kryoserializer.buffer.max to spark job.

- Sort spark conf variables in spark job
- Sort variables in application config example
  • Loading branch information...
ruebot committed Jun 26, 2019
1 parent b64a0e7 commit 57aa5af7ce1a0f80ab1cc9827cb785d9379620d6
Showing with 28 additions and 26 deletions.
  1. +6 −5 app/jobs/spark_job.rb
  2. +22 −21 config/application.yml.example
@@ -32,15 +32,16 @@ def perform(user_id, collection_id)
FileUtils.mkdir_p collection_derivatives
FileUtils.mkdir_p collection_spark_jobs_path
FileUtils.mkdir_p collection_derivatives + '/gephi'
spark_memory_driver = ENV['SPARK_MEMORY_DRIVER']
spark_network_timeout = ENV['SPARK_NETWORK_TIMEOUT']
aut_version = ENV['AUT_VERSION']
spark_threads = ENV['SPARK_THREADS']
spark_heartbeat_interval = ENV['SPARK_HEARTBEAT_INTERVAL']
spark_driver_max_result_size = ENV['SPARK_DRIVER_MAXRESULTSIZE']
spark_heartbeat_interval = ENV['SPARK_HEARTBEAT_INTERVAL']
spark_kryoserializer_buffer_max = ENV['SPARK_KRYOSERIALIZER_BUFFER_MAX']
spark_memory_driver = ENV['SPARK_MEMORY_DRIVER']
spark_network_timeout = ENV['SPARK_NETWORK_TIMEOUT']
spark_rdd_compress = ENV['SPARK_RDD_COMPRESS']
spark_serializer = ENV['SPARK_SERIALIZER']
spark_shuffle_compress = ENV['SPARK_SHUFFLE_COMPRESS']
spark_threads = ENV['SPARK_THREADS']
spark_job = %(
import io.archivesunleashed._
import io.archivesunleashed.app._
@@ -73,7 +74,7 @@ def perform(user_id, collection_id)
sys.exit
)
File.open(collection_spark_job_file, 'w') { |file| file.write(spark_job) }
spark_job_cmd = spark_shell + ' --master local[' + spark_threads + '] --driver-memory ' + spark_memory_driver + ' --conf spark.network.timeout=' + spark_network_timeout + ' --conf spark.executor.heartbeatInterval=' + spark_heartbeat_interval + ' --conf spark.driver.maxResultSize=' + spark_driver_max_result_size + ' --conf spark.rdd.compress=' + spark_rdd_compress + ' --conf spark.serializer=' + spark_serializer + ' --conf spark.shuffle.compress=' + spark_shuffle_compress + ' --packages "io.archivesunleashed:aut:' + aut_version + '" -i ' + collection_spark_job_file + ' 2>&1 | tee ' + collection_spark_job_file + '.log'
spark_job_cmd = spark_shell + ' --master local[' + spark_threads + '] --driver-memory ' + spark_memory_driver + ' --conf spark.network.timeout=' + spark_network_timeout + ' --conf spark.executor.heartbeatInterval=' + spark_heartbeat_interval + ' --conf spark.driver.maxResultSize=' + spark_driver_max_result_size + ' --conf spark.rdd.compress=' + spark_rdd_compress + ' --conf spark.serializer=' + spark_serializer + ' --conf spark.shuffle.compress=' + spark_shuffle_compress + ' --conf spark.kryoserializer.buffer.max=' + spark_kryoserializer_buffer_max + ' --packages "io.archivesunleashed:aut:' + aut_version + '" -i ' + collection_spark_job_file + ' 2>&1 | tee ' + collection_spark_job_file + '.log'
logger.info 'Executing: ' + spark_job_cmd
system(spark_job_cmd)
domain_success = collection_derivatives + '/all-domains/output/_SUCCESS'
@@ -9,34 +9,35 @@
# production:
# stripe_api_key: sk_live_EeHnL644i6zo4Iyq4v1KdV9H
# stripe_publishable_key: pk_live_9lcthxpSIHbGwmdO941O1XVU
TWITTER_KEY: "SOME KEY"
TWITTER_SECRET: "SOME SECRET"
AUT_VERSION: "0.17.0"
BASE_HOSTNAME: "cloud.archivesunleashed.org" #if in development mode: localhost:3000
BASE_HOST_URL: "https://cloud.archivesunleashed.org" #if in development mode: http://localhost:3000
DASHBOARD_PASS: "test"
DASHBOARD_USER: "test"
DJW_PASSWORD: "some password"
DJW_USERNAME: "some user"
DOWNLOAD_PATH: "tests/fixtures/files" #"/path/to/somewhere"
EMAIL_DOMAIN: "something.ca"
EMAIL_PASSWORD: "somepassword"
EMAIL_SERVER_NAME: "smtp.something.ca"
EMAIL_USERNAME: "someuser"
GITHUB_KEY: "SOME KEY"
GITHUB_SECRET: "SOME SECRET"
WASAPI_KEY: "32BITKEY SecureRandom.random_bytes(32)"
DOWNLOAD_PATH: "tests/fixtures/files" #"/path/to/somewhere"
SPARK_SHELL: "/path/to/spark/bin/spark-shell"
GRAPHPASS: "/path/to/graphpass/directory/graphpass"
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/SOME/IDENTIFIER" #https://slack.com/apps/A0F7XDUAZ-incoming-webhooks
SPARK_DRIVER_MAXRESULTSIZE: "4G"
SPARK_HEARTBEAT_INTERVAL: "600s"
SPARK_KRYOSERIALIZER_BUFFER_MAX: "2000m" #spark.kryoserializer.buffer.max must be less than 2048 MB
SPARK_MEMORY_DRIVER: "5G"
SPARK_NETWORK_TIMEOUT: "10000000"
SPARK_HEARTBEAT_INTERVAL: "600s"
SPARK_THREADS: "*"
SPARK_RDD_COMPRESS: "true"
SPARK_SHUFFLE_COMPRESS: "true"
SPARK_SERIALIZER: "org.apache.spark.serializer.KryoSerializer"
AUT_VERSION: "0.17.0"
EMAIL_SERVER_NAME: "smtp.something.ca"
EMAIL_DOMAIN: "something.ca"
EMAIL_USERNAME: "someuser"
EMAIL_PASSWORD: "somepassword"
GRAPHPASS: "/path/to/graphpass/directory/graphpass"
DJW_USERNAME: "some user"
DJW_PASSWORD: "some password"
DASHBOARD_USER: "test"
DASHBOARD_PASS: "test"
BASE_HOSTNAME: "cloud.archivesunleashed.org" #if in development mode: localhost:3000
BASE_HOST_URL: "https://cloud.archivesunleashed.org" #if in development mode: http://localhost:3000
SLACK_WEBHOOK_URL: "https://hooks.slack.com/services/SOME/IDENTIFIER" #https://slack.com/apps/A0F7XDUAZ-incoming-webhooks
SPARK_SHELL: "/path/to/spark/bin/spark-shell"
SPARK_SHUFFLE_COMPRESS: "true"
SPARK_THREADS: "*"
TWITTER_KEY: "SOME KEY"
TWITTER_SECRET: "SOME SECRET"
WASAPI_KEY: "32BITKEY SecureRandom.random_bytes(32)"

production:
SECRET_KEY_BASE: "some secret key base"

0 comments on commit 57aa5af

Please sign in to comment.
You can’t perform that action at this time.