Background job for basic analysis; Resolves archivesunleashed#28
- sets up an ugly background job for basic analysis
- frames out mailers (I'll come back to this)
- updates rubocop because this is so ugly
- update gitignore because we're running spark now
- update README
Showing with 87 additions and 1 deletion.
- +3 −0 .gitignore
- +4 −0 .rubocop.yml
- +10 −1 README.md
- +1 −0 app/controllers/collections_controller.rb
- +40 −0 app/jobs/collections_spark_job.rb
- +5 −0 app/jobs/wasapi_files_download_job.rb
- +4 −0 app/jobs/wasapi_files_populate_job.rb
- +6 −0 app/mailers/user_mailer.rb
- +5 −0 test/mailers/previews/user_mailer_preview.rb
- +9 −0 test/mailers/user_mailer_test.rb
app/jobs/collections_spark_job.rb
@@ -0,0 +1,40 @@
# frozen_string_literal: true

# Methods for Basic Spark Jobs.
class CollectionsSparkJob < ApplicationJob
  queue_as :default
  require 'open-uri'

  # Runs after perform completes. The mailer is only framed out in this
  # commit ("I'll come back to this"), so `something` is a placeholder
  # that is not yet defined.
  after_perform do |_job|
    UserMailer.notify_collection_downloaded(something)
  end

  def perform(user_id, collection_id)
    spark_shell = ENV['SPARK_SHELL']
    WasapiFile.where('user_id = ? AND collection_id = ?', user_id, collection_id).each do |c|
      collection_path = ENV['DOWNLOAD_PATH'] +
                        '/' + c.account.to_s +
                        '/' + c.collection_id.to_s + '/'
      collection_warcs = collection_path + 'warcs/*.gz'
      collection_derivatives = collection_path + 'derivatives'
      collection_spark_jobs_path = collection_path + 'spark_jobs'
      collection_spark_job_file = collection_spark_jobs_path + '/' + c.collection_id.to_s + '.scala'
      FileUtils.mkdir_p collection_derivatives
      FileUtils.mkdir_p collection_spark_jobs_path
      # Scala script run by spark-shell: writes domain counts, plain text,
      # and a GEXF link graph derived from the collection's WARCs.
      spark_job = %(
        import io.archivesunleashed.spark.matchbox.{ExtractDomain, ExtractLinks, RemoveHTML, RecordLoader, WriteGEXF}
        import io.archivesunleashed.spark.rdd.RecordRDD._
        sc.setLogLevel("INFO")
        RecordLoader.loadArchives("#{collection_warcs}", sc).keepValidPages().map(r => ExtractDomain(r.getUrl)).countItems().saveAsTextFile("#{collection_derivatives}/all-domains")
        RecordLoader.loadArchives("#{collection_warcs}", sc).keepValidPages().map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))).saveAsTextFile("#{collection_derivatives}/all-text")
        val links = RecordLoader.loadArchives("#{collection_warcs}", sc).keepValidPages().map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString))).flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).replaceAll("^\\\\s*www\\\\.", ""), ExtractDomain(f._2).replaceAll("^\\\\s*www\\\\.", "")))).filter(r => r._2 != "" && r._3 != "").countItems().filter(r => r._2 > 5)
        WriteGEXF(links, "#{collection_derivatives}/links-for-gephi.gexf")
        sys.exit
      )
      File.open(collection_spark_job_file, 'w') { |file| file.write(spark_job) }
      cmd = spark_shell + ' --master local[12] --driver-memory 5G --conf spark.network.timeout=10000000 --packages "io.archivesunleashed:aut:0.12.1" -i ' + collection_spark_job_file + ' | tee ' + collection_spark_job_file + '.log'
      logger.info '[INFO] Executing: ' + cmd
      system(cmd)
    end
  end
end
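The diff also adds a single line to app/controllers/collections_controller.rb, presumably the enqueue call. A minimal sketch of how the job might be kicked off — only perform_later's argument order follows from the job's perform signature; the action name, current_user helper, params key, and redirect are assumptions about the surrounding app:

class CollectionsController < ApplicationController
  # Hypothetical action; current_user and params[:id] are assumptions.
  def analyze
    CollectionsSparkJob.perform_later(current_user.id, params[:id])
    redirect_to collections_path, notice: 'Spark analysis queued.'
  end
end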
app/mailers/user_mailer.rb
@@ -0,0 +1,6 @@
# frozen_string_literal: true

# Methods for User Mailer
class UserMailer < ApplicationMailer
  default from: 'notifications@archivesunleashed.org'
end
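The job's callback calls UserMailer.notify_collection_downloaded, which this commit deliberately leaves undefined. A sketch of what that method could look like — the user argument, subject line, and implicit view template are all assumptions; only the method name comes from the call site in CollectionsSparkJob:

class UserMailer < ApplicationMailer
  default from: 'notifications@archivesunleashed.org'

  # Hypothetical method matching the job's call site; assumes a user
  # record with an email attribute and a view at
  # app/views/user_mailer/notify_collection_downloaded.*.
  def notify_collection_downloaded(user)
    @user = user
    mail(to: @user.email, subject: 'Your collection analysis has finished')
  end
end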
test/mailers/previews/user_mailer_preview.rb
@@ -0,0 +1,5 @@
# frozen_string_literal: true

# Preview all emails at http://localhost:3000/rails/mailers/user_mailer
class UserMailerPreview < ActionMailer::Preview
  end
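Once that mailer method exists, the empty preview class could expose it at the URL in the comment above. A sketch, where User.first is purely a local-development assumption:

class UserMailerPreview < ActionMailer::Preview
  # Hypothetical preview, served at
  # /rails/mailers/user_mailer/notify_collection_downloaded once the
  # mailer method is defined; User.first is an assumption.
  def notify_collection_downloaded
    UserMailer.notify_collection_downloaded(User.first)
  end
end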
test/mailers/user_mailer_test.rb
@@ -0,0 +1,9 @@
# frozen_string_literal: true

require 'test_helper'

class UserMailerTest < ActionMailer::TestCase
  # test "the truth" do
  #   assert true
  # end
end
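The generated test is still the commented-out scaffold. A sketch of a first real assertion once the mailer is wired up — the users(:one) fixture and the expected recipient are assumptions about the app's test setup:

require 'test_helper'

class UserMailerTest < ActionMailer::TestCase
  # Hypothetical test; assumes a users(:one) fixture with an email.
  test 'notify_collection_downloaded addresses the right user' do
    user = users(:one)
    email = UserMailer.notify_collection_downloaded(user)
    assert_emails 1 do
      email.deliver_now
    end
    assert_equal [user.email], email.to
  end
end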