
Add textfilter background job; resolves #197.

- Add textfilter_job
- Update Graphpass job to chain to the textfilter job (sketched below)
- Update Collections Controller for textfilter download link
- Update routes for textfilter download path
- Update Collections show view to add download link
- Update rubocop config
- Update README
ruebot committed Nov 2, 2018
1 parent 9b68867 commit a0be87588dab36afdd7495e0baef76ed36d9d9e0
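
In short: GraphpassJob now ends by enqueuing TextfilterJob, and the notification mail plus the one-day-delayed CleanupJob move from GraphpassJob's `after_perform` into TextfilterJob's, so they only fire once the whole chain has finished. A minimal sketch of the resulting flow (method bodies elided; the real code is in the hunks below):

```
class GraphpassJob < ApplicationJob
  queue_as :graphpass

  after_perform do
    # now only closes out the Dashboard row for this job
  end

  def perform(user_id, collection_id)
    # ...graph derivative work, full-text combine...
    TextfilterJob.set(queue: :textfilter)
                 .perform_later(user_id, collection_id)
  end
end

class TextfilterJob < ApplicationJob
  queue_as :textfilter

  after_perform do |job|
    # notification mail and the 1-day-delayed CleanupJob live here now,
    # so they run only after the last job in the chain completes
  end

  def perform(user_id, collection_id)
    # ...filter full text by top domains, zip the results...
  end
end
```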
.rubocop.yml
@@ -16,15 +16,17 @@ Metrics/AbcSize:
     - app/helpers/collections_helper.rb
     - app/helpers/dashboards_helper.rb
     - app/helpers/wasapi_files_helper.rb
+    - app/jobs/cleanup_job.rb
     - app/jobs/graphpass_job.rb
     - app/jobs/spark_job.rb
-    - app/jobs/cleanup_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb

 Metrics/BlockLength:
   Exclude:
     - app/jobs/spark_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb
     - config/environments/development.rb

@@ -39,9 +41,10 @@ Metrics/CyclomaticComplexity:
 Metrics/LineLength:
   Exclude:
     - app/helpers/users_helper.rb
+    - app/jobs/cleanup_job.rb
     - app/jobs/graphpass_job.rb
     - app/jobs/spark_job.rb
-    - app/jobs/cleanup_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_seed_job.rb
     - app/mailers/user_mailer.rb
     - test/controllers/dashboards_controller_test.rb

@@ -55,9 +58,10 @@ Metrics/MethodLength:
     - app/helpers/collections_helper.rb
     - app/helpers/dashboards_helper.rb
     - app/helpers/wasapi_files_helper.rb
+    - app/jobs/cleanup_job.rb
     - app/jobs/graphpass_job.rb
     - app/jobs/spark_job.rb
-    - app/jobs/cleanup_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb

@@ -119,6 +123,7 @@ Style/NegatedIf:
 Style/Next:
   Exclude:
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb
README.md
@@ -50,7 +50,7 @@ bundle exec rake jobs:work
 Or to simulate production environment with Delayed::Job:
 ```
-bin/delayed_job --pool=spark,tasks:1 --pool=graphpass,tasks:1 --pool=seed,tasks:10 --pool=download,tasks:4 --pool=cleanup,tasks:2 start
+bin/delayed_job --pool=spark,tasks:1 --pool=graphpass,tasks:1 --pool=seed,tasks:10 --pool=download,tasks:4 --pool=cleanup,tasks:2 --pool=textfilter,tasks:2 start
 ```
 Then visit http://localhost:3000.
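
Each `--pool` flag spawns Delayed::Job workers for the queues listed before the colon (here each named queue plus the shared `tasks` queue), with the worker count after it. Jobs land in the new `textfilter` pool because they are enqueued onto that queue explicitly, as in the GraphpassJob hunk further down:

```
# Route the job to the :textfilter pool at enqueue time:
TextfilterJob.set(queue: :textfilter)
             .perform_later(user_id, collection_id)
```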
app/controllers/collections_controller.rb
@@ -9,8 +9,10 @@ class CollectionsController < ApplicationController
   before_action :graphml_path, only: %i[download_graphml]
   before_action :domains_path, only: %i[download_domains]
   before_action :fulltext_path, only: %i[download_fulltext]
+  before_action :textfilter_path, only: %i[download_textfilter]
   before_action :correct_user, only: %i[show download download_gexf
-                                        download_fulltext download_domains]
+                                        download_fulltext download_domains
+                                        download_textfilter]

   def download
     WasapiDownloadJob.set(queue: :download)

@@ -45,6 +47,13 @@ def download_fulltext
     )
   end

+  def download_textfilter
+    send_file(
+      @textfilter_path,
+      type: 'application/zip'
+    )
+  end
+
   def download_domains
     send_file(
       @domains_path,

@@ -100,6 +109,13 @@ def fulltext_path
                      '-fulltext.txt'
   end

+  def textfilter_path
+    @textfilter_path = ENV['DOWNLOAD_PATH'] + '/' + params[:format].to_s + '/' +
+                       params[:collection_id].to_s + '/' +
+                       params[:user_id].to_s + '/derivatives/filtered-text/' +
+                       params[:collection_id].to_s + '-filtered_text.zip'
+  end
+
   def correct_user
     @user = User.find(params[:user_id])
     redirect_to(root_url) unless current_user?(@user)
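
A quick integration test for the new action could look like the sketch below; the fixture, the auth helper, and the on-disk zip are all hypothetical, not part of this commit:

```
require 'test_helper'

# Hypothetical Minitest sketch -- fixture and helper names are invented.
class CollectionsControllerTest < ActionDispatch::IntegrationTest
  test 'download_textfilter streams the filtered-text zip' do
    user = users(:one)   # hypothetical fixture
    sign_in_as(user)     # hypothetical auth helper
    get user_collection_download_textfilter_path(user.id, 1, 'account-id')
    assert_response :success
    assert_equal 'application/zip', response.content_type
  end
end
```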
app/helpers/collections_helper.rb
@@ -21,6 +21,16 @@ def display_domains(user_id, collection_id, account)
     end
   end

+  def textfilter_path(user_id, collection_id, account)
+    collection_path = ENV['DOWNLOAD_PATH'] +
+                      '/' + account.to_s +
+                      '/' + collection_id.to_s + '/'
+    collection_derivatives = collection_path +
+                             user_id.to_s + '/derivatives'
+    collection_derivatives + '/filtered-text/' + collection_id.to_s +
+      '-filtered_text.zip'
+  end
+
   def gexf_path(user_id, collection_id, account)
     collection_path = ENV['DOWNLOAD_PATH'] +
                       '/' + account.to_s +
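
The helper rebuilds exactly the path the controller builds, just from explicit arguments instead of `params`; the show view uses it for its existence check. Keeping the two in sync by hand is fragile, so one possible refactor (hypothetical, not in this commit) is a single builder both call sites delegate to, with `File.join` in place of `'+'` concatenation:

```
# Hypothetical shared path builder for controller and helper:
module DerivativePaths
  def self.filtered_text_zip(user_id, collection_id, account)
    File.join(ENV['DOWNLOAD_PATH'], account.to_s, collection_id.to_s,
              user_id.to_s, 'derivatives', 'filtered-text',
              "#{collection_id}-filtered_text.zip")
  end
end
```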
app/jobs/graphpass_job.rb
@@ -4,11 +4,7 @@
 class GraphpassJob < ApplicationJob
   queue_as :graphpass

-  after_perform do |job|
-    UserMailer.notify_collection_analyzed(job.arguments.first,
-                                          job.arguments.second).deliver_now
-    CleanupJob.set(wait: 1.day).perform_later(job.arguments.first,
-                                              job.arguments.second)
+  after_perform do
     update_dashboard = Dashboard.find_by(job_id: job_id)
     update_dashboard.end_time = DateTime.now.utc
     update_dashboard.save

@@ -40,6 +36,8 @@ def perform(user_id, collection_id)
       logger.info 'Executing: ' + combine_full_text_output_cmd
       system(combine_full_text_output_cmd)
       FileUtils.rm_rf(collection_derivatives + '/all-text/output')
+      TextfilterJob.set(queue: :textfilter)
+                   .perform_later(user_id, collection_id)
     end
   end
 end
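
Note that `perform_later` only enqueues: GraphpassJob's `perform` returns immediately and a worker from the `:textfilter` pool picks the job up. Chaining at the very end of `perform` also means the combined full-text derivative is already on disk before TextfilterJob reads it. For debugging without workers, ActiveJob can run the same job inline:

```
# Enqueue for a :textfilter pool worker (what the commit does):
TextfilterJob.set(queue: :textfilter).perform_later(user_id, collection_id)

# Run synchronously in the current process (handy in a console):
TextfilterJob.perform_now(user_id, collection_id)
```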
app/jobs/textfilter_job.rb
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+# Methods for Basic Spark Jobs.
+class TextfilterJob < ApplicationJob
+  require 'csv'
+  queue_as :textfilter
+
+  after_perform do |job|
+    UserMailer.notify_collection_analyzed(job.arguments.first,
+                                          job.arguments.second).deliver_now
+    CleanupJob.set(wait: 1.day).perform_later(job.arguments.first,
+                                              job.arguments.second)
+    update_dashboard = Dashboard.find_by(job_id: job_id)
+    update_dashboard.end_time = DateTime.now.utc
+    update_dashboard.save
+  end
+
+  def perform(user_id, collection_id)
+    Dashboard.find_or_create_by!(
+      job_id: job_id,
+      user_id: user_id,
+      collection_id: collection_id,
+      queue: 'textfilter',
+      start_time: DateTime.now.utc
+    )
+
+    Collection.where('user_id = ? AND collection_id = ?', user_id, collection_id).each do |c|
+      collection_path = ENV['DOWNLOAD_PATH'] +
+                        '/' + c.account.to_s +
+                        '/' + c.collection_id.to_s + '/'
+      collection_derivatives = collection_path + c.user_id.to_s + '/derivatives'
+      collection_domains = collection_derivatives + '/all-domains/' +
+                           c.collection_id.to_s + '-fullurls.txt'
+      collection_fulltext = collection_derivatives + '/all-text/' +
+                            c.collection_id.to_s + '-fulltext.txt'
+      collection_filtered_text_path = collection_derivatives + '/filtered-text'
+      FileUtils.mkdir_p collection_filtered_text_path
+      unless File.zero?(collection_domains) || !File.file?(collection_domains)
+        text = File.open(collection_domains).read
+        csv_text = text.delete! '()'
+        csv = CSV.parse(csv_text, headers: false)
+        csv.take(10).each do |row|
+          # THIS IS UGLY.
+          # WE PROBABLY SHOULDN'T EXEC OUT TO GREP AND ZIP.
+          domain_textfilter = collection_filtered_text_path + '/' +
+                              collection_id.to_s + '-' + row[0].parameterize +
+                              '.txt'
+          grep_query = "'," + row[0] + ",'"
+          grep_command = '-a ' + grep_query + ' ' + collection_fulltext +
+                         ' > ' + domain_textfilter
+          `grep #{grep_command}`
+          filtered_text_zip = collection_filtered_text_path + '/' +
+                              collection_id.to_s + '-filtered_text.zip'
+          zip_command = '-j ' + filtered_text_zip + ' ' +
+                        collection_filtered_text_path + '/*.txt'
+          `zip #{zip_command}`
+        end
+      end
+    end
+  end
+end
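
The commit's own comment flags the shell-out to `grep` and `zip`. A pure-Ruby alternative, assuming the rubyzip gem were added as a dependency (it is not part of this commit), could look like:

```
require 'zip' # rubyzip gem -- a hypothetical added dependency

# Write the lines of the full-text file that mention a domain, without grep.
def filter_fulltext(fulltext_path, domain, out_path)
  File.open(out_path, 'w') do |out|
    File.foreach(fulltext_path) do |line|
      out.write(line) if line.include?(",#{domain},")
    end
  end
end

# Bundle the per-domain text files into one zip, without the zip binary.
def zip_filtered_text(zip_path, txt_paths)
  Zip::File.open(zip_path, Zip::File::CREATE) do |zipfile|
    txt_paths.each { |txt| zipfile.add(File.basename(txt), txt) }
  end
end
```

This also lifts the zip step out of the per-domain loop; as committed, the archive is rebuilt once per domain.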
app/views/collections/show.html.erb
@@ -81,7 +81,10 @@
       <%= link_to('Domains', user_collection_download_domains_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A csv file that explains the distribution of domains within the web archive.') %>
     <% end %>
     <% unless !File.exists? fulltext_path(@user.id, @collection.id, @collection.account) %>
-      <%= link_to('Full Text', user_collection_download_fulltext_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A txt file that contains the plain text extracted from HTML documents within the web archive. You can find the crawl date, full URL, and the plain text of each page within the file.') %>
+      <%= link_to('Full Text', user_collection_download_fulltext_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A text file that contains the plain text extracted from HTML documents within the web archive. You can find the crawl date, full URL, and the plain text of each page within the file.') %>
     <% end %>
+    <% unless !File.exists? textfilter_path(@user.id, @collection.id, @collection.account) %>
+      <%= link_to('Text by Domains', user_collection_download_textfilter_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A zip file that contains the text of the top ten domains within a web archive, each within their own text file. Within the files you can find the crawl date, full URL, and the plain text of each page within the file.') %>
+    <% end %>
     <p><small><%=link_to("You can find information about how to use these files here.", "/derivatives")%></small></p>
   </div>
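
Two small readability notes on the guard the new block copies from the lines above it: `File.exists?` is the deprecated alias of `File.exist?`, and `unless !...` is a double negative. An equivalent positive guard (a sketch, not part of the commit):

```
# Same check as 'unless !File.exists? ...', stated positively:
if File.exist? textfilter_path(@user.id, @collection.id, @collection.account)
  # render the 'Text by Domains' button
end
```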
config/routes.rb
@@ -12,6 +12,7 @@
         get :download_gexf
         get :download_graphml
         get :download_fulltext
+        get :download_textfilter
       end
     end
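
This member route generates the `user_collection_download_textfilter_path` helper used in the show view above. The third argument supplied there (`@collection.account`) becomes the URL's trailing format segment, which is why the controller's `textfilter_path` reads the account out of `params[:format]`:

```
# As called from the show view; the account id rides in as params[:format]:
user_collection_download_textfilter_path(@user.id, @collection.id,
                                         @collection.account)
```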
