
Add textfilter background job; resolves #197.

- Add textfilter_job
- Update Graphpass job to chain to the textfilter job (sketched below)
- Update Collections Controller for textfilter download link
- Update routes for textfilter download path
- Update Collections show view to add download link
- Update rubocop config
- Update README
ruebot committed Nov 2, 2018
1 parent 9b68867 commit a0be87588dab36afdd7495e0baef76ed36d9d9e0
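
In short: GraphpassJob now ends by enqueuing TextfilterJob, and the notification mail plus the one-day-delayed CleanupJob move from GraphpassJob's `after_perform` into TextfilterJob's, so they only fire once the whole chain has finished. A minimal sketch of the resulting flow (method bodies elided; the real code is in the hunks below):

```
class GraphpassJob < ApplicationJob
  queue_as :graphpass

  after_perform do
    # now only closes out the Dashboard row for this job
  end

  def perform(user_id, collection_id)
    # ...graph derivative work, full-text combine...
    TextfilterJob.set(queue: :textfilter)
                 .perform_later(user_id, collection_id)
  end
end

class TextfilterJob < ApplicationJob
  queue_as :textfilter

  after_perform do |job|
    # notification mail and the 1-day-delayed CleanupJob live here now,
    # so they run only after the last job in the chain completes
  end

  def perform(user_id, collection_id)
    # ...filter full text by top domains, zip the results...
  end
end
```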
.rubocop.yml
@@ -16,15 +16,17 @@ Metrics/AbcSize:
     - app/helpers/collections_helper.rb
     - app/helpers/dashboards_helper.rb
     - app/helpers/wasapi_files_helper.rb
+    - app/jobs/cleanup_job.rb
     - app/jobs/graphpass_job.rb
     - app/jobs/spark_job.rb
-    - app/jobs/cleanup_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb

 Metrics/BlockLength:
   Exclude:
     - app/jobs/spark_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb
     - config/environments/development.rb

@@ -39,9 +41,10 @@ Metrics/CyclomaticComplexity:
 Metrics/LineLength:
   Exclude:
     - app/helpers/users_helper.rb
+    - app/jobs/cleanup_job.rb
     - app/jobs/graphpass_job.rb
     - app/jobs/spark_job.rb
-    - app/jobs/cleanup_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_seed_job.rb
     - app/mailers/user_mailer.rb
     - test/controllers/dashboards_controller_test.rb

@@ -55,9 +58,10 @@ Metrics/MethodLength:
     - app/helpers/collections_helper.rb
     - app/helpers/dashboards_helper.rb
     - app/helpers/wasapi_files_helper.rb
+    - app/jobs/cleanup_job.rb
     - app/jobs/graphpass_job.rb
     - app/jobs/spark_job.rb
-    - app/jobs/cleanup_job.rb
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb

@@ -119,6 +123,7 @@ Style/NegatedIf:
 Style/Next:
   Exclude:
+    - app/jobs/textfilter_job.rb
     - app/jobs/wasapi_download_job.rb
     - app/jobs/wasapi_seed_job.rb
README.md
@@ -50,7 +50,7 @@ bundle exec rake jobs:work
 Or to simulate production environment with Delayed::Job:
 ```
-bin/delayed_job --pool=spark,tasks:1 --pool=graphpass,tasks:1 --pool=seed,tasks:10 --pool=download,tasks:4 --pool=cleanup,tasks:2 start
+bin/delayed_job --pool=spark,tasks:1 --pool=graphpass,tasks:1 --pool=seed,tasks:10 --pool=download,tasks:4 --pool=cleanup,tasks:2 --pool=textfilter,tasks:2 start
 ```
 Then visit http://localhost:3000.
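
Each `--pool` flag spawns Delayed::Job workers for the queues listed before the colon (here each named queue plus the shared `tasks` queue), with the worker count after it. Jobs land in the new `textfilter` pool because they are enqueued onto that queue explicitly, as in the GraphpassJob hunk further down:

```
# Route the job to the :textfilter pool at enqueue time:
TextfilterJob.set(queue: :textfilter)
             .perform_later(user_id, collection_id)
```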
app/controllers/collections_controller.rb
@@ -9,8 +9,10 @@ class CollectionsController < ApplicationController
   before_action :graphml_path, only: %i[download_graphml]
   before_action :domains_path, only: %i[download_domains]
   before_action :fulltext_path, only: %i[download_fulltext]
+  before_action :textfilter_path, only: %i[download_textfilter]
   before_action :correct_user, only: %i[show download download_gexf
-                                        download_fulltext download_domains]
+                                        download_fulltext download_domains
+                                        download_textfilter]

   def download
     WasapiDownloadJob.set(queue: :download)

@@ -45,6 +47,13 @@ def download_fulltext
     )
   end

+  def download_textfilter
+    send_file(
+      @textfilter_path,
+      type: 'application/zip'
+    )
+  end
+
   def download_domains
     send_file(
       @domains_path,

@@ -100,6 +109,13 @@ def fulltext_path
                      '-fulltext.txt'
   end

+  def textfilter_path
+    @textfilter_path = ENV['DOWNLOAD_PATH'] + '/' + params[:format].to_s + '/' +
+                       params[:collection_id].to_s + '/' +
+                       params[:user_id].to_s + '/derivatives/filtered-text/' +
+                       params[:collection_id].to_s + '-filtered_text.zip'
+  end
+
   def correct_user
     @user = User.find(params[:user_id])
     redirect_to(root_url) unless current_user?(@user)
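
A quick integration test for the new action could look like the sketch below; the fixture, the auth helper, and the on-disk zip are all hypothetical, not part of this commit:

```
require 'test_helper'

# Hypothetical Minitest sketch -- fixture and helper names are invented.
class CollectionsControllerTest < ActionDispatch::IntegrationTest
  test 'download_textfilter streams the filtered-text zip' do
    user = users(:one)   # hypothetical fixture
    sign_in_as(user)     # hypothetical auth helper
    get user_collection_download_textfilter_path(user.id, 1, 'account-id')
    assert_response :success
    assert_equal 'application/zip', response.content_type
  end
end
```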
app/helpers/collections_helper.rb
@@ -21,6 +21,16 @@ def display_domains(user_id, collection_id, account)
     end
   end

+  def textfilter_path(user_id, collection_id, account)
+    collection_path = ENV['DOWNLOAD_PATH'] +
+                      '/' + account.to_s +
+                      '/' + collection_id.to_s + '/'
+    collection_derivatives = collection_path +
+                             user_id.to_s + '/derivatives'
+    collection_derivatives + '/filtered-text/' + collection_id.to_s +
+      '-filtered_text.zip'
+  end
+
   def gexf_path(user_id, collection_id, account)
     collection_path = ENV['DOWNLOAD_PATH'] +
                       '/' + account.to_s +
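
The helper rebuilds exactly the path the controller builds, just from explicit arguments instead of `params`; the show view uses it for its existence check. Keeping the two in sync by hand is fragile, so one possible refactor (hypothetical, not in this commit) is a single builder both call sites delegate to, with `File.join` in place of `'+'` concatenation:

```
# Hypothetical shared path builder for controller and helper:
module DerivativePaths
  def self.filtered_text_zip(user_id, collection_id, account)
    File.join(ENV['DOWNLOAD_PATH'], account.to_s, collection_id.to_s,
              user_id.to_s, 'derivatives', 'filtered-text',
              "#{collection_id}-filtered_text.zip")
  end
end
```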
app/jobs/graphpass_job.rb
@@ -4,11 +4,7 @@
 class GraphpassJob < ApplicationJob
   queue_as :graphpass

-  after_perform do |job|
-    UserMailer.notify_collection_analyzed(job.arguments.first,
-                                          job.arguments.second).deliver_now
-    CleanupJob.set(wait: 1.day).perform_later(job.arguments.first,
-                                              job.arguments.second)
+  after_perform do
     update_dashboard = Dashboard.find_by(job_id: job_id)
     update_dashboard.end_time = DateTime.now.utc
     update_dashboard.save

@@ -40,6 +36,8 @@ def perform(user_id, collection_id)
       logger.info 'Executing: ' + combine_full_text_output_cmd
       system(combine_full_text_output_cmd)
       FileUtils.rm_rf(collection_derivatives + '/all-text/output')
+      TextfilterJob.set(queue: :textfilter)
+                   .perform_later(user_id, collection_id)
     end
   end
 end
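
Note that `perform_later` only enqueues: GraphpassJob's `perform` returns immediately and a worker from the `:textfilter` pool picks the job up. Chaining at the very end of `perform` also means the combined full-text derivative is already on disk before TextfilterJob reads it. For debugging without workers, ActiveJob can run the same job inline:

```
# Enqueue for a :textfilter pool worker (what the commit does):
TextfilterJob.set(queue: :textfilter).perform_later(user_id, collection_id)

# Run synchronously in the current process (handy in a console):
TextfilterJob.perform_now(user_id, collection_id)
```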
app/jobs/textfilter_job.rb
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+# Methods for Basic Spark Jobs.
+class TextfilterJob < ApplicationJob
+  require 'csv'
+  queue_as :textfilter
+
+  after_perform do |job|
+    UserMailer.notify_collection_analyzed(job.arguments.first,
+                                          job.arguments.second).deliver_now
+    CleanupJob.set(wait: 1.day).perform_later(job.arguments.first,
+                                              job.arguments.second)
+    update_dashboard = Dashboard.find_by(job_id: job_id)
+    update_dashboard.end_time = DateTime.now.utc
+    update_dashboard.save
+  end
+
+  def perform(user_id, collection_id)
+    Dashboard.find_or_create_by!(
+      job_id: job_id,
+      user_id: user_id,
+      collection_id: collection_id,
+      queue: 'textfilter',
+      start_time: DateTime.now.utc
+    )
+
+    Collection.where('user_id = ? AND collection_id = ?', user_id, collection_id).each do |c|
+      collection_path = ENV['DOWNLOAD_PATH'] +
+                        '/' + c.account.to_s +
+                        '/' + c.collection_id.to_s + '/'
+      collection_derivatives = collection_path + c.user_id.to_s + '/derivatives'
+      collection_domains = collection_derivatives + '/all-domains/' +
+                           c.collection_id.to_s + '-fullurls.txt'
+      collection_fulltext = collection_derivatives + '/all-text/' +
+                            c.collection_id.to_s + '-fulltext.txt'
+      collection_filtered_text_path = collection_derivatives + '/filtered-text'
+      FileUtils.mkdir_p collection_filtered_text_path
+      unless File.zero?(collection_domains) || !File.file?(collection_domains)
+        text = File.open(collection_domains).read
+        csv_text = text.delete! '()'
+        csv = CSV.parse(csv_text, headers: false)
+        csv.take(10).each do |row|
+          # THIS IS UGLY.
+          # WE PROBABLY SHOULDN'T EXEC OUT TO GREP AND ZIP.
+          domain_textfilter = collection_filtered_text_path + '/' +
+                              collection_id.to_s + '-' + row[0].parameterize +
+                              '.txt'
+          grep_query = "'," + row[0] + ",'"
+          grep_command = '-a ' + grep_query + ' ' + collection_fulltext +
+                         ' > ' + domain_textfilter
+          `grep #{grep_command}`
+          filtered_text_zip = collection_filtered_text_path + '/' +
+                              collection_id.to_s + '-filtered_text.zip'
+          zip_command = '-j ' + filtered_text_zip + ' ' +
+                        collection_filtered_text_path + '/*.txt'
+          `zip #{zip_command}`
+        end
+      end
+    end
+  end
+end
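
The commit's own comment flags the shell-out to `grep` and `zip`. A pure-Ruby alternative, assuming the rubyzip gem were added as a dependency (it is not part of this commit), could look like:

```
require 'zip' # rubyzip gem -- a hypothetical added dependency

# Write the lines of the full-text file that mention a domain, without grep.
def filter_fulltext(fulltext_path, domain, out_path)
  File.open(out_path, 'w') do |out|
    File.foreach(fulltext_path) do |line|
      out.write(line) if line.include?(",#{domain},")
    end
  end
end

# Bundle the per-domain text files into one zip, without the zip binary.
def zip_filtered_text(zip_path, txt_paths)
  Zip::File.open(zip_path, Zip::File::CREATE) do |zipfile|
    txt_paths.each { |txt| zipfile.add(File.basename(txt), txt) }
  end
end
```

This also lifts the zip step out of the per-domain loop; as committed, the archive is rebuilt once per domain.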
app/views/collections/show.html.erb
@@ -81,7 +81,10 @@
       <%= link_to('Domains', user_collection_download_domains_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A csv file that explains the distribution of domains within the web archive.') %>
     <% end %>
     <% unless !File.exists? fulltext_path(@user.id, @collection.id, @collection.account) %>
-      <%= link_to('Full Text', user_collection_download_fulltext_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A txt file that contains the plain text extracted from HTML documents within the web archive. You can find the crawl date, full URL, and the plain text of each page within the file.') %>
+      <%= link_to('Full Text', user_collection_download_fulltext_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A text file that contains the plain text extracted from HTML documents within the web archive. You can find the crawl date, full URL, and the plain text of each page within the file.') %>
     <% end %>
+    <% unless !File.exists? textfilter_path(@user.id, @collection.id, @collection.account) %>
+      <%= link_to('Text by Domains', user_collection_download_textfilter_path(@user.id, @collection.id, @collection.account), class: 'btn btn-primary', 'data-toggle': 'tooltip', 'data-placement': 'top', title: 'A zip file that contains the text of the top ten domains within a web archive, each within their own text file. Within the files you can find the crawl date, full URL, and the plain text of each page within the file.') %>
+    <% end %>
     <p><small><%=link_to("You can find information about how to use these files here.", "/derivatives")%></small></p>
   </div>
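
Two small readability notes on the guard the new block copies from the lines above it: `File.exists?` is the deprecated alias of `File.exist?`, and `unless !...` is a double negative. An equivalent positive guard (a sketch, not part of the commit):

```
# Same check as 'unless !File.exists? ...', stated positively:
if File.exist? textfilter_path(@user.id, @collection.id, @collection.account)
  # render the 'Text by Domains' button
end
```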
config/routes.rb
@@ -12,6 +12,7 @@
         get :download_gexf
         get :download_graphml
         get :download_fulltext
+        get :download_textfilter
       end
     end
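
This member route generates the `user_collection_download_textfilter_path` helper used in the show view above. The third argument supplied there (`@collection.account`) becomes the URL's trailing format segment, which is why the controller's `textfilter_path` reads the account out of `params[:format]`:

```
# As called from the show view; the account id rides in as params[:format]:
user_collection_download_textfilter_path(@user.id, @collection.id,
                                         @collection.account)
```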
