
Background job for basic analysis; Resolves archivesunleashed#28

- sets up an ugly background job for basic analysis
- frames out mailers (I'll come back to this)
- updates RuboCop config because this is so ugly
- updates .gitignore because we're running Spark now
- updates README
ruebot committed Feb 27, 2018
1 parent 67a7f0f commit 1e43a20e38f3b3ac467b3342e4ae4f15a3ffbaaa
.gitignore
@@ -26,3 +26,6 @@ vendor/cache
 .env
 *.swp
 coverage
+
+derby.log
+metastore_db
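(Running `spark-shell` typically spins up Spark's embedded Derby metastore, which drops a `derby.log` file and a `metastore_db/` directory into the working directory; hence the two new ignores.)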
.rubocop.yml
@@ -12,23 +12,27 @@ AllCops:
 
 Metrics/AbcSize:
   Exclude:
+    - app/jobs/collections_spark_job.rb
     - app/jobs/wasapi_files_populate_job.rb
     - app/jobs/wasapi_files_download_job.rb
 
 Metrics/BlockLength:
   Exclude:
+    - app/jobs/collections_spark_job.rb
     - app/jobs/wasapi_files_download_job.rb
     - app/jobs/wasapi_files_populate_job.rb
 
 Metrics/MethodLength:
   Exclude:
+    - app/jobs/collections_spark_job.rb
     - app/jobs/wasapi_files_populate_job.rb
     - app/controllers/collections_controller.rb
     - app/jobs/wasapi_files_download_job.rb
 
 Metrics/LineLength:
   Exclude:
     - app/helpers/users_helper.rb
+    - app/jobs/collections_spark_job.rb
     - app/jobs/wasapi_files_populate_job.rb
 
 Rails:
README.md
@@ -39,7 +39,16 @@ bundle exec rake jobs:work
 
 Then visit http://localhost:3000.
 
-**N.B.** This application makes use of the [dotenv-rails](https://github.com/bkeepers/dotenv) gem. You will need a `.env` file in the root of the application with `TWITTER_KEY`, `TWITTER_SECRET`, `GITHUB_KEY`, and `GITHUB_SECRET` set in order to login. You will also need a 32bit `WASAPI_KEY` declared in that file for Archive-It credential encryption,and base path set for downloads `DOWNLOAD_PATH`.
+**N.B.** This application makes use of the [dotenv-rails](https://github.com/bkeepers/dotenv) gem.
+
+You will need a `.env` file in the root of the application with:
+* `TWITTER_KEY`
+* `TWITTER_SECRET`
+* `GITHUB_KEY`
+* `GITHUB_SECRET`
+* A 32-bit `WASAPI_KEY` declared in that file for Archive-It credential encryption
+* `DOWNLOAD_PATH` set to the base path for downloads
+* `SPARK_SHELL` set to the path to `spark-shell`
 
 ### Run a console
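Referring back to the `.env` checklist above, a sample file might look like the following. All values are placeholders, not real credentials, and the paths are illustrative only:

```
TWITTER_KEY=your-twitter-consumer-key
TWITTER_SECRET=your-twitter-consumer-secret
GITHUB_KEY=your-github-client-id
GITHUB_SECRET=your-github-client-secret
WASAPI_KEY=0123456789abcdef0123456789abcdef
DOWNLOAD_PATH=/data/auk/downloads
SPARK_SHELL=/usr/local/spark/bin/spark-shell
```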

app/controllers/collections_controller.rb
@@ -10,6 +10,7 @@ def index; end
 
   def download
     WasapiFilesDownloadJob.perform_later(@user, @collection_id)
+    CollectionsSparkJob.perform_later(@user, @collection_id)
   end
 
   def show; end
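Note that `download` enqueues both jobs back to back, so with more than one worker the Spark job can start before the WARC downloads finish. One possible way to serialize them (not what this commit does) would be to enqueue the Spark job from the download job's `after_perform` callback and have the controller enqueue only the download job:

```ruby
# Hypothetical reordering, not part of this commit: chain the Spark job
# off the download job so it only runs once the WARCs are on disk.
class WasapiFilesDownloadJob < ApplicationJob
  queue_as :default

  # ActiveJob hands the finished job instance to the block; job.arguments
  # is the same (user, collection_id) pair the controller passed in.
  after_perform do |job|
    CollectionsSparkJob.perform_later(*job.arguments)
  end
end
```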
app/jobs/collections_spark_job.rb
@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+
+# Methods for Basic Spark Jobs.
+class CollectionsSparkJob < ApplicationJob
+  queue_as :default
+  require 'open-uri'
+
+  # ActiveJob callbacks are declared with a class-level block, not an
+  # instance method; the mailer call stays a TODO until UserMailer is built.
+  after_perform do |_job|
+    # TODO: UserMailer.notify_collection_downloaded(...)
+  end
+
+  # user_id is actually a User record, passed in from CollectionsController.
+  def perform(user_id, collection_id)
+    spark_shell = ENV['SPARK_SHELL']
+    WasapiFile.where('user_id = ? AND collection_id = ?', user_id, collection_id).each do |c|
+      collection_path = ENV['DOWNLOAD_PATH'] +
+                        '/' + c.account.to_s +
+                        '/' + c.collection_id.to_s + '/'
+      collection_warcs = collection_path + 'warcs/*.gz'
+      collection_derivatives = collection_path + 'derivatives'
+      collection_spark_jobs_path = collection_path + 'spark_jobs'
+      collection_spark_job_file = collection_spark_jobs_path + '/' + c.collection_id.to_s + '.scala'
+      FileUtils.mkdir_p collection_derivatives
+      FileUtils.mkdir_p collection_spark_jobs_path
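+      # Generate a per-collection Scala script for spark-shell to run. Note
+      # the escaping: each doubled backslash in the %() literal below lands
+      # in the .scala file halved, so Scala ultimately compiles the regex
+      # escapes \s and \. as intended when trimming "www." prefixes.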
+      spark_job = %(
+        import io.archivesunleashed.spark.matchbox.{ExtractDomain, ExtractLinks, RemoveHTML, RecordLoader, WriteGEXF}
+        import io.archivesunleashed.spark.rdd.RecordRDD._
+        sc.setLogLevel("INFO")
+        RecordLoader.loadArchives("#{collection_warcs}", sc).keepValidPages().map(r => ExtractDomain(r.getUrl)).countItems().saveAsTextFile("#{collection_derivatives}/all-domains")
+        RecordLoader.loadArchives("#{collection_warcs}", sc).keepValidPages().map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString))).saveAsTextFile("#{collection_derivatives}/all-text")
+        val links = RecordLoader.loadArchives("#{collection_warcs}", sc).keepValidPages().map(r => (r.getCrawlDate, ExtractLinks(r.getUrl, r.getContentString))).flatMap(r => r._2.map(f => (r._1, ExtractDomain(f._1).replaceAll("^\\\\s*www\\\\.", ""), ExtractDomain(f._2).replaceAll("^\\\\s*www\\\\.", "")))).filter(r => r._2 != "" && r._3 != "").countItems().filter(r => r._2 > 5)
+        WriteGEXF(links, "#{collection_derivatives}/links-for-gephi.gexf")
+        sys.exit
+      )
+      File.open(collection_spark_job_file, 'w') { |file| file.write(spark_job) }
+      cmd = spark_shell + ' --master local[12] --driver-memory 5G --conf spark.network.timeout=10000000 --packages "io.archivesunleashed:aut:0.12.1" -i ' + collection_spark_job_file + ' | tee ' + collection_spark_job_file + '.log'
+      logger.info '[INFO] Executing: ' + cmd
+      system(cmd)
+    end
+  end
+end
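A side note on the `cmd` string above: because it is assembled by string concatenation and handed to the shell, a `DOWNLOAD_PATH` containing spaces or shell metacharacters would break the invocation (or be interpreted by the shell). A minimal sketch of a sturdier variant, trading the `tee` for a plain log redirect:

```ruby
# Sketch only, not part of this commit: pass spark-shell an argument vector
# instead of one big string, so Kernel#system skips the shell entirely and
# paths survive verbatim.
cmd = [
  spark_shell,
  '--master', 'local[12]',
  '--driver-memory', '5G',
  '--conf', 'spark.network.timeout=10000000',
  '--packages', 'io.archivesunleashed:aut:0.12.1',
  '-i', collection_spark_job_file
]
logger.info "[INFO] Executing: #{cmd.join(' ')}"
# :out redirects the child's stdout straight into the log file.
system(*cmd, out: "#{collection_spark_job_file}.log")
```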
app/jobs/wasapi_files_download_job.rb
@@ -5,9 +5,14 @@ class WasapiFilesDownloadJob < ApplicationJob
   queue_as :default
   require 'open-uri'
 
+  # Block-style ActiveJob callback; the mailer call stays a TODO until
+  # UserMailer is built out.
+  after_perform do |_job|
+    # TODO: UserMailer.notify_collection_downloaded(...)
+  end
+
   def perform(user_id, collection_id)
     wasapi_username = user_id.wasapi_username
     wasapi_password = user_id.wasapi_password
+    logger.debug user_id
     download_files = WasapiFile.where('user_id = ? AND collection_id = ?',
                                       user_id, collection_id)
     Parallel.each(download_files, in_threads: 5) do |wasapi_file|
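An aside on that last line: `Parallel.each(download_files, in_threads: 5)` (from the parallel gem) runs the download block in up to five concurrent threads rather than forked processes, a sensible fit for IO-bound WASAPI fetches.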
app/jobs/wasapi_files_populate_job.rb
@@ -8,6 +8,10 @@ class WasapiFilesPopulateJob < ApplicationJob
   WASAPI_BASE_URL = 'https://partner.archive-it.org/wasapi/v1/webdata'
   AI_COLLECTION_API_URL = 'https://partner.archive-it.org/api/collection/'
 
+  # Block-style ActiveJob callback; a TODO until UserMailer is built out.
+  after_perform do |_job|
+    # TODO: UserMailer.notify_collection_setup(...)
+  end
+
   def perform(user)
     wasapi_request = HTTP.basic_auth(user: user.wasapi_username,
                                      pass: user.wasapi_password)
app/mailers/user_mailer.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+
+# Methods for User Mailer
+class UserMailer < ApplicationMailer
+  default from: 'notifications@archivesunleashed.org'
+end
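The jobs above already reference `notify_collection_downloaded` and `notify_collection_setup`, which this framed-out class doesn't define yet ("I'll come back to this"). A hypothetical sketch of what they might eventually look like, assuming `User` exposes an `email` attribute; each method would also need a matching view under `app/views/user_mailer/`:

```ruby
# Hypothetical only: these methods are not part of this commit.
class UserMailer < ApplicationMailer
  default from: 'notifications@archivesunleashed.org'

  # Notify a user that their collection has been registered.
  def notify_collection_setup(user)
    @user = user
    mail(to: @user.email, subject: 'Your Archive-It collection is set up')
  end

  # Notify a user that their WARCs have finished downloading.
  def notify_collection_downloaded(user)
    @user = user
    mail(to: @user.email, subject: 'Your Archive-It collection has downloaded')
  end
end
```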
test/mailers/previews/user_mailer_preview.rb
@@ -0,0 +1,5 @@
+# frozen_string_literal: true
+
+# Preview all emails at http://localhost:3000/rails/mailers/user_mailer
+class UserMailerPreview < ActionMailer::Preview
+end
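Once the mailer methods exist, a preview could exercise them at the URL in the comment above. A sketch, assuming the hypothetical `notify_collection_setup` from earlier and a seeded user in the development database:

```ruby
# Hypothetical preview; relies on the sketched mailer method above.
class UserMailerPreview < ActionMailer::Preview
  def notify_collection_setup
    UserMailer.notify_collection_setup(User.first)
  end
end
```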
test/mailers/user_mailer_test.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+require 'test_helper'
+
+class UserMailerTest < ActionMailer::TestCase
+  # test "the truth" do
+  #   assert true
+  # end
+end
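A first real test to replace the scaffold comment might look like this, again assuming the hypothetical `notify_collection_setup` method and a `users(:one)` fixture:

```ruby
# Hypothetical test; assert_emails ships with ActionMailer::TestCase.
require 'test_helper'

class UserMailerTest < ActionMailer::TestCase
  test 'notify_collection_setup sends one email from the default address' do
    email = UserMailer.notify_collection_setup(users(:one))
    assert_emails(1) { email.deliver_now }
    assert_equal ['notifications@archivesunleashed.org'], email.from
  end
end
```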
