|
@@ -29,6 +29,10 @@ import org.scalatest.{BeforeAndAfter, FunSuite} |
|
|
|
@RunWith(classOf[JUnitRunner]) |
|
|
|
class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { |
|
|
|
// Fixture locations, each resolved with Resources.getResource(...).getPath.
// NOTE(review): `Resources` is imported outside this view (looks like Guava's
// classpath resource helper) -- confirm.

// ARC fixture; fed to extractHyperlinks, extractImageLinks and extractImages.
private val arcPath = Resources.getResource("arc/example.arc.gz").getPath |
|
|
|
// Media WARC fixture; fed to extractAudio and extractVideo.
private val mediaPath = Resources.getResource("warc/example.media.warc.gz").getPath |
|
|
|
// Documents WARC fixture; fed to extractSpreadsheets, extractPresentationProgram
// and extractWordProcessor.
private val docPath = Resources.getResource("warc/example.docs.warc.gz").getPath |
|
|
|
// Text WARC fixture; fed to extractTextFiles.
private val txtPath = Resources.getResource("warc/example.txt.warc.gz").getPath |
|
|
|
// PDF WARC fixture; fed to extractPDFs.
private val pdfPath = Resources.getResource("warc/example.pdf.warc.gz").getPath |
|
|
|
// Presumably the Spark master setting used when the context is created (the
// setup code is outside this view -- confirm). In Spark's master-URL notation
// "local[4]" means local mode with 4 worker threads.
private val master = "local[4]" |
|
|
|
// Presumably the Spark application name used by the same setup code -- confirm.
private val appName = "example-df" |
|
|
|
// SparkContext handle; `= _` leaves it at the default value (null) until it is
// assigned. The assignment is not visible here (presumably in a `before` hook).
private var sc: SparkContext = _ |
|
@@ -49,6 +53,13 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { |
|
|
|
// Run each extractor on `df` (defined outside this view) against its fixture.
// The visible assertions further down read the first row of images, pdfs,
// audio, video, spreadsheets, powerpoint, word and text.

// Extractors run against the ARC fixture.
val hyperlinks = df.extractHyperlinks(arcPath) |
|
|
|
val imageLinks = df.extractImageLinks(arcPath) |
|
|
|
val images = df.extractImages(arcPath) |
|
|
|
// PDF extractor, run against the PDF WARC fixture.
val pdfs = df.extractPDFs(pdfPath) |
|
|
|
// Audio and video extractors share the media WARC fixture.
val audio = df.extractAudio(mediaPath) |
|
|
|
val video = df.extractVideo(mediaPath) |
|
|
|
// Spreadsheet, presentation and word-processor extractors share the documents
// WARC fixture.
val spreadsheets = df.extractSpreadsheets(docPath) |
|
|
|
val powerpoint = df.extractPresentationProgram(docPath) |
|
|
|
val word = df.extractWordProcessor(docPath) |
|
|
|
// Text-file extractor, run against the text WARC fixture.
val text = df.extractTextFiles(txtPath) |
|
|
|
|
|
|
|
// First row of `validPages` after selecting `url` and `mime_type` (all three
// names are defined outside this view). Indexing with (0) assumes at least
// one row came back.
val r_1 = validPages.select(url, mime_type).take(1)(0) |
|
|
|
// The first valid page is expected to be the archive.org front page.
assert(r_1.getAs[String](url) == "http://www.archive.org/") |
|
@@ -65,6 +76,34 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { |
|
|
|
// First row of the image extraction; (0) assumes the result is non-empty.
val r_4 = images.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values of that first image row.
assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg") |
|
|
|
assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24") |
|
|
|
|
|
|
|
// First row of the PDF extraction; (0) assumes the result is non-empty.
val r_5 = pdfs.take(1)(0) |
|
|
|
// Expected `url` (query string included) and `md5` field values of that row.
assert(r_5.getAs[String](url) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y") |
|
|
|
assert(r_5.getAs[String](md5) == "aaba59d2287afd40c996488a39bbc0dd") |
|
|
|
|
|
|
|
// First row of the audio extraction; (0) assumes the result is non-empty.
val r_6 = audio.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values of that first audio row.
assert(r_6.getAs[String](url) == "https://ruebot.net/files/feniz.mp3") |
|
|
|
assert(r_6.getAs[String](md5) == "f7e7ec84b12c294e19af1ba41732c733") |
|
|
|
|
|
|
|
// First row of the video extraction; (0) assumes the result is non-empty.
val r_7 = video.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values; the URL is compared in its
// percent-encoded form (%20), exactly as written here.
assert(r_7.getAs[String](url) == "https://ruebot.net/2018-11-12%2016.14.11.mp4") |
|
|
|
assert(r_7.getAs[String](md5) == "2cde7de3213a87269957033f6315fce2") |
|
|
|
|
|
|
|
// First row of the spreadsheet extraction; (0) assumes the result is non-empty.
val r_8 = spreadsheets.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values of that first spreadsheet row.
assert(r_8.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods") |
|
|
|
assert(r_8.getAs[String](md5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") |
|
|
|
|
|
|
|
// First row of the presentation extraction; (0) assumes the result is non-empty.
val r_9 = powerpoint.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values of that first presentation row.
assert(r_9.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx") |
|
|
|
assert(r_9.getAs[String](md5) == "7a7b1fe4b6d311376eaced9de3b682ee") |
|
|
|
|
|
|
|
// First row of the word-processor extraction; (0) assumes the result is non-empty.
val r_10 = word.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values of that first word-processor row.
assert(r_10.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf") |
|
|
|
assert(r_10.getAs[String](md5) == "e483512b65ba44d71e843c57de2adeb7") |
|
|
|
|
|
|
|
// First row of the text-file extraction; (0) assumes the result is non-empty.
val r_11 = text.take(1)(0) |
|
|
|
// Expected `url` and `md5` field values of that first text-file row.
assert(r_11.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/aut-text.txt") |
|
|
|
assert(r_11.getAs[String](md5) == "32abd404fb560ecf14b75611f3cc5c2c") |
|
|
|
} |
|
|
|
|
|
|
|
after { |
|
|
0 comments on commit
bced854