Skip to content
Permalink
Browse files

Improve test coverage. (#354)

- Add tests for a few more filters in RecordLoader
- Add binary extraction DataFrameLoader tests
  • Loading branch information...
ruebot authored and ianmilligan1 committed Aug 22, 2019
1 parent 4313174 commit bced85497759a4be801843658d951331ec029655
@@ -117,6 +117,26 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
assert (r2.sameElements(r))
}

// Filters the example ARC with keepMimeTypesTika for text/plain and image/jpeg,
// then checks the URLs of the first three surviving records.
test ("keep mime tika") {
  val wantedMimeTypes = Set("text/plain", "image/jpeg")
  val expectedUrls = Array(
    "dns:www.archive.org",
    "http://www.archive.org/robots.txt",
    "http://www.archive.org/images/logoc.jpg")

  val keptUrls = RecordLoader.loadArchives(arcPath, sc)
    .keepMimeTypesTika(wantedMimeTypes)
    .map(_.getUrl)
    .take(3)

  // Arrays compare by reference, so compare their deep views instead.
  assert (keptUrls.deep == expectedUrls.deep)
}

// Filters the example ARC with keepMimeTypes for text/plain and image/jpeg,
// then checks the URLs of the first three surviving records.
// NOTE(review): per the test name this presumably uses the MIME type reported
// by the web server rather than a detected one; confirm against RecordLoader.
test ("keep mime web server") {
  val wantedMimeTypes = Set("text/plain", "image/jpeg")
  val expectedUrls = Array(
    "filedesc://IAH-20080430204825-00000-blackbook.arc",
    "http://www.archive.org/robots.txt",
    "http://www.archive.org/images/logoc.jpg")

  val keptUrls = RecordLoader.loadArchives(arcPath, sc)
    .keepMimeTypes(wantedMimeTypes)
    .map(_.getUrl)
    .take(3)

  // Arrays compare by reference, so compare their deep views instead.
  assert (keptUrls.deep == expectedUrls.deep)
}

test ("check for keep content"){
val expected = 1
val base = RecordLoader.loadArchives(arcPath, sc)
@@ -128,12 +148,22 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
assert (y1 == expected)
}

// Drops text/plain and image/jpeg records with discardMimeTypes, then checks
// the URLs of the first three remaining records. `archive` is a URL constant
// defined elsewhere in this suite.
// The superseded header `test ("discard mime")` and its duplicate one-line
// assertion are removed: the old header opened a block that was never closed,
// and the assertion repeated the check below verbatim.
test ("discard mime web server") {
  val base = RecordLoader.loadArchives(arcPath, sc)
  val mime = Set ("text/plain", "image/jpeg")
  val r2 = base.discardMimeTypes(mime)
    .map (mp => mp.getUrl).take(3)
  // Arrays compare by reference, so compare their deep views instead.
  assert (r2.deep == Array("dns:www.archive.org", archive,
    "http://www.archive.org/index.php").deep)
}

// Drops text/plain and image/jpeg records with discardMimeTypesTika, then
// checks the URLs of the first three remaining records.
test ("discard mime tika") {
  val unwantedMimeTypes = Set("text/plain", "image/jpeg")
  val expectedUrls = Array(
    "filedesc://IAH-20080430204825-00000-blackbook.arc",
    "http://www.archive.org/",
    "http://www.archive.org/index.php")

  val remainingUrls = RecordLoader.loadArchives(arcPath, sc)
    .discardMimeTypesTika(unwantedMimeTypes)
    .map(_.getUrl)
    .take(3)

  // Arrays compare by reference, so compare their deep views instead.
  assert (remainingUrls.deep == expectedUrls.deep)
}

test ("discard date") {
@@ -29,6 +29,10 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
// Paths to the ARC/WARC fixtures, looked up by resource name.
private val arcPath: String = Resources.getResource("arc/example.arc.gz").getPath
private val mediaPath: String = Resources.getResource("warc/example.media.warc.gz").getPath
private val docPath: String = Resources.getResource("warc/example.docs.warc.gz").getPath
private val txtPath: String = Resources.getResource("warc/example.txt.warc.gz").getPath
private val pdfPath: String = Resources.getResource("warc/example.pdf.warc.gz").getPath

// Spark settings for this suite.
private val master: String = "local[4]"
private val appName: String = "example-df"

// Starts unset; presumably assigned in a setup hook not visible here (confirm).
private var sc: SparkContext = _
@@ -49,6 +53,13 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
val hyperlinks = df.extractHyperlinks(arcPath)
val imageLinks = df.extractImageLinks(arcPath)
val images = df.extractImages(arcPath)
val pdfs = df.extractPDFs(pdfPath)
val audio = df.extractAudio(mediaPath)
val video = df.extractVideo(mediaPath)
val spreadsheets = df.extractSpreadsheets(docPath)
val powerpoint = df.extractPresentationProgram(docPath)
val word = df.extractWordProcessor(docPath)
val text = df.extractTextFiles(txtPath)

val r_1 = validPages.select(url, mime_type).take(1)(0)
assert(r_1.getAs[String](url) == "http://www.archive.org/")
@@ -65,6 +76,34 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
val r_4 = images.take(1)(0)
assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg")
assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24")

val r_5 = pdfs.take(1)(0)
assert(r_5.getAs[String](url) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y")
assert(r_5.getAs[String](md5) == "aaba59d2287afd40c996488a39bbc0dd")

val r_6 = audio.take(1)(0)
assert(r_6.getAs[String](url) == "https://ruebot.net/files/feniz.mp3")
assert(r_6.getAs[String](md5) == "f7e7ec84b12c294e19af1ba41732c733")

val r_7 = video.take(1)(0)
assert(r_7.getAs[String](url) == "https://ruebot.net/2018-11-12%2016.14.11.mp4")
assert(r_7.getAs[String](md5) == "2cde7de3213a87269957033f6315fce2")

val r_8 = spreadsheets.take(1)(0)
assert(r_8.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods")
assert(r_8.getAs[String](md5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")

val r_9 = powerpoint.take(1)(0)
assert(r_9.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx")
assert(r_9.getAs[String](md5) == "7a7b1fe4b6d311376eaced9de3b682ee")

val r_10 = word.take(1)(0)
assert(r_10.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf")
assert(r_10.getAs[String](md5) == "e483512b65ba44d71e843c57de2adeb7")

val r_11 = text.take(1)(0)
assert(r_11.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/aut-text.txt")
assert(r_11.getAs[String](md5) == "32abd404fb560ecf14b75611f3cc5c2c")
}

after {
@@ -34,6 +34,7 @@ class TupleFormatterTest extends FunSuite with Matchers {
assert(TupleFormatter.tabDelimit(tuple) == "ab\tbl\tc\t9\td\t5\thi\t1")
assert(TupleFormatter.tabDelimit.isInstanceOf[Poly1])
}

test("just flatten") {
val tuple = ("an", 1, "cr", ("x", 3, ("NO", "YES")), "perhaps", "maybe", 3, (0,1))
val flatTuple = ("an", 1, "cr", "x", 3, "NO", "YES", "perhaps", "maybe", 3, 0, 1)

0 comments on commit bced854

Please sign in to comment.
You can’t perform that action at this time.