Skip to content
Permalink
Browse files

Update test coverage for data frames (#336).

- This commit will fall under @ruebot, but @jrwiebe did the heavy lifting here; see #336 for his commits before they were squashed down.
- Resolves #265
- Resolves #263
- Update Scaladocs
  • Loading branch information...
ruebot authored and ianmilligan1 committed Jul 31, 2019
1 parent 64c1f1f commit 605afcc6aeb6f55e08940ff0dfda75d89050fce1
@@ -47,8 +47,8 @@ package object df {
implicit class SaveImage(df: DataFrame) {
/**
* @param bytesColumnName the name of the column containing the image bytes
* @param fileName the name of the file to save the images to (without extension)
* e.g. fileName = "foo" => images are saved as foo0.jpg, foo1.jpg
* @param fileName the base name of the file to save the images to (without extension)
* e.g. fileName = "foo" => images are saved as foo-[md5 hash].jpg
*/
def saveToDisk(bytesColumnName: String, fileName: String): Unit = {
df.select(bytesColumnName).foreach(row => {
@@ -0,0 +1,73 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source toolkit for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.df

import io.archivesunleashed.DataFrameLoader
import com.google.common.io.Resources
// scalastyle:off underscore.import
import org.apache.spark.sql.functions._
// scalastyle:on underscore.import
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

@RunWith(classOf[JUnitRunner])
class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
  // Path to the sample ARC fixture bundled with the test resources.
  private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
  }

  /** Smoke-tests the four DataFrameLoader extraction entry points against the
    * example ARC: valid pages, hyperlinks, image links, and images. Each check
    * pins a known row from the fixture.
    */
  test("Test DataFrameLoader") {
    val loader = new DataFrameLoader(sc)
    val validPages = loader.extractValidPages(arcPath)
    val hyperlinks = loader.extractHyperlinks(arcPath)
    val imageLinks = loader.extractImageLinks(arcPath)
    val images = loader.extractImages(arcPath)

    val r_1 = validPages.select("url", "mime_type").take(1)(0)
    assert(r_1.getAs[String]("url") == "http://www.archive.org/")
    assert(r_1.getAs[String]("mime_type") == "text/html")

    // Use named accessors for consistency with the other row checks;
    // the column names come from the select above.
    val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2)
    assert(r_2.getAs[String]("Dest") == "http://web.archive.org/collections/web/advanced.html")
    assert(r_2.getAs[String]("Anchor") == "Advanced Search")

    // Column names of the image-links frame aren't selected here, so access
    // stays positional: (0) = page url, (1) = image url — per the fixture.
    val r_3 = imageLinks.take(100)(99)
    assert(r_3.get(0) == "http://www.archive.org/details/secretarmiesb00spivrich")
    assert(r_3.get(1) == "http://www.archive.org/images/star.png")

    val r_4 = images.take(1)(0)
    assert(r_4.getAs[String]("url") == "http://www.archive.org/images/logoc.jpg")
    assert(r_4.getAs[String]("md5") == "8211d1fbb9b03d8522a1ae378f9d1b24")
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }
}
@@ -46,13 +46,7 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(arcPath, sc)
.extractImageDetailsDF()

// We need this in order to use the $-notation
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on

val extracted = df.select($"url", $"mime_type", $"width", $"height", $"md5")
val extracted = df.select("url", "mime_type", "width", "height", "md5")
.orderBy(desc("md5")).head(2).toList
assert(extracted.size == 2)
assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
@@ -34,6 +34,9 @@ import java.io.{File, ByteArrayInputStream}
import javax.imageio.ImageIO
import java.util.Base64

/** Test fixture row for building image-detail DataFrames via `toDF`.
  * Field names double as the resulting column names; all values are kept as
  * strings, with `bytes` holding a Base64-encoded image payload.
  */
case class TestImageDetails(
  url: String,
  mime_type: String,
  width: String,
  height: String,
  md5: String,
  bytes: String)

@RunWith(classOf[JUnitRunner])
class SaveImageTest extends FunSuite with BeforeAndAfter {
private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
@@ -53,13 +56,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(arcPath, sc)
.extractImageDetailsDF()

// We need this in order to use the $-notation
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on

val extracted = df.select($"bytes")
val extracted = df.select("bytes")
.orderBy(desc(testString)).limit(1)
extracted.saveToDisk(testString, "/tmp/foo")

@@ -85,6 +82,27 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
Files.delete(Paths.get(fileName))
}

/** Verifies saveToDisk silently skips rows whose bytes are not a decodable
  * image: after the call, no file matching the expected name prefix exists.
  */
test("Attempt to save invalid image") {
  // Base64-encode a run of raw bytes that is not a valid image payload.
  val bogusPayload = Base64.getEncoder.encodeToString((0 until 127).map(_.toByte).toArray)
  val bogusHash = bogusPayload.computeHash()
  val record = TestImageDetails("http://example.com/fake.jpg", "image/jpeg",
    "600", "800", bogusHash, bogusPayload)

  // For toDF().
  val spark = SparkSession.builder().master("local").getOrCreate()
  // scalastyle:off
  import spark.implicits._
  // scalastyle:on
  val frame = Seq(record).toDF

  frame.saveToDisk("bytes", "/tmp/foo")

  // Check that no file was written.
  val tmpFiles = new File("/tmp").listFiles.filter(_.isFile).toList
  assert(tmpFiles.count(_.getName.startsWith("foo-" + bogusHash)) == 0)
}

after {
if (sc != null) {
sc.stop()

0 comments on commit 605afcc

Please sign in to comment.
You can’t perform that action at this time.