Skip to content
Permalink
Browse files

Update test coverage for data frames (#336).

- This commit will fall under @ruebot, but @jrwiebe did the heavy lifting here; see #336 for his commits before they were squashed down.
- Resolves #265
- Resolves #263
- Update Scaladocs
  • Loading branch information...
ruebot authored and ianmilligan1 committed Jul 31, 2019
1 parent 64c1f1f commit 605afcc6aeb6f55e08940ff0dfda75d89050fce1
@@ -47,8 +47,8 @@ package object df {
implicit class SaveImage(df: DataFrame) {
/**
* @param bytesColumnName the name of the column containing the image bytes
* @param fileName the name of the file to save the images to (without extension)
* e.g. fileName = "foo" => images are saved as foo0.jpg, foo1.jpg
* @param fileName the base name of the file to save the images to (without extension)
* e.g. fileName = "foo" => images are saved as foo-[md5 hash].jpg
*/
def saveToDisk(bytesColumnName: String, fileName: String): Unit = {
df.select(bytesColumnName).foreach(row => {
@@ -0,0 +1,73 @@
/*
* Archives Unleashed Toolkit (AUT):
* An open-source toolkit for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.df

import io.archivesunleashed.DataFrameLoader
import com.google.common.io.Resources
// scalastyle:off underscore.import
import org.apache.spark.sql.functions._
// scalastyle:on underscore.import
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}

@RunWith(classOf[JUnitRunner])
class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
  // Path to the sample ARC fixture bundled with the test resources.
  private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
  }

  /** Smoke-tests the four DataFrameLoader extraction entry points against the
    * example ARC: valid pages, hyperlinks, image links, and images. Each check
    * pins a known row from the fixture.
    */
  test("Test DataFrameLoader") {
    val loader = new DataFrameLoader(sc)
    val validPages = loader.extractValidPages(arcPath)
    val hyperlinks = loader.extractHyperlinks(arcPath)
    val imageLinks = loader.extractImageLinks(arcPath)
    val images = loader.extractImages(arcPath)

    val r_1 = validPages.select("url", "mime_type").take(1)(0)
    assert(r_1.getAs[String]("url") == "http://www.archive.org/")
    assert(r_1.getAs[String]("mime_type") == "text/html")

    // Use named accessors for consistency with the other row checks;
    // the column names come from the select above.
    val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2)
    assert(r_2.getAs[String]("Dest") == "http://web.archive.org/collections/web/advanced.html")
    assert(r_2.getAs[String]("Anchor") == "Advanced Search")

    // Column names of the image-links frame aren't selected here, so access
    // stays positional: (0) = page url, (1) = image url — per the fixture.
    val r_3 = imageLinks.take(100)(99)
    assert(r_3.get(0) == "http://www.archive.org/details/secretarmiesb00spivrich")
    assert(r_3.get(1) == "http://www.archive.org/images/star.png")

    val r_4 = images.take(1)(0)
    assert(r_4.getAs[String]("url") == "http://www.archive.org/images/logoc.jpg")
    assert(r_4.getAs[String]("md5") == "8211d1fbb9b03d8522a1ae378f9d1b24")
  }

  after {
    if (sc != null) {
      sc.stop()
    }
  }
}
@@ -46,13 +46,7 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(arcPath, sc)
.extractImageDetailsDF()

// We need this in order to use the $-notation
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on

val extracted = df.select($"url", $"mime_type", $"width", $"height", $"md5")
val extracted = df.select("url", "mime_type", "width", "height", "md5")
.orderBy(desc("md5")).head(2).toList
assert(extracted.size == 2)
assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
@@ -34,6 +34,9 @@ import java.io.{File, ByteArrayInputStream}
import javax.imageio.ImageIO
import java.util.Base64

/** Test fixture row for building image-detail DataFrames via `toDF`.
  * Field names double as the resulting column names; all values are kept as
  * strings, with `bytes` holding a Base64-encoded image payload.
  */
case class TestImageDetails(
  url: String,
  mime_type: String,
  width: String,
  height: String,
  md5: String,
  bytes: String)

@RunWith(classOf[JUnitRunner])
class SaveImageTest extends FunSuite with BeforeAndAfter {
private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
@@ -53,13 +56,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(arcPath, sc)
.extractImageDetailsDF()

// We need this in order to use the $-notation
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on

val extracted = df.select($"bytes")
val extracted = df.select("bytes")
.orderBy(desc(testString)).limit(1)
extracted.saveToDisk(testString, "/tmp/foo")

@@ -85,6 +82,27 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
Files.delete(Paths.get(fileName))
}

/** Verifies saveToDisk silently skips rows whose bytes are not a decodable
  * image: after the call, no file matching the expected name prefix exists.
  */
test("Attempt to save invalid image") {
  // Base64-encode a run of raw bytes that is not a valid image payload.
  val bogusPayload = Base64.getEncoder.encodeToString((0 until 127).map(_.toByte).toArray)
  val bogusHash = bogusPayload.computeHash()
  val record = TestImageDetails("http://example.com/fake.jpg", "image/jpeg",
    "600", "800", bogusHash, bogusPayload)

  // For toDF().
  val spark = SparkSession.builder().master("local").getOrCreate()
  // scalastyle:off
  import spark.implicits._
  // scalastyle:on
  val frame = Seq(record).toDF

  frame.saveToDisk("bytes", "/tmp/foo")

  // Check that no file was written.
  val tmpFiles = new File("/tmp").listFiles.filter(_.isFile).toList
  assert(tmpFiles.count(_.getName.startsWith("foo-" + bogusHash)) == 0)
}

after {
if (sc != null) {
sc.stop()

0 comments on commit 605afcc

Please sign in to comment.
You can’t perform that action at this time.