Skip to content
Permalink
Browse files

More scalastyle work; addresses #196. (#339)

- Remove all underscore imports, except shapeless
- Address all scalastyle warnings
- Update scalastyle config for magic numbers, and null (only used in
tests)
  • Loading branch information...
ruebot authored and ianmilligan1 committed Aug 8, 2019
1 parent 9623c7a commit b2d7394b2d67b6bf0c535db16683852b0f270aa6
@@ -67,15 +67,15 @@
<parameter name="maxParameters"><![CDATA[8]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MagicNumberChecker" level="warning" enabled="true">
<check class="org.scalastyle.scalariform.MagicNumberChecker" level="warning" enabled="false">
<parameters>
<parameter name="ignore"><![CDATA[-1,0,1,2,3]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ReturnChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NullChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NullChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.NoCloneChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoFinalizeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.CovariantEqualsChecker" level="warning" enabled="true"></check>
@@ -18,9 +18,7 @@
package io.archivesunleashed

import org.apache.spark.SparkContext
// scalastyle:off underscore.import
import org.apache.spark.sql._
// scalastyle:on underscore.import
import org.apache.spark.sql.DataFrame

class DataFrameLoader(sc: SparkContext) {
def extractValidPages(path: String): DataFrame = {
@@ -16,9 +16,7 @@
*/
package io.archivesunleashed.app

// scalastyle:off underscore.import
import io.archivesunleashed._
// scalastyle:on underscore.import
import io.archivesunleashed.RecordLoader
import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
@@ -16,11 +16,7 @@
*/
package io.archivesunleashed.app

// scalastyle:off underscore.import
import io.archivesunleashed._
import io.archivesunleashed.matchbox
import org.apache.spark.graphx._
// scalastyle:on underscore.import
import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD

/** Extracts a site link structure using Spark's GraphX utility. */
@@ -16,9 +16,7 @@
*/
package io.archivesunleashed.app

// scalastyle:off underscore.import
import io.archivesunleashed._
// scalastyle:on underscore.import
import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5}
import org.apache.spark.rdd.RDD
import org.apache.spark.{RangePartitioner, SparkContext}
@@ -21,9 +21,7 @@ import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamW
import io.archivesunleashed.matchbox.NERClassifier
import io.archivesunleashed.util.JsonUtils
import org.apache.hadoop.conf.Configuration
// scalastyle:off underscore.import
import org.apache.hadoop.fs._
// scalastyle:on underscore.import
import org.apache.hadoop.fs.{FileUtil, FileSystem, Path}
import org.apache.spark.SparkContext

import scala.collection.mutable.MutableList
@@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
@@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
@@ -15,9 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, WWWLink}
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import org.apache.spark.rdd.RDD
@@ -15,10 +15,7 @@
* limitations under the License.
*/
package io.archivesunleashed.app
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
import org.apache.spark.graphx._
// scalastyle:on underscore.import
import org.apache.spark.graphx.Graph
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import io.archivesunleashed.app.ExtractGraphX.{VertexData,EdgeData,VertexDataPR}
@@ -17,9 +17,7 @@

package io.archivesunleashed

// scalastyle:off underscore.import
import io.archivesunleashed.matchbox._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDomain, RemoveHTML}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import java.io.ByteArrayInputStream
@@ -23,9 +23,8 @@ object ExtractDate {
type DateComponent = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value
}
// scalastyle:off underscore.import
import DateComponent._
// scalastyle:on underscore.import

import DateComponent.{DateComponent, DD, MM, YYYY, YYYYMM}

/** Extracts the wanted date component from a date.
*
@@ -19,9 +19,7 @@ package io.archivesunleashed

import java.io.IOException
import java.security.MessageDigest
// scalastyle:off underscore.import
import scala.xml.Utility._
// scalastyle:on underscore.import
import scala.xml.Utility.escape


/** Package object which supplies implicits providing common UDF-related functionalities. */
@@ -24,11 +24,9 @@ import io.archivesunleashed.matchbox.ImageDetails
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox.ExtractDate.DateComponent._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
// scalastyle:on underscore.import
import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.{SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
@@ -31,6 +31,13 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-spark"
private var sc: SparkContext = _
private val exampleArc = "example.arc.gz"
private val exampleWarc = "example.warc.gz"
private val exampleDate = "20080430"
private val exampleUrl = "www.archive.org"
private val exampleStatusCode1 = "000"
private val exampleStatusCode2 = "200"
private val exampleMimeType = "text/plain"

before {
val conf = new SparkConf()
@@ -51,28 +58,28 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => FilenameUtils.getName(x.getArchiveFilename)).take(3)
assert(textSampleArc.deep == Array("example.arc.gz",
"example.arc.gz", "example.arc.gz").deep)
assert(textSampleWarc.deep == Array("example.warc.gz",
"example.warc.gz", "example.warc.gz").deep)
assert(textSampleArc.deep == Array(exampleArc,
exampleArc, exampleArc).deep)
assert(textSampleWarc.deep == Array(exampleWarc,
exampleWarc, exampleWarc).deep)
}

test("Crawl Dates") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getCrawlDate).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getCrawlDate).take(3)
assert(textSampleArc.deep == Array("20080430", "20080430", "20080430").deep)
assert(textSampleWarc.deep == Array("20080430", "20080430", "20080430").deep)
assert(textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
assert(textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
}

test("Domains") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getDomain).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getDomain).take(3)
assert(textSampleArc.deep == Array("", "", "www.archive.org").deep)
assert(textSampleWarc.deep == Array("", "www.archive.org", "www.archive.org").deep)
assert(textSampleArc.deep == Array("", "", exampleUrl).deep)
assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep)
}

test("Urls") {
@@ -91,17 +98,21 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
.map(x => x.getMimeType).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getMimeType).take(3)
assert (textSampleArc.deep == Array ("text/plain", "text/dns", "text/plain").deep)
assert (textSampleWarc.deep == Array("unknown", "text/plain", "text/html").deep)
assert (textSampleArc.deep == Array (exampleMimeType, "text/dns",
exampleMimeType).deep)
assert (textSampleWarc.deep == Array("unknown", exampleMimeType,
"text/html").deep)
}

test("Get Http Status") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getHttpStatus).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getHttpStatus).take(3)
assert (textSampleArc.deep == Array("000", "000", "200").deep)
assert (textSampleWarc.deep == Array("000", "200", "200").deep)
assert (textSampleArc.deep == Array(exampleStatusCode1, exampleStatusCode1,
exampleStatusCode2).deep)
assert (textSampleWarc.deep == Array(exampleStatusCode1, exampleStatusCode2,
exampleStatusCode2).deep)
}

after {
@@ -32,18 +32,28 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
private val network = Seq((("Date1", "Source1", "Destination1"), 3),
(("Date2", "Source2", "Destination2"), 4),
(("Date3", "Source3", "Destination3"), 100))
private val unescapedNetwork = Seq((("Date1", "Source1", "Destination1"), 3),
(("Date2", "Source2", "Destination2"), 4),
(("Date3", "Source<3", "Destination<3"), 100))
private val networkDf = Seq(("Date1", "Source1", "Destination1", 3),
("Date2", "Source2", "Destination2", 4),
("Date3", "Source3", "Destination3", 100))
private val networkWithDuplication = Seq((("Date1", "Source1", "Destination1"), 3),
(("Date2", "Source2", "Source2"), 4),
(("Date3", "Source3", "Destination3"), 100))
private val date1 = "Date1"
private val date2 = "Date2"
private val date3 = "Date3"
private val source1 = "Source1"
private val source2 = "Source2"
private val source3 = "Source3"
private val destination1 = "Destination1"
private val destination2 = "Destination2"
private val destination3 = "Destination3"
private val xmlDeclaration = """<?xml version="1.0" encoding="UTF-8"?>"""
private val network = Seq(((date1, source1, destination1), 3),
((date2, source2, destination2), 4),
((date3, source3, destination3), 100))
private val unescapedNetwork = Seq(((date1, source1, destination1), 3),
((date2, source2, destination2), 4),
((date3, "Source<3", "Destination<3"), 100))
private val networkDf = Seq((date1, source1, destination1, 3),
(date2, source2, destination2, 4),
(date3, source3, destination3, 100))
private val networkWithDuplication = Seq(((date1, source1, destination1), 3),
((date2, source2, source2), 4),
((date3, source3, destination3), 100))
private val testFile = "temporaryTestFile.txt"
private val testFile2 = "temporaryTestFile2.txt"

@@ -61,7 +71,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGexf(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<node id="3" label="Destination1" />""")
assert(lines(testLines._3) == """</attvalues>""")
assert(lines(testLines._4) == """</edges>""")
@@ -77,7 +87,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
val ret = WriteGraph.asGexf(networkarray, testFile)
assert(ret)
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<node id="8d3ab53ec817a1e5bf9ffd6e749b3983" label="Destination2" />""")
assert(lines(testLines._3) == """</attvalues>""")
assert(lines(testLines._4) == """</edges>""")
@@ -104,7 +114,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
test ("Nodelookup returns a option") {
val networkrdd = sc.parallelize(network)
val nodes = WriteGraph.nodesWithIds(networkrdd)
val lookup = "Source1"
val lookup = source1
val badlookup = "NOTTHERE"
assert (WriteGraph.nodeLookup(nodes, badlookup) == None)
assert (WriteGraph.nodeLookup(nodes, lookup) == Some((lookup, 6)))
@@ -115,17 +125,17 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
val nodes = WriteGraph.nodesWithIds(sc.parallelize(network))
val empty = -1
val expected = 6
val lookup = WriteGraph.nodeLookup(nodes, "Source1")
val lookup = WriteGraph.nodeLookup(nodes, source1)
val badlookup = WriteGraph.nodeLookup(nodes, "NOTTHERE")
assert (WriteGraph.nodeIdFromLabel(lookup) == expected)
assert (WriteGraph.nodeIdFromLabel(badlookup) == empty)
}

test ("Edge ids are captured from lookup") {
val edges = WriteGraph.edgeNodes(sc.parallelize(network))
val expected = Array(("Date1", 6, 3, 3),
("Date2", 7, 4, 4),
("Date3", 0, 5, 100)).deep
val expected = Array((date1, 6, 3, 3),
(date2, 7, 4, 4),
(date3, 0, 5, 100)).deep
assert(edges.collect.deep == expected)
}

@@ -135,7 +145,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGraphml(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<data key="label">Source3</data>""")
assert(lines(testLines._3) == """<data key="weight">3</data>""")
assert(lines(testLines._4) == """<edge source="0" target="5" type="directed">""")
@@ -147,7 +157,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph.asGraphml(networkrdd, testFile)
assert(Files.exists(Paths.get(testFile)))
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<data key="label">Destination&lt;3</data>""")
assert(lines(testLines._3) == """<data key="weight">100</data>""")
assert(lines(testLines._4) == """<edge source="7" target="4" type="directed">""")
@@ -159,7 +169,7 @@ class WriteGraphTest extends FunSuite with BeforeAndAfter{
WriteGraph(networkrdd, testFile2)
assert(Files.exists(Paths.get(testFile2)))
val lines = Source.fromFile(testFile2).getLines.toList
assert(lines(testLines._1) == """<?xml version="1.0" encoding="UTF-8"?>""")
assert(lines(testLines._1) == xmlDeclaration)
assert(lines(testLines._2) == """<node id="3" label="Source&lt;3" />""")
assert(lines(testLines._3) == """<edge source="7" target="4" weight="4" type="directed">""")
assert(lines(testLines._4) == """<attvalue for="0" value="Date2" />""")
@@ -33,6 +33,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
private val url = "url"
private val mime_type = "mime_type"
private val md5 = "md5"

before {
val conf = new SparkConf()
@@ -48,9 +51,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
val imageLinks = df.extractImageLinks(arcPath)
val images = df.extractImages(arcPath)

val r_1 = validPages.select("url", "mime_type").take(1)(0)
assert(r_1.getAs[String]("url") == "http://www.archive.org/")
assert(r_1.getAs[String]("mime_type") == "text/html")
val r_1 = validPages.select(url, mime_type).take(1)(0)
assert(r_1.getAs[String](url) == "http://www.archive.org/")
assert(r_1.getAs[String](mime_type) == "text/html")

val r_2 = hyperlinks.select("Dest", "Anchor").take(3)(2)
assert(r_2(0) == "http://web.archive.org/collections/web/advanced.html")
@@ -61,8 +64,8 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_3.get(1) == "http://www.archive.org/images/star.png")

val r_4 = images.take(1)(0)
assert(r_4.getAs[String]("url") == "http://www.archive.org/images/logoc.jpg")
assert(r_4.getAs[String]("md5") == "8211d1fbb9b03d8522a1ae378f9d1b24")
assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg")
assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24")
}

after {

0 comments on commit b2d7394

Please sign in to comment.
You can’t perform that action at this time.