Improve and clean up Scaladocs; resolves #184 (#193)

- update gitignore
- add site build to TravisCI config
- add scalastyle config
- improve Scaladocs in every Scala file
- incorporate @greebie's work on Scaladocs
ruebot authored and ianmilligan1 committed Apr 11, 2018
1 parent 3163ace commit 47f7a97ba5470d9273be7e4ab78045b0169f5774
Showing with 529 additions and 150 deletions.
  1. +2 −0 .gitignore
  2. +1 −0 .travis.yml
  3. +190 −0 config/checkstyle/scalastyle_config.xml
  4. +24 −0 pom.xml
  5. +1 −1 src/main/scala/io/archivesunleashed/ArchiveRecord.scala
  6. +5 −0 src/main/scala/io/archivesunleashed/ArchiveRecordImpl.scala
  7. +17 −7 src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
  8. +28 −10 src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
  9. +10 −6 src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala
  10. +31 −14 src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
  11. +14 −9 src/main/scala/io/archivesunleashed/app/WriteGEXF.scala
  12. +12 −14 src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
  13. +10 −3 src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala
  14. +6 −7 src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
  15. +7 −0 src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
  16. +7 −4 src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala
  17. +1 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala
  18. +14 −5 src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala
  19. +3 −8 src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
  20. +7 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
  21. +5 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractHashtags.scala
  22. +5 −8 src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala
  23. +7 −10 src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
  24. +1 −1 src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala
  25. +5 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractUrls.scala
  26. +16 −14 src/main/scala/io/archivesunleashed/matchbox/NER3Classifier.scala
  27. +7 −1 src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
  28. +7 −3 src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala
  29. +5 −12 src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala
  30. +1 −3 src/main/scala/io/archivesunleashed/matchbox/package.scala
  31. +50 −5 src/main/scala/io/archivesunleashed/package.scala
  32. +16 −2 src/main/scala/io/archivesunleashed/util/JsonUtils.scala
  33. +14 −3 src/main/scala/io/archivesunleashed/util/TweetUtils.scala
.gitignore
@@ -11,3 +11,5 @@ src/main/solr/lib/
.*.swp
workbench.xmi
build
derby.log
metastore_db
.travis.yml
@@ -13,6 +13,7 @@ script:
- mvn javadoc:jar
- mvn javadoc:test-aggregate
- mvn cobertura:cobertura
- mvn site

after_success:
- bash <(curl -s https://codecov.io/bash)
config/checkstyle/scalastyle_config.xml
@@ -0,0 +1,190 @@
<scalastyle commentFilter="enabled">
<name>Scalastyle standard configuration</name>
<check class="org.scalastyle.file.FileTabChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.FileLengthChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxFileLength"><![CDATA[800]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.file.HeaderMatchesChecker" level="warning" enabled="true">
<parameters>
<parameter name="header"><![CDATA[/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.SpacesAfterPlusChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.WhitespaceEndOfLineChecker" level="warning" enabled="true">
<parameters>
<parameter name="ignoreWhitespaceLines"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.SpacesBeforePlusChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.FileLineLengthChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxLineLength"><![CDATA[160]]></parameter>
<parameter name="tabSize"><![CDATA[4]]></parameter>
<parameter name="ignoreImports"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ClassNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[A-Z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ObjectNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[A-Z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.PackageObjectNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.EqualsHashCodeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.IllegalImportsChecker" level="warning" enabled="true">
<parameters>
<parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ParameterNumberChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxParameters"><![CDATA[8]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MagicNumberChecker" level="warning" enabled="true">
<parameters>
<parameter name="ignore"><![CDATA[-1,0,1,2,3]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ReturnChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NullChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoCloneChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoFinalizeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.CovariantEqualsChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.StructuralTypeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.RegexChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[println]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NumberOfTypesChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxTypes"><![CDATA[30]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.CyclomaticComplexityChecker" level="warning" enabled="true">
<parameters>
<parameter name="maximum"><![CDATA[10]]></parameter>
<parameter name="countCases"><![CDATA[true]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.UppercaseLChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.IfBraceChecker" level="warning" enabled="true">
<parameters>
<parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
<parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MethodLengthChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxLength"><![CDATA[50]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MethodNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*(_=)?$]]></parameter>
<parameter name="ignoreRegex"><![CDATA[^$]]></parameter>
<parameter name="ignoreOverride"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxMethods"><![CDATA[30]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" level="warning" enabled="true">
<parameters>
<parameter name="ignoreOverride"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.file.NewLineAtEofChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.NoNewLineAtEofChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.WhileChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.VarFieldChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.VarLocalChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.RedundantIfChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.TokenChecker" level="warning" enabled="false">
<parameters>
<parameter name="regex"><![CDATA[println]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.DeprecatedJavaChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.OverrideJavaChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.EmptyClassChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ClassTypeParameterChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[A-Z_]$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.UnderscoreImportChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.LowercasePatternMatchChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.EmptyInterpolatedStringChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.MultipleStringLiteralsChecker" level="warning" enabled="true">
<parameters>
<parameter name="allowed"><![CDATA[2]]></parameter>
<parameter name="ignoreRegex"><![CDATA[^""$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ImportGroupingChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NotImplementedErrorUsage" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.BlockImportChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.ProcedureDeclarationChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ForBraceChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ForLoopChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ScalaDocChecker" level="warning" enabled="false">
<parameters>
<parameter name="ignoreRegex"><![CDATA[^$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.DisallowSpaceAfterTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.NonASCIICharacterChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.file.IndentationChecker" level="warning" enabled="false">
<parameters>
<parameter name="tabSize"><![CDATA[2]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.FieldNamesChecker" level="warning" enabled="false">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
<parameter name="objectFieldRegex"><![CDATA[^[A-Z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.TodoCommentChecker" level="warning" enabled="true">
<parameters>
<parameter name="words"><![CDATA[TODO|FIXME]]></parameter>
</parameters>
</check>
</scalastyle>
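For illustration only (an editor-added sketch, not part of this commit), here is a small hypothetical Scala example of the kind of code several of the enabled checks flag, next to a compliant rewrite:

// Hypothetical illustration; the object and method names are made up.
object StyleExamples {
  // Flagged: println usage (RegexChecker), a null literal (NullChecker),
  // and a public method with no explicit result type (PublicMethodsHaveTypeChecker).
  def firstOrNull(xs: List[String]) = {
    println("checking list")
    if (xs.isEmpty) null else xs.head
  }

  // Compliant rewrite: explicit result type, Option instead of null, no println.
  def firstOption(xs: List[String]): Option[String] = xs.headOption
}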
pom.xml
@@ -177,6 +177,30 @@
</execution>
</executions>
</plugin>
<!-- for scalastyle -->
<plugin>
<groupId>org.scalastyle</groupId>
<artifactId>scalastyle-maven-plugin</artifactId>
<version>1.0.0</version>
<configuration>
<verbose>false</verbose>
<failOnViolation>true</failOnViolation>
<includeTestSourceDirectory>true</includeTestSourceDirectory>
<failOnWarning>false</failOnWarning>
<sourceDirectory>${project.basedir}/src/main/scala</sourceDirectory>
<testSourceDirectory>${project.basedir}/src/test/scala</testSourceDirectory>
<configLocation>${project.basedir}/config/checkstyle/scalastyle_config.xml</configLocation>
<outputFile>${project.basedir}/target/scalastyle-output.xml</outputFile>
<outputEncoding>UTF-8</outputEncoding>
</configuration>
<executions>
<execution>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- for codecov.io -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
src/main/scala/io/archivesunleashed/ArchiveRecord.scala
@@ -33,4 +33,4 @@ trait ArchiveRecord extends Serializable {
def getDomain: String

def getImageBytes: Array[Byte]
}
src/main/scala/io/archivesunleashed/ArchiveRecordImpl.scala
@@ -29,6 +29,11 @@ import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
import org.archive.util.ArchiveUtils

/** Used by RecordLoader to extract data from WARC and ARC files.
*
* @constructor create an archive record from a wrapped ArchiveRecordWritable.
* @param r the serializable record wrapping an ArchiveRecordWritable.
*/
class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord {
var arcRecord: ARCRecord = null
var warcRecord: WARCRecord = null
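For context, a brief editor-added sketch (not part of the diff) of how these records are typically consumed; the archive path is a placeholder and sc is assumed to be an existing SparkContext:

import io.archivesunleashed._
import org.apache.spark.SparkContext

// Load ARC/WARC files and read a few of the fields exposed by ArchiveRecord.
def sampleDomains(sc: SparkContext): Array[(String, String)] =
  RecordLoader.loadArchives("/path/to/warcs/*.warc.gz", sc)
    .keepValidPages()
    .map(r => (r.getCrawlDate, r.getDomain))
    .take(10)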
src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
@@ -21,27 +21,35 @@ import io.archivesunleashed.matchbox.{NER3Classifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/** Performs Named Entity Recognition (NER) on a WARC or ARC file.
*
* [[http://nlp.stanford.edu/software/CRF-NER.html Named Entity Recognition]]
* applies rules formed in a [[https://stanfordnlp.github.io/CoreNLP Named
* Entity Classifier]] to identify locations, people or other objects from data.
*/
object ExtractEntities {

/** Extracts named entities from WARC or ARC files at a given path to a given output directory.
*
* @param iNerClassifierFile path to NER classifier file
* @param inputRecordFile path of ARC or WARC file from which to extract entities
* @param outputFile path of output directory
* @param sc the Apache Spark context
* @return an RDD of tuples with classification entities.
*/
def extractFromRecords(iNerClassifierFile: String, inputRecordFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
val rdd = RecordLoader.loadArchives(inputRecordFile, sc)
.map(r => (r.getCrawlDate, r.getUrl, RemoveHTML(r.getContentString)))
extractAndOutput(iNerClassifierFile, rdd, outputFile)
}

/** Extracts named entities from tuple-formatted derivatives scraped from a website.
*
* @param iNerClassifierFile path of classifier file
* @param inputFile path of file containing tuples (date: String, url: String, content: String)
* from which to extract entities
* @param outputFile path of output directory
* @param sc the Apache Spark context
* @return an RDD of tuples with classification entities.
*/
def extractFromScrapeText(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
val rdd = sc.textFile(inputFile)
@@ -55,10 +63,12 @@ object ExtractEntities {
extractAndOutput(iNerClassifierFile, rdd, outputFile)
}

/** Saves the NER output to file from a given RDD.
*
* @param iNerClassifierFile path of classifier file
* @param rdd RDD of tuples (date, url, content) from which to extract entities
* @param outputFile path of output directory
* @return an RDD of tuples with classification entities extracted.
*/
def extractAndOutput(iNerClassifierFile: String, rdd: RDD[(String, String, String)], outputFile: String): RDD[(String, String, String)] = {
val r = rdd.mapPartitions(iter => {
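A brief editor-added usage sketch for the entry point documented above (not part of the commit); the classifier, input, and output paths are placeholders and sc is an existing SparkContext:

import io.archivesunleashed.app.ExtractEntities
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Run NER over a set of WARC files; results are written under the output path
// and also returned as an RDD of (date, url, entities) tuples.
def runNer(sc: SparkContext): RDD[(String, String, String)] =
  ExtractEntities.extractFromRecords(
    "/path/to/classifiers/english.all.3class.distsim.crf.ser.gz",
    "/path/to/warcs/*.warc.gz",
    "/path/to/output/ner",
    sc)

The same pattern applies to extractFromScrapeText when starting from previously derived (date, url, content) tuples rather than raw archives.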
src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
@@ -22,22 +22,31 @@ import io.archivesunleashed.util.JsonUtils
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

/** Extracts a network graph using Spark's GraphX utility.
*
* Once a job completes, the per-partition output can be assembled into single files, e.g.:
* $ cat nodes.partjson/part-* > nodes.json && cat links.partjson/part-* > links.json
* $ jq -c -n --slurpfile nodes nodes.json --slurpfile links links.json '{nodes: $nodes, links: $links}' > graph.json
*/
object ExtractGraph {

/** Creates a hashcode from a url to use as a unique id.
*
* @param url the page URL to hash
* @return unique id as long integer.
*/
def pageHash(url: String): VertexId = {
url.hashCode.toLong
}

case class VertexData(domain: String, pageRank: Double, inDegree: Int, outDegree: Int)
case class EdgeData(date: String, src: String, dst: String)

/** Creates a network graph from loaded Archive Records with optional pageRank calculations.
*
* @param records an RDD of archive records
* @param dynamic whether to calculate PageRank (an O(n^2) calculation, so not
* recommended for very large graphs)
* @param tolerance the percentage of the time the PR algorithm "jumps" to
* a random location in its random walks
* @param numIter the number of iterations applied to the PR algorithm
* @return a Graph object containing data for vertices and edges as extracted.
*/
def apply(records: RDD[ArchiveRecord], dynamic: Boolean = false,
tolerance: Double = 0.005, numIter: Int = 20): Graph[VertexData, EdgeData] = {
val extractedLinks = records.keepValidPages()
@@ -73,7 +82,18 @@ object ExtractGraph {
}
}

/** Writes a Graph object to JSON files.
*
* @constructor graph - a GraphX graph object containing vertex and edge data
*/
implicit class GraphWriter(graph: Graph[VertexData, EdgeData]) {
/** Writes a graph object to json files containing vertex and edge data.
*
* @param verticesPath Filepath for vertices output
* @param edgesPath Filepath for edges output
* @return Unit().
*/
def writeAsJson(verticesPath: String, edgesPath: String) = {
// Combine edges of a given (date, src, dst) combination into a single record with a count value.
val edgesCounted = graph.edges.countItems().map {
@@ -82,10 +102,8 @@ object ExtractGraph {
"dst" -> r._1.attr.dst,
"count" -> r._2)
}

edgesCounted.map(r => JsonUtils.toJson(r)).saveAsTextFile(edgesPath)
graph.vertices.map(r => JsonUtils.toJson(r._2)).saveAsTextFile(verticesPath)
}
}
}
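Putting the pieces above together, an editor-added sketch (not part of the commit); paths are placeholders, sc is an existing SparkContext, and the part-file output can then be assembled with the jq commands noted in the comment at the top of the file:

import io.archivesunleashed._
import io.archivesunleashed.app.ExtractGraph
import io.archivesunleashed.app.ExtractGraph._ // brings the implicit GraphWriter into scope
import org.apache.spark.SparkContext

def buildGraph(sc: SparkContext): Unit = {
  val records = RecordLoader.loadArchives("/path/to/warcs/*.warc.gz", sc)
  // dynamic = true enables the PageRank calculation described above.
  val graph = ExtractGraph(records, dynamic = true, tolerance = 0.005, numIter = 20)
  graph.writeAsJson("nodes.partjson", "links.partjson")
}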
