Improve and clean up Scaladocs; resolves #184 (#193)

- update gitignore
- add site build to TravisCI config
- add scalastyle config
- improve scala docs on every scala file
- incorporate @greebie's work on scaladocs
ruebot authored and ianmilligan1 committed Apr 11, 2018
1 parent 3163ace commit 47f7a97ba5470d9273be7e4ab78045b0169f5774
Showing with 529 additions and 150 deletions.
  1. +2 −0 .gitignore
  2. +1 −0 .travis.yml
  3. +190 −0 config/checkstyle/scalastyle_config.xml
  4. +24 −0 pom.xml
  5. +1 −1 src/main/scala/io/archivesunleashed/ArchiveRecord.scala
  6. +5 −0 src/main/scala/io/archivesunleashed/ArchiveRecordImpl.scala
  7. +17 −7 src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
  8. +28 −10 src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
  9. +10 −6 src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala
  10. +31 −14 src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
  11. +14 −9 src/main/scala/io/archivesunleashed/app/WriteGEXF.scala
  12. +12 −14 src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
  13. +10 −3 src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala
  14. +6 −7 src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
  15. +7 −0 src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
  16. +7 −4 src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala
  17. +1 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractAtMentions.scala
  18. +14 −5 src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala
  19. +3 −8 src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
  20. +7 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
  21. +5 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractHashtags.scala
  22. +5 −8 src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala
  23. +7 −10 src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
  24. +1 −1 src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala
  25. +5 −0 src/main/scala/io/archivesunleashed/matchbox/ExtractUrls.scala
  26. +16 −14 src/main/scala/io/archivesunleashed/matchbox/NER3Classifier.scala
  27. +7 −1 src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
  28. +7 −3 src/main/scala/io/archivesunleashed/matchbox/RemoveHttpHeader.scala
  29. +5 −12 src/main/scala/io/archivesunleashed/matchbox/TupleFormatter.scala
  30. +1 −3 src/main/scala/io/archivesunleashed/matchbox/package.scala
  31. +50 −5 src/main/scala/io/archivesunleashed/package.scala
  32. +16 −2 src/main/scala/io/archivesunleashed/util/JsonUtils.scala
  33. +14 −3 src/main/scala/io/archivesunleashed/util/TweetUtils.scala
@@ -11,3 +11,5 @@ src/main/solr/lib/
.*.swp
workbench.xmi
build
derby.log
metastore_db
@@ -13,6 +13,7 @@ script:
- mvn javadoc:jar
- mvn javadoc:test-aggregate
- mvn cobertura:cobertura
- mvn site

after_success:
- bash <(curl -s https://codecov.io/bash)
@@ -0,0 +1,190 @@
<scalastyle commentFilter="enabled">
<name>Scalastyle standard configuration</name>
<check class="org.scalastyle.file.FileTabChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.FileLengthChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxFileLength"><![CDATA[800]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.file.HeaderMatchesChecker" level="warning" enabled="true">
<parameters>
<parameter name="header"><![CDATA[/*
* Archives Unleashed Toolkit (AUT):
* An open-source platform for analyzing web archives.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.SpacesAfterPlusChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.WhitespaceEndOfLineChecker" level="warning" enabled="true">
<parameters>
<parameter name="ignoreWhitespaceLines"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.SpacesBeforePlusChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.FileLineLengthChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxLineLength"><![CDATA[160]]></parameter>
<parameter name="tabSize"><![CDATA[4]]></parameter>
<parameter name="ignoreImports"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ClassNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[A-Z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ObjectNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[A-Z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.PackageObjectNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.EqualsHashCodeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.IllegalImportsChecker" level="warning" enabled="true">
<parameters>
<parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ParameterNumberChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxParameters"><![CDATA[8]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MagicNumberChecker" level="warning" enabled="true">
<parameters>
<parameter name="ignore"><![CDATA[-1,0,1,2,3]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ReturnChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NullChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoCloneChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NoFinalizeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.CovariantEqualsChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.StructuralTypeChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.RegexChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[println]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NumberOfTypesChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxTypes"><![CDATA[30]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.CyclomaticComplexityChecker" level="warning" enabled="true">
<parameters>
<parameter name="maximum"><![CDATA[10]]></parameter>
<parameter name="countCases"><![CDATA[true]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.UppercaseLChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.IfBraceChecker" level="warning" enabled="true">
<parameters>
<parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
<parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MethodLengthChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxLength"><![CDATA[50]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.MethodNamesChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*(_=)?$]]></parameter>
<parameter name="ignoreRegex"><![CDATA[^$]]></parameter>
<parameter name="ignoreOverride"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" level="warning" enabled="true">
<parameters>
<parameter name="maxMethods"><![CDATA[30]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" level="warning" enabled="true">
<parameters>
<parameter name="ignoreOverride"><![CDATA[false]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.file.NewLineAtEofChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.file.NoNewLineAtEofChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.WhileChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.VarFieldChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.VarLocalChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.RedundantIfChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.TokenChecker" level="warning" enabled="false">
<parameters>
<parameter name="regex"><![CDATA[println]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.DeprecatedJavaChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.OverrideJavaChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.EmptyClassChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ClassTypeParameterChecker" level="warning" enabled="true">
<parameters>
<parameter name="regex"><![CDATA[^[A-Z_]$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.UnderscoreImportChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.LowercasePatternMatchChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.EmptyInterpolatedStringChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.MultipleStringLiteralsChecker" level="warning" enabled="true">
<parameters>
<parameter name="allowed"><![CDATA[2]]></parameter>
<parameter name="ignoreRegex"><![CDATA[^""$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.ImportGroupingChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.NotImplementedErrorUsage" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.BlockImportChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.ProcedureDeclarationChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ForBraceChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ForLoopChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" level="warning" enabled="true"></check>
<check class="org.scalastyle.scalariform.ScalaDocChecker" level="warning" enabled="false">
<parameters>
<parameter name="ignoreRegex"><![CDATA[^$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.DisallowSpaceAfterTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.scalariform.NonASCIICharacterChecker" level="warning" enabled="false"></check>
<check class="org.scalastyle.file.IndentationChecker" level="warning" enabled="false">
<parameters>
<parameter name="tabSize"><![CDATA[2]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.FieldNamesChecker" level="warning" enabled="false">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
<parameter name="objectFieldRegex"><![CDATA[^[A-Z][A-Za-z]*$]]></parameter>
</parameters>
</check>
<check class="org.scalastyle.scalariform.TodoCommentChecker" level="warning" enabled="true">
<parameters>
<parameter name="words"><![CDATA[TODO|FIXME]]></parameter>
</parameters>
</check>
</scalastyle>
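
To illustrate what the configuration above enforces, here is a minimal, hypothetical Scala sketch (not part of the commit; the license-header check is ignored for brevity). The first method trips several of the enabled checks (NullChecker, MagicNumberChecker, ReturnChecker, and the println RegexChecker), while the constant and second method show a compliant equivalent. The pom.xml addition below binds the plugin's check goal, so these rules should run during the Maven build (e.g. via mvn scalastyle:check).

object StyleDemo {
  // Flagged: null assignment (NullChecker), magic number 42 (MagicNumberChecker),
  // println (RegexChecker), and an explicit return (ReturnChecker).
  def bad(limit: Int): String = {
    var label: String = null
    if (limit > 42) { println("too big"); label = "big" }
    return label
  }

  // Compliant: named constant instead of a magic number, no null, no return, no println.
  val MaxLimit: Int = 42
  def good(limit: Int): Option[String] =
    if (limit > MaxLimit) Some("too big") else None
}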
@@ -177,6 +177,30 @@
</execution>
</executions>
</plugin>
<!-- for scalastyle -->
<plugin>
<groupId>org.scalastyle</groupId>
<artifactId>scalastyle-maven-plugin</artifactId>
<version>1.0.0</version>
<configuration>
<verbose>false</verbose>
<failOnViolation>true</failOnViolation>
<includeTestSourceDirectory>true</includeTestSourceDirectory>
<failOnWarning>false</failOnWarning>
<sourceDirectory>${project.basedir}/src/main/scala</sourceDirectory>
<testSourceDirectory>${project.basedir}/src/test/scala</testSourceDirectory>
<configLocation>${project.basedir}/config/checkstyle/scalastyle_config.xml</configLocation>
<outputFile>${project.basedir}/target/scalastyle-output.xml</outputFile>
<outputEncoding>UTF-8</outputEncoding>
</configuration>
<executions>
<execution>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- for codecov.io -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
@@ -33,4 +33,4 @@ trait ArchiveRecord extends Serializable {
def getDomain: String

def getImageBytes: Array[Byte]
}
}
@@ -29,6 +29,11 @@ import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
import org.archive.util.ArchiveUtils

/** Used by RecordLoader to extract data from WARC and ARC files.
*
* @constructor an archive record.
* @param r the serializable ArchiveRecordWritable record to wrap
*/
class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord {
var arcRecord: ARCRecord = null
var warcRecord: WARCRecord = null
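
For context on the accessors documented here, a minimal usage sketch follows (not part of the commit). It assumes a spark-shell session with AUT on the classpath, sc as the Spark context, a placeholder archive path, and the implicits from the io.archivesunleashed package object in scope via the wildcard import.

import io.archivesunleashed._

// Placeholder path; any ARC or WARC file should work here.
val records = RecordLoader.loadArchives("/path/to/example.warc.gz", sc)
records
  .keepValidPages()
  .map(r => (r.getCrawlDate, r.getDomain, r.getUrl))
  .take(10)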
@@ -21,27 +21,35 @@ import io.archivesunleashed.matchbox.{NER3Classifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/**
* Extracts entities
/** Performs Named Entity Recognition (NER) on a WARC or ARC file.
*
* [[http://nlp.stanford.edu/software/CRF-NER.html Named Entity Recognition]]
* applies rules formed in a [[https://stanfordnlp.github.io/CoreNLP Named
* Entity Classifier]] to identify locations, people or other objects from data.
*/
object ExtractEntities {

/**
* @param iNerClassifierFile path of classifier file
/** Extracts named entities from WARC or ARC files at a given path to a given output directory.
*
* @param iNerClassifierFile path to NER classifier file
* @param inputRecordFile path of ARC or WARC file from which to extract entities
* @param outputFile path of output directory
* @param sc the Apache Spark context
* @return an rdd with classification entities.
*/
def extractFromRecords(iNerClassifierFile: String, inputRecordFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
val rdd = RecordLoader.loadArchives(inputRecordFile, sc)
.map(r => (r.getCrawlDate, r.getUrl, RemoveHTML(r.getContentString)))
extractAndOutput(iNerClassifierFile, rdd, outputFile)
}

/**
/** Extracts named entities from tuple-formatted derivatives scraped from a website.
*
* @param iNerClassifierFile path of classifier file
* @param inputFile path of file with tuples (date: String, url: String, content: String)
* @param inputFile path of file containing tuples (date: String, url: String, content: String)
* from which to extract entities
* @param outputFile path of output directory
* @return an rdd with classification entities.
*/
def extractFromScrapeText(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
val rdd = sc.textFile(inputFile)
@@ -55,10 +63,12 @@ object ExtractEntities {
extractAndOutput(iNerClassifierFile, rdd, outputFile)
}

/**
/** Saves the NER output to file from a given RDD.
*
* @param iNerClassifierFile path of classifier file
* @param rdd with values (date, url, content)
* @param outputFile path of output directory
* @return an rdd of tuples with classification entities extracted.
*/
def extractAndOutput(iNerClassifierFile: String, rdd: RDD[(String, String, String)], outputFile: String): RDD[(String, String, String)] = {
val r = rdd.mapPartitions(iter => {
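
A usage sketch for the API documented above (not part of the commit; all paths are placeholders, and the Stanford classifier file name is an assumption). The call returns an RDD of (date, url, entities) tuples and saves the NER output under the given output directory.

import io.archivesunleashed.app.ExtractEntities

// Placeholder paths; the classifier is assumed to be the usual Stanford 3-class model.
val entities = ExtractEntities.extractFromRecords(
  iNerClassifierFile = "/path/to/english.all.3class.distsim.crf.ser.gz",
  inputRecordFile = "/path/to/example.warc.gz",
  outputFile = "/path/to/ner-output/",
  sc = sc)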
@@ -22,22 +22,31 @@ import io.archivesunleashed.util.JsonUtils
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

/**
*
* e.g. when done:
* $ cat nodes.partjson/part-* > nodes.json && cat links.partjson/part-* > links.json
* $ jq -c -n --slurpfile nodes nodes.json --slurpfile links links.json '{nodes: $nodes, links: $links}' > graph.json
*
*/

/** Extracts a network graph using Spark's GraphX utility. */
object ExtractGraph {

/** Creates a hashcode from a url to use as a unique id.
*
* @param url the url from which the hash is generated
* @return unique id as long integer.
*/
def pageHash(url: String): VertexId = {
url.hashCode.toLong
}

case class VertexData(domain: String, pageRank: Double, inDegree: Int, outDegree: Int)
case class EdgeData(date: String, src: String, dst: String)

/** Creates a network graph from loaded Archive Records with optional pageRank calculations.
*
* @param records an RDD of archive records
* @param dynamic whether to calculate PageRank (an O(n^2) calculation, so not
* recommended for very large graphs)
* @param tolerance the percentage of the time the PR algorithm "jumps" to
* a random location in its random walks
* @param numIter the number of iterations applied to the PR algorithm
* @return a Graph object containing data for vertices and edges as extracted.
*/
def apply(records: RDD[ArchiveRecord], dynamic: Boolean = false,
tolerance: Double = 0.005, numIter: Int = 20): Graph[VertexData, EdgeData] = {
val extractedLinks = records.keepValidPages()
@@ -73,7 +82,18 @@ object ExtractGraph {
}
}

/** Writes a Graph object to a Json file.
*
* @constructor graph - a GraphX graph object containing vertex and edge data
* @return Unit().
*/
implicit class GraphWriter(graph: Graph[VertexData, EdgeData]) {
/** Writes a graph object to json files containing vertex and edge data.
*
* @param verticesPath Filepath for vertices output
* @param edgesPath Filepath for edges output
* @return Unit().
*/
def writeAsJson(verticesPath: String, edgesPath: String) = {
// Combine edges of a given (date, src, dst) combination into single record with count value.
val edgesCounted = graph.edges.countItems().map {
@@ -82,10 +102,8 @@ object ExtractGraph {
"dst" -> r._1.attr.dst,
"count" -> r._2)
}

edgesCounted.map(r => JsonUtils.toJson(r)).saveAsTextFile(edgesPath)
graph.vertices.map(r => JsonUtils.toJson(r._2)).saveAsTextFile(verticesPath)
}
}
}
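
A usage sketch tying the graph pieces together (not part of the commit; paths are placeholders and sc is the Spark context). Because the GraphWriter implicit class lives inside the ExtractGraph object, it has to be imported for graph.writeAsJson to resolve; the resulting part-files can then be combined into nodes.json and links.json with the cat/jq commands shown earlier in this diff.

import io.archivesunleashed._
import io.archivesunleashed.app.ExtractGraph
import io.archivesunleashed.app.ExtractGraph._  // brings the GraphWriter implicit into scope

// Placeholder input path; dynamic = true turns on the PageRank calculation.
val records = RecordLoader.loadArchives("/path/to/example.warc.gz", sc)
val graph = ExtractGraph(records, dynamic = true, tolerance = 0.005, numIter = 20)
graph.writeAsJson("/path/to/nodes.partjson", "/path/to/links.partjson")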
