Skip to content
Permalink
Browse files

Enable S3 access (#332)

* Update POM to access data stored in Amazon S3, per #319
* In RecordLoader, detect the FileSystem based on the path.
* Resolves #319
  • Loading branch information...
jrwiebe authored and ruebot committed Jul 25, 2019
1 parent 19b49e1 commit 64c1f1f039cb929adddeea78c1271971d7ff8aca
Showing with 11 additions and 2 deletions.
  1. +8 −1 pom.xml
  2. +3 −1 src/main/scala/io/archivesunleashed/package.scala
@@ -116,7 +116,9 @@
<shadedClassifierName>fatjar</shadedClassifierName>
<artifactSet>
<excludes>
-<exclude>org.apache.hadoop:*</exclude>
+<exclude>org.apache.hadoop:hadoop-core</exclude>
+<exclude>org.apache.hadoop:hadoop-common</exclude>
+<exclude>org.apache.hadoop:hadoop-mapreduce-client-core</exclude>
<exclude>org.apache.spark:*</exclude>
</excludes>
</artifactSet>
@@ -738,6 +740,11 @@
</exclusion>
</exclusions>
</dependency>
+<dependency>
+<groupId>org.apache.hadoop</groupId>
+<artifactId>hadoop-aws</artifactId>
+<version>${hadoop.version}</version>
+</dependency>
</dependencies>

<developers>
@@ -22,6 +22,7 @@ import ArchiveRecordWritable.ArchiveFormat
import io.archivesunleashed.matchbox.{ComputeMD5, DetectLanguage, ExtractDate, ExtractDomain, ExtractImageDetails, ExtractImageLinks, ExtractLinks, RemoveHTML}
import io.archivesunleashed.matchbox.ImageDetails
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
+import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}
// scalastyle:off underscore.import
import io.archivesunleashed.matchbox.ExtractDate.DateComponent._
@@ -59,7 +60,8 @@ package object archivesunleashed {
* @return an RDD of ArchiveRecords for mapping.
*/
def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
-val fs = FileSystem.get(sc.hadoopConfiguration)
+val uri = new URI(path)
+val fs = FileSystem.get(uri, sc.hadoopConfiguration)
val p = new Path(path)
sc.newAPIHadoopFile(getFiles(p, fs), classOf[ArchiveRecordInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable])
.filter(r => (r._2.getFormat == ArchiveFormat.ARC) ||

0 comments on commit 64c1f1f

Please sign in to comment.
You can’t perform that action at this time.