Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Add crawl_date to binary DataFrames and imageLinks. (#414)

- Resolves #413
- Update tests where necessary
  • Loading branch information
ruebot authored and ianmilligan1 committed Jan 18, 2020
1 parent 9277e68 commit 9e357cca1c66566a8446d9b2f1692025e5494777
@@ -363,7 +363,7 @@ package object archivesunleashed {
 val records = rdd
 .keepValidPages()
 .flatMap(r => ExtractLinksRDD(r.getUrl, r.getContentString)
-.map(t => (r.getCrawlDate, t._1, t._2, t._3)))
+.map(t => (r.getCrawlDate, t._1, t._2, t._3)))
 .filter(t => t._2 != "" && t._3 != "")
 .map(t => Row(t._1, t._2, t._3, t._4))

@@ -381,14 +381,16 @@ package object archivesunleashed {
 def imageLinks(): DataFrame = {
 val records = rdd
 .keepValidPages()
-.flatMap(r => {
+.flatMap(r => ({
 val src = r.getUrl
 val imageUrls = ExtractImageLinksRDD(src, r.getContentString)
 imageUrls.map(url => (src, url))
 })
-.map(t => Row(t._1, t._2))
+.map(t => (r.getCrawlDate, t._1, t._2)))
+.map(t => Row(t._1, t._2, t._3))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("src", StringType, true))
 .add(StructField("image_url", StringType, true))

@@ -406,12 +408,13 @@ package object archivesunleashed {
 val url = new URL(r.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = GetExtensionMimeRDD(url.getPath(), mimeTypeTika)
-(r.getUrl, filename, extension, r.getMimeType, mimeTypeTika,
+(r.getCrawlDate, r.getUrl, filename, extension, r.getMimeType, mimeTypeTika,
 image.width, image.height, image.md5Hash, image.sha1Hash, image.body)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10, t._11))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -442,12 +445,13 @@ package object archivesunleashed {
 val url = new URL(r._1.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = GetExtensionMimeRDD(url.getPath(), r._2)
-(r._1.getUrl, filename, extension, r._1.getMimeType,
+(r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
 DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -476,12 +480,13 @@ package object archivesunleashed {
 val url = new URL(r._1.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = GetExtensionMimeRDD(url.getPath(), r._2)
-(r._1.getUrl, filename, extension, r._1.getMimeType,
+(r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
 DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -510,12 +515,13 @@ package object archivesunleashed {
 val url = new URL(r._1.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = GetExtensionMimeRDD(url.getPath(), r._2)
-(r._1.getUrl, filename, extension, r._1.getMimeType,
+(r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
 DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -577,12 +583,13 @@ package object archivesunleashed {
 }
 }
 val extension = GetExtensionMimeRDD(url.getPath(), mimeType)
-(r._1.getUrl, filename, extension, r._1.getMimeType,
+(r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
 DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -623,12 +630,13 @@ package object archivesunleashed {
 val url = new URL(r._1.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = GetExtensionMimeRDD(url.getPath(), r._2)
-(r._1.getUrl, filename, extension, r._1.getMimeType,
+(r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
 DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -674,12 +682,13 @@ package object archivesunleashed {
 val url = new URL(r._1.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = GetExtensionMimeRDD(url.getPath(), r._2)
-(r._1.getUrl, filename, extension, r._1.getMimeType,
+(r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
 DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -714,12 +723,13 @@ package object archivesunleashed {
 val url = new URL(r.getUrl)
 val filename = FilenameUtils.getName(url.getPath())
 val extension = FilenameUtils.getExtension(url.getPath())
-(r.getUrl, filename, extension, r.getMimeType,
+(r.getCrawlDate, r.getUrl, filename, extension, r.getMimeType,
 DetectMimeTypeTika(r.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
 })
-.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))
+.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))

 val schema = new StructType()
+.add(StructField("crawl_date", StringType, true))
 .add(StructField("url", StringType, true))
 .add(StructField("filename", StringType, true))
 .add(StructField("extension", StringType, true))
@@ -68,8 +68,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
 assert(r_2(1) == "Advanced Search")

 val r_3 = imageLinks.take(100)(99)
-assert(r_3.get(0) == "http://www.archive.org/details/secretarmiesb00spivrich")
-assert(r_3.get(1) == "http://www.archive.org/images/star.png")
+assert(r_3.get(0) == "20080430")
+assert(r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich")
+assert(r_3.get(2) == "http://www.archive.org/images/star.png")

 val r_4 = images.take(1)(0)
 assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg")

0 comments on commit 9e357cc

Please sign in to comment.
You can’t perform that action at this time.