Skip to content
Permalink
Browse files

Use Tika's detected MIME type instead of ArchiveRecord getMimeType. (#344)

- Move audio, PDF, and video DataFrame extraction to a map over (record, Tika MIME type) tuples
- Provide two MIME type columns: mime_type_web_server and mime_type_tika
- Update tests
- Resolves #342
  • Loading branch information...
ruebot authored and ianmilligan1 committed Aug 14, 2019
1 parent 54c0c3e commit 01d12b45a3b2209f8ff3cf85bf703cffdf91d94f
@@ -106,7 +106,7 @@ package object archivesunleashed {
val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
.add(StructField("url", StringType, true))
.add(StructField("mime_type", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("content", StringType, true))

val sqlContext = SparkSession.builder()
@@ -154,13 +154,15 @@ package object archivesunleashed {
.keepImages()
.map(r => {
val image = ExtractImageDetails(r.getUrl, r.getMimeType, r.getBinaryBytes)
(r.getUrl, r.getMimeType, image.width, image.height, image.hash, image.body)
(r.getUrl, r.getMimeType, DetectMimeTypeTika(r.getBinaryBytes),
image.width, image.height, image.hash, image.body)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))

val schema = new StructType()
.add(StructField("url", StringType, true))
.add(StructField("mime_type", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("width", IntegerType, true))
.add(StructField("height", IntegerType, true))
.add(StructField("md5", StringType, true))
@@ -173,23 +175,28 @@ package object archivesunleashed {
/* Extract PDF bytes and PDF metadata. */
def extractPDFDetailsDF(): DataFrame = {
val records = rdd
.filter(r => (DetectMimeTypeTika(r.getBinaryBytes) == "application/pdf"))
.map(r =>
(r, (DetectMimeTypeTika(r.getBinaryBytes)))
)
.filter(r => r._2 == "application/pdf")
.map(r => {
val bytes = r.getBinaryBytes
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r.getUrl)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = FilenameUtils.getExtension(url.getPath())
(r.getUrl, filename, extension, r.getMimeType, hash, encodedBytes)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))

val schema = new StructType()
.add(StructField("url", StringType, true))
.add(StructField("filename", StringType, true))
.add(StructField("extension", StringType, true))
.add(StructField("mime_type", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("bytes", StringType, true))

@@ -200,35 +207,40 @@ package object archivesunleashed {
/* Extract audio bytes and audio metadata. */
def extractAudioDetailsDF(): DataFrame = {
val records = rdd
.filter(r => (DetectMimeTypeTika(r.getBinaryBytes).startsWith("audio/"))
|| r.getUrl.endsWith("aac")
|| r.getUrl.endsWith("mid")
|| r.getUrl.endsWith("midi")
|| r.getUrl.endsWith("mp3")
|| r.getUrl.endsWith("wav")
|| r.getUrl.endsWith("oga")
|| r.getUrl.endsWith("ogg")
|| r.getUrl.endsWith("weba")
|| r.getUrl.endsWith("ra")
|| r.getUrl.endsWith("rm")
|| r.getUrl.endsWith("3gp")
|| r.getUrl.endsWith("3g2"))
.map(r =>
(r, (DetectMimeTypeTika(r.getBinaryBytes)))
)
.filter(r => r._2.startsWith("audio/")
|| r._1.getUrl.endsWith("aac")
|| r._1.getUrl.endsWith("mid")
|| r._1.getUrl.endsWith("midi")
|| r._1.getUrl.endsWith("mp3")
|| r._1.getUrl.endsWith("wav")
|| r._1.getUrl.endsWith("oga")
|| r._1.getUrl.endsWith("ogg")
|| r._1.getUrl.endsWith("weba")
|| r._1.getUrl.endsWith("ra")
|| r._1.getUrl.endsWith("rm")
|| r._1.getUrl.endsWith("3gp")
|| r._1.getUrl.endsWith("3g2"))
.map(r => {
val bytes = r.getBinaryBytes
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r.getUrl)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = FilenameUtils.getExtension(url.getPath())
(r.getUrl, filename, extension, r.getMimeType, hash, encodedBytes)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))

val schema = new StructType()
.add(StructField("url", StringType, true))
.add(StructField("filename", StringType, true))
.add(StructField("extension", StringType, true))
.add(StructField("mime_type", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("bytes", StringType, true))

@@ -239,35 +251,40 @@ package object archivesunleashed {
/* Extract video bytes and video metadata. */
def extractVideoDetailsDF(): DataFrame = {
val records = rdd
.filter(r => (DetectMimeTypeTika(r.getBinaryBytes).startsWith("video/"))
|| r.getUrl.endsWith("flv")
|| r.getUrl.endsWith("mp4")
|| r.getUrl.endsWith("mov")
|| r.getUrl.endsWith("avi")
|| r.getUrl.endsWith("wmv")
|| r.getUrl.endsWith("rv")
|| r.getUrl.endsWith("mpeg")
|| r.getUrl.endsWith("ogv")
|| r.getUrl.endsWith("webm")
|| r.getUrl.endsWith("ts")
|| r.getUrl.endsWith("3gp")
|| r.getUrl.endsWith("3g2"))
.map(r =>
(r, (DetectMimeTypeTika(r.getBinaryBytes)))
)
.filter(r => r._2.startsWith("video/")
|| r._1.getUrl.endsWith("flv")
|| r._1.getUrl.endsWith("mp4")
|| r._1.getUrl.endsWith("mov")
|| r._1.getUrl.endsWith("avi")
|| r._1.getUrl.endsWith("wmv")
|| r._1.getUrl.endsWith("rv")
|| r._1.getUrl.endsWith("mpeg")
|| r._1.getUrl.endsWith("ogv")
|| r._1.getUrl.endsWith("webm")
|| r._1.getUrl.endsWith("ts")
|| r._1.getUrl.endsWith("3gp")
|| r._1.getUrl.endsWith("3g2"))
.map(r => {
val bytes = r.getBinaryBytes
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r.getUrl)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = FilenameUtils.getExtension(url.getPath())
(r.getUrl, filename, extension, r.getMimeType, hash, encodedBytes)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))

val schema = new StructType()
.add(StructField("url", StringType, true))
.add(StructField("filename", StringType, true))
.add(StructField("extension", StringType, true))
.add(StructField("mime_type", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("bytes", StringType, true))

@@ -34,7 +34,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
private val appName = "example-df"
private var sc: SparkContext = _
private val url = "url"
private val mime_type = "mime_type"
private val mime_type = "mime_type_web_server"
private val md5 = "md5"

before {
@@ -46,14 +46,16 @@ class ExtractAudioDetailsTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(warcPath, sc)
.extractAudioDetailsDF()

val extracted = df.select("url", "filename", "extension", "mime_type", "md5")
val extracted = df.select("url", "filename", "extension",
"mime_type_web_server", "mime_type_tika", "md5")
.orderBy(desc("md5")).head(1).toList
assert(extracted.size == 1)
assert("https://ruebot.net/files/feniz.mp3" == extracted(0)(0))
assert("feniz.mp3" == extracted(0)(1))
assert("mp3" == extracted(0)(2))
assert("audio/mpeg" == extracted(0)(3))
assert("f7e7ec84b12c294e19af1ba41732c733" == extracted(0)(4))
assert("audio/mpeg" == extracted(0)(4))
assert("f7e7ec84b12c294e19af1ba41732c733" == extracted(0)(5))
}

after {
@@ -46,17 +46,20 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(arcPath, sc)
.extractImageDetailsDF()

val extracted = df.select("url", "mime_type", "width", "height", "md5")
val extracted = df.select("url", "mime_type_web_server", "mime_type_tika",
"width", "height", "md5")
.orderBy(desc("md5")).head(2).toList
assert(extracted.size == 2)
assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
assert("image/gif" == extracted(0)(1))
assert(21 == extracted(0)(2))
assert("image/gif" == extracted(0)(2))
assert(21 == extracted(0)(3))
assert(21 == extracted(0)(4))
assert("http://www.archive.org/images/LOCLogoSmall.jpg" == extracted(1)(0))
assert("image/jpeg" == extracted(1)(1))
assert(275 == extracted(1)(2))
assert(300 == extracted(1)(3))
assert("image/jpeg" == extracted(1)(2))
assert(275 == extracted(1)(3))
assert(300 == extracted(1)(4))
}

after {
@@ -46,19 +46,22 @@ class ExtractPDFDetailsTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(warcPath, sc)
.extractPDFDetailsDF()

val extracted = df.select("url", "filename", "extension", "mime_type", "md5")
val extracted = df.select("url", "filename", "extension",
"mime_type_web_server", "mime_type_tika", "md5")
.orderBy(desc("md5")).head(2).toList
assert(extracted.size == 2)
assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" == extracted(0)(0))
assert("cost-analysis.pdf" == extracted(0)(1))
assert("pdf" == extracted(0)(2))
assert("application/pdf" == extracted(0)(3))
assert("aaba59d2287afd40c996488a39bbc0dd" == extracted(0)(4))
assert("application/pdf" == extracted(0)(4))
assert("aaba59d2287afd40c996488a39bbc0dd" == extracted(0)(5))
assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf?sequence=3&isAllowed=y" == extracted(1)(0))
assert("JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf" == extracted(1)(1))
assert("pdf" == extracted(1)(2))
assert("application/pdf" == extracted(1)(3))
assert("322cd5239141408c42f7441f15eed9af" == extracted(1)(4))
assert("application/pdf" == extracted(1)(4))
assert("322cd5239141408c42f7441f15eed9af" == extracted(1)(5))
}

after {
@@ -46,14 +46,16 @@ class ExtractVideoDetailsTest extends FunSuite with BeforeAndAfter {
val df = RecordLoader.loadArchives(warcPath, sc)
.extractVideoDetailsDF()

val extracted = df.select("url", "filename", "extension", "mime_type", "md5")
val extracted = df.select("url", "filename", "extension",
"mime_type_web_server", "mime_type_tika", "md5")
.orderBy(desc("md5")).head(1).toList
assert(extracted.size == 1)
assert("https://ruebot.net/2018-11-12%2016.14.11.mp4" == extracted(0)(0))
assert("2018-11-12%2016.14.11.mp4" == extracted(0)(1))
assert("mp4" == extracted(0)(2))
assert("video/mp4" == extracted(0)(3))
assert("2cde7de3213a87269957033f6315fce2" == extracted(0)(4))
assert("video/mp4" == extracted(0)(4))
assert("2cde7de3213a87269957033f6315fce2" == extracted(0)(5))
}

after {

0 comments on commit 01d12b4

Please sign in to comment.
You can’t perform that action at this time.