Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Add ComputeSHA1 method; resolves #363. (#364)

- Update tests where needed
- Add SHA1 method to ExtractImageDetails
- Add SHA1 to DataFrames binary extraction and analysis
  • Loading branch information...
ruebot authored and ianmilligan1 committed Oct 9, 2019
1 parent 9b3e025 commit 03ac99c87953536e218c680131f9b3c1dcbe4f1a
@@ -0,0 +1,32 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.archivesunleashed.matchbox

import java.security.MessageDigest

/** Compute SHA1 checksum. */
// scalastyle:off object.name
object ComputeSHA1 {
// scalastyle:on object.name
  /** Computes the SHA1 checksum of a byte array (eg. an image).
    *
    * @param bytes the raw bytes to hash
    * @return SHA1 checksum, rendered as a lowercase hexadecimal string
    *         (two hex digits per digest byte).
    */
  def apply(bytes: Array[Byte]): String = {
    // "SHA-1" is the standard JCA algorithm name that every Java platform
    // must support; "SHA1" is only a provider-registered alias for it.
    MessageDigest.getInstance("SHA-1").digest(bytes).map("%02x".format(_)).mkString
  }
}
@@ -26,7 +26,8 @@ class ImageDetails(imageUrl: String, imageType: String, bytes: Array[Byte]) {
val height = dimensions._2
val url: String = imageUrl
val mimeType: String = imageType
val hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val body: String = Base64.getEncoder.encodeToString(bytes)
}

@@ -162,9 +162,9 @@ package object archivesunleashed {
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), mimeTypeTika)
(r.getUrl, filename, extension, r.getMimeType, mimeTypeTika,
image.width, image.height, image.hash, image.body)
image.width, image.height, image.md5Hash, image.sha1Hash, image.body)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -175,6 +175,7 @@ package object archivesunleashed {
.add(StructField("width", IntegerType, true))
.add(StructField("height", IntegerType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -190,15 +191,16 @@ package object archivesunleashed {
.filter(r => r._2 == "application/pdf")
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -207,6 +209,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -222,15 +225,16 @@ package object archivesunleashed {
.filter(r => r._2.startsWith("audio/"))
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -239,6 +243,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -254,15 +259,16 @@ package object archivesunleashed {
.filter(r => r._2.startsWith("video/"))
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -271,6 +277,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -311,7 +318,8 @@ package object archivesunleashed {
&& r._2 == "text/plain"))
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
@@ -325,9 +333,9 @@ package object archivesunleashed {
}
val extension = GetExtensionMime(url.getPath(), mimeType)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -336,6 +344,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -363,15 +372,16 @@ package object archivesunleashed {
|| r._2 == "application/vnd.ms-powerpoint.template.macroEnabled.12")
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -380,6 +390,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -412,15 +423,16 @@ package object archivesunleashed {
|| r._2 == "application/rtf")
.map(r => {
val bytes = r._1.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMime(url.getPath(), r._2)
(r._1.getUrl, filename, extension, r._1.getMimeType,
DetectMimeTypeTika(r._1.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -429,6 +441,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -447,15 +460,16 @@ package object archivesunleashed {
|| !r.getUrl.toLowerCase.endsWith(".html"))
.map(r => {
val bytes = r.getBinaryBytes
val hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = FilenameUtils.getExtension(url.getPath())
(r.getUrl, filename, extension, r.getMimeType,
DetectMimeTypeTika(r.getBinaryBytes), hash, encodedBytes)
DetectMimeTypeTika(r.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7))
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8))

val schema = new StructType()
.add(StructField("url", StringType, true))
@@ -464,6 +478,7 @@ package object archivesunleashed {
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
.add(StructField("md5", StringType, true))
.add(StructField("sha1", StringType, true))
.add(StructField("bytes", StringType, true))

val sqlContext = SparkSession.builder();
@@ -46,7 +46,7 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
.extractImageDetailsDF()

val extracted = df.select("url", "mime_type_web_server", "mime_type_tika",
"width", "height", "md5")
"width", "height", "md5", "sha1")
.orderBy(desc("md5")).head(2).toList
assert(extracted.size == 2)
assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
@@ -47,4 +47,9 @@ class StringUtilsTest extends FunSuite {
val s: String = "unesco.org";
assert(ComputeMD5(s.getBytes) == "8e8decc8e8107bcf9d3896f3222b77d8");
}
test ("sha1 hash") {
  // Pins ComputeSHA1 to a known digest of the ASCII string "unesco.org".
  val s: String = "unesco.org"
  assert(ComputeSHA1(s.getBytes) == "2d0e5377157172045d87befe46e157cda42c4f6e")
}

}

0 comments on commit 03ac99c

Please sign in to comment.
You can’t perform that action at this time.