Skip to content
Permalink
Browse files

Replace computeHash with ComputeMD5; resolves #333. (#338)

* Replace computeHash with ComputeMD5; resolves #333.

* I suppose these are redundant.
  • Loading branch information...
ruebot authored and jrwiebe committed Aug 7, 2019
1 parent 1818596 commit 9623c7a11cf88a37f749d9c2290cf0ed5fc8a36b
@@ -62,18 +62,18 @@ object WriteGEXF {
val endAttribute = "\" />\n"
val nodeStart = "<node id=\""
val labelStart = "\" label=\""
val edges = rdd.map(r => "<edge source=\"" + r._1._2.computeHash() + "\" target=\"" +
r._1._3.computeHash() + "\" weight=\"" + r._2 +
val edges = rdd.map(r => "<edge source=\"" + ComputeMD5(r._1._2.getBytes) + "\" target=\"" +
ComputeMD5(r._1._3.getBytes) + "\" weight=\"" + r._2 +
"\" type=\"directed\">\n" +
"<attvalues>\n" +
"<attvalue for=\"0\" value=\"" + r._1._1 + endAttribute +
"</attvalues>\n" +
"</edge>\n").collect
val nodes = rdd.flatMap(r => List(nodeStart +
r._1._2.computeHash() + labelStart +
ComputeMD5(r._1._2.getBytes) + labelStart +
r._1._2.escapeInvalidXML() + endAttribute,
nodeStart +
r._1._3.computeHash() + labelStart +
ComputeMD5(r._1._3.getBytes) + labelStart +
r._1._3.escapeInvalidXML() + endAttribute)).distinct.collect
outFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
"<gexf xmlns=\"http://www.gexf.net/1.3draft\"\n" +
@@ -122,13 +122,13 @@ object WriteGEXF {
"<nodes>\n")
vertices foreach { v =>
outFile.write("<node id=\"" +
v.computeHash() + "\" label=\"" +
ComputeMD5(v.getBytes) + "\" label=\"" +
v.escapeInvalidXML() + endAttribute)
}
outFile.write("</nodes>\n<edges>\n")
data foreach { e =>
outFile.write("<edge source=\"" + e.get(1).asInstanceOf[String].computeHash() + "\" target=\"" +
e.get(2).asInstanceOf[String].computeHash() + "\" weight=\"" + e.get(3) +
outFile.write("<edge source=\"" + ComputeMD5(e.get(1).asInstanceOf[String].getBytes) + "\" target=\"" +
ComputeMD5(e.get(2).asInstanceOf[String].getBytes) + "\" weight=\"" + e.get(3) +
"\" type=\"directed\">\n" +
"<attvalues>\n" +
"<attvalue for=\"0\" value=\"" + e.get(0) + endAttribute +
@@ -176,13 +176,13 @@ object WriteGraph {
"<nodes>\n")
vertices.foreach { v =>
outFile.write(nodeStart +
v.computeHash() + "\" label=\"" +
ComputeMD5(v.getBytes) + "\" label=\"" +
v.escapeInvalidXML() + endAttribute)
}
outFile.write("</nodes>\n<edges>\n")
data.foreach { e =>
outFile.write(edgeStart + e.get(1).asInstanceOf[String].computeHash() + targetChunk +
e.get(2).asInstanceOf[String].computeHash() + "\" weight=\"" + e.get(3) +
outFile.write(edgeStart + ComputeMD5(e.get(1).asInstanceOf[String].getBytes) + targetChunk +
ComputeMD5(e.get(2).asInstanceOf[String].getBytes) + "\" weight=\"" + e.get(3) +
"\" type=\"directed\">\n" +
"<attvalues>\n" +
"<attvalue for=\"0\" value=\"" + e.get(0) + endAttribute +
@@ -48,14 +48,14 @@ object WriteGraphML {
*/
def makeFile (rdd: RDD[((String, String, String), Int)], graphmlPath: String): Boolean = {
val outFile = Files.newBufferedWriter(Paths.get(graphmlPath), StandardCharsets.UTF_8)
val edges = rdd.map(r => "<edge source=\"" + r._1._2.computeHash() + "\" target=\"" +
r._1._3.computeHash() + "\" type=\"directed\">\n" +
val edges = rdd.map(r => "<edge source=\"" + ComputeMD5(r._1._2.getBytes) + "\" target=\"" +
ComputeMD5(r._1._3.getBytes) + "\" type=\"directed\">\n" +
"<data key=\"weight\">" + r._2 + "</data>\n" +
"<data key=\"crawlDate\">" + r._1._1 + "</data>\n" +
"</edge>\n").collect
val nodes = rdd.flatMap(r => List("<node id=\"" + r._1._2.computeHash() + "\">\n" +
val nodes = rdd.flatMap(r => List("<node id=\"" + ComputeMD5(r._1._2.getBytes) + "\">\n" +
"<data key=\"label\">" + r._1._2.escapeInvalidXML() + "</data>\n</node>\n",
"<node id=\"" + r._1._3.computeHash() + "\">\n" +
"<node id=\"" + ComputeMD5(r._1._3.getBytes) + "\">\n" +
"<data key=\"label\">" + r._1._3.escapeInvalidXML() + "</data>\n</node>\n")).distinct.collect
outFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
"<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n" +
@@ -66,7 +66,7 @@ package object df {
val image = reader.read(0)

val format = reader.getFormatName()
val suffix = encodedBytes.computeHash()
val suffix = ComputeMD5(bytes)
val file = new File(fileName + "-" + suffix + "." + format);
if (image != null) {
ImageIO.write(image, format, file);
@@ -23,13 +23,11 @@ import java.security.MessageDigest
object ComputeMD5 {
// scalastyle:on object.name
/** Computes the MD5 checksum of a byte array (eg. an image).
*
* For string data, it is better to use `StringUtils.computeHash()`.
*
* @param bytes
* @return MD5 checksum.
*/
def apply(bytes: Array[Byte]): String = {
new String(MessageDigest.getInstance("MD5").digest(bytes))
MessageDigest.getInstance("MD5").digest(bytes).map("%02x".format(_)).mkString
}
}
@@ -43,10 +43,5 @@ package object matchbox {
case e: Exception => throw new IOException("Caught exception processing input row ", e)
}
}

def computeHash(): String = {
val md5 = MessageDigest.getInstance("MD5")
md5.digest(s.getBytes).map("%02x".format(_)).mkString
}
}
}
@@ -45,9 +45,9 @@ class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter {
val examplerdd = RecordLoader.loadArchives(arcPath, sc)
val imagesLowLimit = ExtractPopularImages(examplerdd, 3, sc)
val imagesHighLimit = ExtractPopularImages(examplerdd, highTest, sc)
val response = Array("1\thttp://creativecommons.org/images/public/somerights20.gif",
"1\thttp://www.archive.org/images/blendbar.jpg",
"1\thttp://www.archive.org/images/main-header.jpg")
val response = Array("1\thttp://www.archive.org/images/books-small.jpg",
"1\thttp://i.creativecommons.org/l/by-sa/3.0/88x31.png",
"1\thttp://www.archive.org/images/blendbar.jpg")
assert (imagesLowLimit.take(3).deep == response.deep)
assert (imagesHighLimit.take(3).deep == response.deep)
}
@@ -61,8 +61,9 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
extracted.saveToDisk(testString, "/tmp/foo")

val encodedBytes: String = extracted.take(1)(0).getAs(testString)
val bytes = Base64.getDecoder.decode(encodedBytes);

val suffix = encodedBytes.computeHash()
val suffix = ComputeMD5(bytes)
val fileName = "/tmp/foo-" + suffix + ".png"
assert(Files.exists(Paths.get(fileName)))

@@ -85,7 +86,7 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
test("Attempt to save invalid image") {
val dummyEncBytes = Base64.getEncoder.encodeToString(Array.range(0, 127)
.map(_.toByte))
val dummyMD5 = dummyEncBytes.computeHash()
val dummyMD5 = ComputeMD5(dummyEncBytes.getBytes)
val dummyImg = TestImageDetails("http://example.com/fake.jpg", "image/jpeg",
"600", "800", dummyMD5, dummyEncBytes)

@@ -96,11 +97,11 @@ class SaveImageTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val df = Seq(dummyImg).toDF

df.saveToDisk("bytes", "/tmp/foo")
df.saveToDisk("bytes", "/tmp/bar")

// Check that no file was written.
assert(new File("/tmp").listFiles.filter(_.isFile).toList
.count(_.getName.startsWith("foo-" + dummyMD5)) == 0)
.count(_.getName.startsWith("bar-" + dummyMD5)) == 0)
}

after {
@@ -46,6 +46,6 @@ class StringUtilsTest extends FunSuite {

test ("md5 hash") {
val s: String = "unesco.org";
assert(s.computeHash() == "8e8decc8e8107bcf9d3896f3222b77d8");
assert(ComputeMD5(s.getBytes) == "8e8decc8e8107bcf9d3896f3222b77d8");
}
}

0 comments on commit 9623c7a

Please sign in to comment.
You can’t perform that action at this time.