Permalink
Please
sign in to comment.
Browse files
Add audio & video binary extraction (#341)
- Add Audio & Video binary extraction. - Add filename and extension columns to audio, pdf, and video DF - Pass binary bytes instead of string to DetectMimeTypeTika in DF (s/getContentString/getBinaryBytes) - Updates saveToDisk to use file extension from DF column - Adds tests for Audio, PDF, and Video DF extraction - Add test fixtures for Audio, PDF, and Video DF extraction - Rename SaveBytesTest to SaveImageBytesTest - Eliminate bytes->string->bytes conversion that was causing data loss in DetectMimeTypeTika - Update tika-parsers dep from JitPack - Remove tweet cruft - Resolves #306 - Resolves #307 - Includes work by @jrwiebe, see #341 for all commits before squash
- Loading branch information...
Showing
with
402 additions
and 23 deletions.
- +1 −1 pom.xml
- +8 −7 src/main/scala/io/archivesunleashed/df/package.scala
- +4 −6 src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala
- +88 −3 src/main/scala/io/archivesunleashed/package.scala
- +0 −2 src/test/resources/arc/tweetsTest.json
- BIN src/test/resources/warc/example.media.warc.gz
- BIN src/test/resources/warc/example.pdf.warc.gz
- +1 −1 src/test/scala/io/archivesunleashed/ArcTest.scala
- +64 −0 src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala
- +1 −1 src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
- +69 −0 src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
- +64 −0 src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
- +2 −2 src/test/scala/io/archivesunleashed/df/{SaveBytesTest.scala → SaveImageBytesTest.scala}
- +100 −0 src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,64 @@ | |||
/* | |||
* Archives Unleashed Toolkit (AUT): | |||
* An open-source toolkit for analyzing web archives. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
|
|||
package io.archivesunleashed | |||
|
|||
import com.google.common.io.Resources | |||
import org.apache.spark.sql.SparkSession | |||
// scalastyle:off underscore.import | |||
import io.archivesunleashed.df._ | |||
import org.apache.spark.sql.functions._ | |||
// scalastyle:on underscore.import | |||
import org.apache.spark.{SparkConf, SparkContext} | |||
import org.junit.runner.RunWith | |||
import org.scalatest.junit.JUnitRunner | |||
import org.scalatest.{BeforeAndAfter, FunSuite} | |||
|
|||
/** Checks the DataFrame returned by `extractAudioDetailsDF()` for the
  * `warc/example.media.warc.gz` test fixture.
  *
  * A fresh local SparkContext is created before each test and stopped
  * afterwards.
  */
@RunWith(classOf[JUnitRunner])
class ExtractAudioDetailsTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  // Build the context for every test so each one starts from a clean state.
  before {
    sc = new SparkContext(new SparkConf().setMaster(master).setAppName(appName))
  }

  test("Audio DF extraction") {
    val audio = RecordLoader.loadArchives(warcPath, sc).extractAudioDetailsDF()

    // Sort by md5 (descending) so the row under inspection is deterministic.
    val rows = audio
      .select("url", "filename", "extension", "mime_type", "md5")
      .orderBy(desc("md5"))
      .head(1)
      .toList
    assert(rows.size == 1)

    // Columns are addressed by position, in the order given to select(...).
    val top = rows(0)
    assert("https://ruebot.net/files/feniz.mp3" == top(0))
    assert("feniz.mp3" == top(1))
    assert("mp3" == top(2))
    assert("audio/mpeg" == top(3))
    assert("f7e7ec84b12c294e19af1ba41732c733" == top(4))
  }

  // sc stays null if context creation failed; Option(null) is None, so stop() is skipped.
  after {
    Option(sc).foreach(_.stop())
  }
}
@@ -0,0 +1,69 @@ | |||
/* | |||
* Archives Unleashed Toolkit (AUT): | |||
* An open-source toolkit for analyzing web archives. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
*/ | |||
|
|||
package io.archivesunleashed | |||
|
|||
import com.google.common.io.Resources | |||
import org.apache.spark.sql.SparkSession | |||
// scalastyle:off underscore.import | |||
import io.archivesunleashed.df._ | |||
import org.apache.spark.sql.functions._ | |||
// scalastyle:on underscore.import | |||
import org.apache.spark.{SparkConf, SparkContext} | |||
import org.junit.runner.RunWith | |||
import org.scalatest.junit.JUnitRunner | |||
import org.scalatest.{BeforeAndAfter, FunSuite} | |||
|
|||
/** Checks the DataFrame returned by `extractPDFDetailsDF()` for the
  * `warc/example.pdf.warc.gz` test fixture.
  *
  * A fresh local SparkContext is created before each test and stopped
  * afterwards.
  */
@RunWith(classOf[JUnitRunner])
class ExtractPDFDetailsTest extends FunSuite with BeforeAndAfter {
  private val warcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
  private val master = "local[4]"
  private val appName = "example-df"
  private var sc: SparkContext = _

  // Build the context for every test so each one starts from a clean state.
  before {
    sc = new SparkContext(new SparkConf().setMaster(master).setAppName(appName))
  }

  test("PDF DF extraction") {
    val pdfs = RecordLoader.loadArchives(warcPath, sc).extractPDFDetailsDF()

    // Sort by md5 (descending) so the two rows under inspection are deterministic.
    val rows = pdfs
      .select("url", "filename", "extension", "mime_type", "md5")
      .orderBy(desc("md5"))
      .head(2)
      .toList
    assert(rows.size == 2)

    // Columns are addressed by position, in the order given to select(...).
    val first = rows(0)
    assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" == first(0))
    assert("cost-analysis.pdf" == first(1))
    assert("pdf" == first(2))
    assert("application/pdf" == first(3))
    assert("aaba59d2287afd40c996488a39bbc0dd" == first(4))

    val second = rows(1)
    assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf?sequence=3&isAllowed=y" == second(0))
    assert("JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf" == second(1))
    assert("pdf" == second(2))
    assert("application/pdf" == second(3))
    assert("322cd5239141408c42f7441f15eed9af" == second(4))
  }

  // sc stays null if context creation failed; Option(null) is None, so stop() is skipped.
  after {
    Option(sc).foreach(_.stop())
  }
}
Oops, something went wrong.
0 comments on commit
54c0c3e