Skip to content
Permalink
Browse files

Add img alt text to imagegraph(); resolves #420. (#422)

- Update ExtractImageLinksRDD to grab alt text
- Add alt_text column to imagegraph
- Update tests
  • Loading branch information
ruebot committed Feb 10, 2020
1 parent 87c9734 commit 8f1a9f10e0f6358c204592480685000eae491d04
@@ -27,20 +27,22 @@ object ExtractImageLinksRDD {
*
* @param src the src link
* @param html the content from which links are to be extracted
* @return a sequence of image links.
* @return a sequence of (source, image link, alt-text).
*/
def apply(src: String, html: String): Seq[String] = {
def apply(src: String, html: String): Seq[(String, String, String)] = {
if (html.isEmpty) Nil
try {
val output = mutable.MutableList[String]()
val output = mutable.MutableList[(String, String, String)]()
val doc = Jsoup.parse(html)
val links: Elements = doc.select("img[src]")
val alt: Elements = doc.select("img[alt]")
val it = links.iterator()
while (it.hasNext) {
val link = it.next()
link.setBaseUri(src)
val target = link.attr("abs:src")
output += (target)
val altText = link.attr("alt")
output += ((src, target, altText))
}
output
} catch {
@@ -381,18 +381,16 @@ package object archivesunleashed {
def imagegraph(): DataFrame = {
val records = rdd
.keepValidPages()
.flatMap(r => ({
val src = r.getUrl
val imageUrls = ExtractImageLinksRDD(src, r.getContentString)
imageUrls.map(url => (src, url))
})
.map(t => (r.getCrawlDate, t._1, t._2)))
.map(t => Row(t._1, t._2, t._3))
.flatMap(r => ExtractImageLinksRDD(r.getUrl, r.getContentString)
.map(t => (r.getCrawlDate, t._1, t._2, t._3)))
.filter(t => t._2 != "" && t._3 != "")
.map(t => Row(t._1, t._2, t._3, t._4))

val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
.add(StructField("src", StringType, true))
.add(StructField("image_url", StringType, true))
.add(StructField("alt_text", StringType, true))

val sqlContext = SparkSession.builder();
sqlContext.getOrCreate().createDataFrame(records, schema)
@@ -25,26 +25,31 @@ import org.scalatest.junit.JUnitRunner
@RunWith(classOf[JUnitRunner])
class ExtractImageLinksRDDTest extends FunSuite {
test("Extract simple image links RDD") {
val fragment =
"""Image here: <img src="http://foo.bar.com/pic.png"> and another <img src="http://baz.org/a/b/banner.jpg"/>"""
val extracted = ExtractImageLinksRDD("", fragment).toList
val fragment: String =
"""Image here: <img src="http://foo.bar.com/pic.png" alt="picture"> and another <img src="http://baz.org/a/b/banner.jpg" alt="baz banner"/>"""
val extracted: Seq[(String, String, String)] = ExtractImageLinksRDD("", fragment)
assert(extracted.size == 2)
assert("http://foo.bar.com/pic.png" == extracted(0))
assert("http://baz.org/a/b/banner.jpg" == extracted(1))
assert("http://foo.bar.com/pic.png" == extracted(0)._2)
assert("picture" == extracted(0)._3)
assert("http://baz.org/a/b/banner.jpg" == extracted(1)._2)
assert("baz banner" == extracted(1)._3)
}

test("Extract relative image links RDD") {
val fragment =
"""Image here: <img src="pic.png"> and another <img src="http://baz.org/a/b/banner.jpg"/> and <img src="../logo.gif"/>"""
val extracted = ExtractImageLinksRDD("http://foo.bar.com/a/page.html", fragment)
val fragment: String =
"""Image here: <img src="pic.png" alt="picture"> and another <img src="http://baz.org/a/b/banner.jpg" alt="baz banner" /> and <img src="../logo.gif" alt="LOGO" />"""
val extracted: Seq[(String, String, String)] = ExtractImageLinksRDD("http://foo.bar.com/a/page.html", fragment)
assert(extracted.size == 3)
assert("http://foo.bar.com/a/pic.png" == extracted(0))
assert("http://baz.org/a/b/banner.jpg" == extracted(1))
assert("http://foo.bar.com/logo.gif" == extracted(2))
assert("http://foo.bar.com/a/pic.png" == extracted(0)._2)
assert("picture" == extracted(0)._3)
assert("http://baz.org/a/b/banner.jpg" == extracted(1)._2)
assert("baz banner" == extracted(1)._3)
assert("http://foo.bar.com/logo.gif" == extracted(2)._2)
assert("LOGO" == extracted(2)._3)
}

test("Test image link errors RDD") {
val fragment =
val fragment: String =
"""Image here: <img src="pic.png"> and another <img src="http://baz.org/a/b/banner.jpg"/> and <img src="../logo.gif"/>"""
assert(ExtractImageLinksRDD("", "") == Nil)
// TODO: need a way of triggering an exception here so the error-handling path is actually tested

0 comments on commit 8f1a9f1

Please sign in to comment.
You can’t perform that action at this time.