Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Add language detection column to webpages. (#403)

- Addresses #402
  • Loading branch information
ruebot authored and ianmilligan1 committed Jan 12, 2020
1 parent 0ecc4f8 commit bc0d663fb42125af2ec9326c71a08e53f8a86457
Showing with 6 additions and 3 deletions.
  1. +6 −3 src/main/scala/io/archivesunleashed/package.scala
@@ -27,7 +27,7 @@ import io.archivesunleashed.df.{DetectLanguageDF, DetectMimeTypeTikaDF, ExtractD

import io.archivesunleashed.matchbox.{DetectLanguageRDD, DetectMimeTypeTika, ExtractDateRDD,
ExtractDomainRDD, ExtractImageDetails, ExtractImageLinksRDD,
- ExtractLinksRDD, GetExtensionMimeRDD, RemoveHTMLRDD}
+ ExtractLinksRDD, GetExtensionMimeRDD, RemoveHTMLRDD, RemoveHTTPHeaderRDD}
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent
import io.archivesunleashed.matchbox.ExtractDateRDD.DateComponent.DateComponent
import java.net.URI
@@ -340,13 +340,16 @@ package object archivesunleashed {
def webpages(): DataFrame = {
val records = rdd.keepValidPages()
.map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType,
- DetectMimeTypeTika(r.getBinaryBytes), r.getContentString))
+ DetectMimeTypeTika(r.getBinaryBytes),
+ DetectLanguageRDD(RemoveHTMLRDD(RemoveHTTPHeaderRDD(r.getContentString))),
+ r.getContentString))

val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
.add(StructField("url", StringType, true))
.add(StructField("mime_type_web_server", StringType, true))
.add(StructField("mime_type_tika", StringType, true))
+ .add(StructField("language", StringType, true))
.add(StructField("content", StringType, true))

val sqlContext = SparkSession.builder()
@@ -899,4 +902,4 @@ package object archivesunleashed {
rdd.filter(r => !lang.contains(DetectLanguageRDD(RemoveHTMLRDD(r.getContentString))))
}
}
}
}

0 comments on commit bc0d663

Please sign in to comment.
You can’t perform that action at this time.