
Update keepValidPages to include a filter on 200 OK. (#360)

- Add status code filter to keepValidPages
- Add MimeTypeTika to valid pages DF
- Update tests since we filter more and better now 😄
- Resolves #359
ruebot authored and ianmilligan1 committed Sep 11, 2019
1 parent 7305ed7 commit 9b3e025ef989260e5e09a9ba62b04af0dd5c78f1
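
In practice, the new predicate means keepValidPages() only returns records that came back 200 OK. A minimal sketch of the effect, assuming a SparkContext named sc is in scope (as in the test suite) and using a placeholder archive path:

import io.archivesunleashed._

// Load a web archive; the path here is a placeholder.
val pages = RecordLoader.loadArchives("/path/to/example.arc.gz", sc)
  .keepValidPages()

// After this commit, every surviving record reports HTTP status "200".
assert(pages.map(_.getHttpStatus).distinct().collect().sameElements(Array("200")))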
@@ -95,17 +95,20 @@ package object archivesunleashed {
         || r.getMimeType == "application/xhtml+xml"
         || r.getUrl.toLowerCase.endsWith("htm")
         || r.getUrl.toLowerCase.endsWith("html"))
-        && !r.getUrl.toLowerCase.endsWith("robots.txt"))
+        && !r.getUrl.toLowerCase.endsWith("robots.txt")
+        && r.getHttpStatus == "200")
   }

   def extractValidPagesDF(): DataFrame = {
     val records = rdd.keepValidPages()
-      .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, r.getContentString))
+      .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType,
+        DetectMimeTypeTika(r.getBinaryBytes), r.getContentString))

     val schema = new StructType()
       .add(StructField("crawl_date", StringType, true))
       .add(StructField("url", StringType, true))
       .add(StructField("mime_type_web_server", StringType, true))
+      .add(StructField("mime_type_tika", StringType, true))
       .add(StructField("content", StringType, true))

     val sqlContext = SparkSession.builder()
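
Because extractValidPagesDF() now carries both the server-reported and Tika-detected MIME types, the two can be compared directly. A quick sketch (the archive path is again a placeholder):

import io.archivesunleashed._

val validPages = RecordLoader.loadArchives("/path/to/example.arc.gz", sc)
  .extractValidPagesDF()

// Surface rows where the web server's claimed type disagrees with Tika's detection.
validPages
  .select("url", "mime_type_web_server", "mime_type_tika")
  .filter(validPages("mime_type_web_server") =!= validPages("mime_type_tika"))
  .show(10, false)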
@@ -115,7 +118,8 @@ package object archivesunleashed {
   def extractHyperlinksDF(): DataFrame = {
     val records = rdd
       .keepValidPages()
-      .flatMap(r => ExtractLinks(r.getUrl, r.getContentString).map(t => (r.getCrawlDate, t._1, t._2, t._3)))
+      .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)
+        .map(t => (r.getCrawlDate, t._1, t._2, t._3)))
       .map(t => Row(t._1, t._2, t._3, t._4))

     val schema = new StructType()
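
extractHyperlinksDF() itself is only re-wrapped here for line-length style, but note that it too now inherits the 200-only filter through keepValidPages(). For example:

import io.archivesunleashed._

// Links are now extracted only from pages that returned 200 OK.
val links = RecordLoader.loadArchives("/path/to/example.arc.gz", sc)
  .extractHyperlinksDF()
links.show(5)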
@@ -71,7 +71,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
     assert (r2.sameElements(r)) }

   test ("keep http status codes") {
-    val expected = 129
+    val expected = 94
     val base = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val statusCodes: Set[String] = Set ("200", "404")
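
The remainder of this test is truncated in the diff view; its filtering step mirrors the discardHttpStatus call visible in a later test, something like:

// Sketch of the truncated filtering step (keepHttpStatus is the
// keep-side counterpart of the discardHttpStatus call shown further down).
val r2 = base.keepHttpStatus(statusCodes).count
assert (r2 == expected)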
@@ -98,7 +98,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
   }

   test ("check for domains") {
-    val expected = 132
+    val expected = 91
     val base2 = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val urls: Set[String] = Set("www.archive.org", "www.sloan.org")
@@ -185,7 +185,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
   }

   test ("discard urls") {
-    val expected = 135
+    val expected = 94
     val base = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val urls: Set[String] = Set (sloan)
@@ -194,7 +194,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
   }

   test ("discard url patterns") {
-    val expected = 134
+    val expected = 93
     val base = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val urls = Set (archive.r, sloan.r, "".r)
@@ -203,16 +203,15 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
   }

   test ("discard http status codes") {
-    val expected = 6
+    val expected = 46
     val base = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val statusCodes: Set[String] = Set ("200", "404")
     val r2 = base.discardHttpStatus(statusCodes).count
     assert (r2 == expected)
   }

   test ("discard domains") {
-    val expected = 135
+    val expected = 94
     val base = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val urls: Set[String] = Set ("www.sloan.org")
@@ -221,7 +220,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
   }

   test ("discard content") {
-    val expected = 134
+    val expected = 93
     val base = RecordLoader.loadArchives(arcPath, sc)
       .keepValidPages()
     val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r)
@@ -49,20 +49,20 @@ class DomainFrequencyExtractorTest extends FunSuite with BeforeAndAfter {
     // +------------------+-----+
     // |            Domain|count|
     // +------------------+-----+
-    // |   www.archive.org|  132|
+    // |   www.archive.org|   91|
     // |     deadlists.com|    2|
     // |www.hideout.com.br|    1|
     // +------------------+-----+

     assert(dfResults(0).get(0) == "www.archive.org")
-    assert(dfResults(0).get(1) == 132)
+    assert(dfResults(0).get(1) == 91)
     assert(dfResults(1).get(0) == "deadlists.com")
     assert(dfResults(1).get(1) == 2)
     assert(dfResults(2).get(0) == "www.hideout.com.br")
     assert(dfResults(2).get(1) == 1)

     assert(rddResults(0)._1 == "www.archive.org")
-    assert(rddResults(0)._2 == 132)
+    assert(rddResults(0)._2 == 91)
     assert(rddResults(1)._1 == "deadlists.com")
     assert(rddResults(1)._2 == 2)
     assert(rddResults(2)._1 == "www.hideout.com.br")
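
The constructions of dfResults and rddResults are truncated in this diff view; following the test's naming and the extractor usage visible in the DomainGraphExtractor test below, they are built roughly like this:

// Sketch, following the assertions above: the extractor runs over both
// the DataFrame and RDD forms of the (now 200-only) valid pages.
val df = RecordLoader.loadArchives(arcPath, sc).extractValidPagesDF()
val dfResults = DomainFrequencyExtractor(df).collect()

val rdd = RecordLoader.loadArchives(arcPath, sc).keepValidPages()
val rddResults = DomainFrequencyExtractor(rdd).collect()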
@@ -40,7 +40,7 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {

   test("DomainGraphExtractor") {
     val TESTLENGTH = 166
-    val TESTRESULT = 316
+    val TESTRESULT = 280
     val df = RecordLoader.loadArchives(arcPath, sc).extractHyperlinksDF()
     val dfResult = DomainGraphExtractor(df).collect()
     assert(dfResult.length == TESTLENGTH)
@@ -47,7 +47,7 @@ class DomainGraphExtractorTest extends FunSuite with BeforeAndAfter {
     assert(rddResult(0)._1._1 == "20080430")
     assert(rddResult(0)._1._2 == "www.archive.org")
     assert(rddResult(0)._1._3 == "www.archive.org")
-    assert(rddResult(0)._2 == 305)
+    assert(rddResult(0)._2 == 269)
   }

   after {
@@ -43,7 +43,7 @@ class PlainTextExtractorTest extends FunSuite with BeforeAndAfter {
     val df = RecordLoader.loadArchives(arcPath, sc).extractValidPagesDF()
     val rddResults = PlainTextExtractor(rdd).collect()
     val dfResults = PlainTextExtractor(df).collect()
-    val RESULTSLENGTH = 135
+    val RESULTSLENGTH = 94

     assert(rddResults.length == RESULTSLENGTH)
     assert(rddResults(0)._1 == "20080430")
@@ -58,13 +58,13 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
     // +------------------+-----+
     // |            Domain|count|
     // +------------------+-----+
-    // |   www.archive.org|  132|
+    // |   www.archive.org|   91|
     // |     deadlists.com|    2|
     // |www.hideout.com.br|    1|
     // +------------------+-----+

     assert(results(0).get(0) == "www.archive.org")
-    assert(results(0).get(1) == 132)
+    assert(results(0).get(1) == 91)

     assert(results(1).get(0) == "deadlists.com")
     assert(results(1).get(1) == 2)
