Skip to content
Permalink
Browse files

Add discardLanguage filter to RecordLoader. (#353)

- Clean up doc comments
- Add test
- Resolves #352
  • Loading branch information...
ruebot authored and ianmilligan1 committed Aug 23, 2019
1 parent bced854 commit 0284d33bff8f251775d07ecc24908ebcd8b8e88d
@@ -472,7 +472,7 @@ package object archivesunleashed {
&& DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/"))
}

/** Removes all data but selected mimeTypes specified in ArchiveRecord.
/** Removes all data but selected mimeTypes specified.
*
* @param mimeTypes a list of Mime Types
*/
@@ -488,15 +488,15 @@ package object archivesunleashed {
rdd.filter(r => mimeTypes.contains(DetectMimeTypeTika(r.getBinaryBytes)))
}

/** Removes all data that does not have selected status codes.
/** Removes all data that does not have selected HTTP status codes.
*
* @param statusCodes a list of HTTP status codes
*/
def keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
  // Retain only the records whose HTTP status is a member of statusCodes.
  rdd.filter { record =>
    val status = record.getHttpStatus
    statusCodes.contains(status)
  }
}

/** Removes all data that does not have selected data.
/** Removes all data that does not have selected date.
*
* @param dates a list of dates
* @param component the selected DateComponent enum value
@@ -513,7 +513,7 @@ package object archivesunleashed {
rdd.filter(r => urls.contains(r.getUrl))
}

/** Removes all data but selected url patterns.
/** Removes all data but selected URL patterns.
*
* @param urlREs a list of regular expressions
*/
@@ -555,47 +555,47 @@ package object archivesunleashed {
}).exists(identity))
}

/** Filters ArchiveRecord MimeTypes from RDDs.
/** Filters ArchiveRecord MimeTypes (web server).
*
* @param mimeTypes a list of Mime Types
*/
def discardMimeTypes(mimeTypes: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose getMimeType value is a member of mimeTypes.
  rdd.filter { record =>
    val recordMimeType = record.getMimeType
    !mimeTypes.contains(recordMimeType)
  }
}

/** Filters detected MimeTypes from RDDs.
/** Filters detected MimeTypes (Tika).
*
* @param mimeTypes a list of Mime Types
*/
def discardMimeTypesTika(mimeTypes: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose MIME type, as reported by DetectMimeTypeTika on
  // the record's binary bytes, is a member of mimeTypes.
  rdd.filter { record =>
    val detectedMimeType = DetectMimeTypeTika(record.getBinaryBytes)
    !mimeTypes.contains(detectedMimeType)
  }
}

/** Filters detected dates from RDDs.
/** Filters detected dates.
*
* @param date a single crawl date; records whose crawl date equals it are removed
*/
def discardDate(date: String): RDD[ArchiveRecord] = {
  // Keep a record only when its crawl date differs from the given date.
  rdd.filter { record =>
    val crawlDate = record.getCrawlDate
    !(crawlDate == date)
  }
}

/** Filters detected urls from RDDs.
/** Filters detected URLs.
*
* @param urls a list of urls
*/
def discardUrls(urls: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose URL is a member of urls.
  rdd.filter { record =>
    val recordUrl = record.getUrl
    !urls.contains(recordUrl)
  }
}

/** Filters detected status codes from RDDs.
/** Filters detected HTTP status codes.
*
* @param statusCodes a list of HTTP status codes
*/
def discardHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose HTTP status is a member of statusCodes.
  rdd.filter { record =>
    val status = record.getHttpStatus
    !statusCodes.contains(status)
  }
}

/** Filters detected url patterns from RDDs.
/** Filters detected URL patterns (regex).
*
* @param urlREs a list of Regular expressions
*/
@@ -608,15 +608,15 @@ package object archivesunleashed {
}).exists(identity))
}

/** Filters detected domains (regex) from RDDs.
/** Filters detected domains (exact match).
*
* @param urls a set of source domains to discard; each is compared exactly against the record's domain
*/
def discardDomains(urls: Set[String]): RDD[ArchiveRecord] = {
  // Drop every record whose domain is an exact member of urls
  // (plain set membership, no pattern matching).
  rdd.filter { record =>
    val domain = record.getDomain
    !urls.contains(domain)
  }
}

/** Filters detected content (regex) from RDDs.
/** Filters detected content (regex).
*
* @param contentREs a list of regular expressions
*/
@@ -628,5 +628,13 @@ package object archivesunleashed {
case None => false
}).exists(identity))
}

/** Filters detected language.
*
* @param lang a set of language codes as produced by DetectLanguage (two-letter ISO 639-1 style, e.g. "fr")
*/
def discardLanguages(lang: Set[String]): RDD[ArchiveRecord] = {
  // For each record: strip the HTML from its content string, run language
  // detection on the remaining text, and drop the record when the detected
  // language is a member of lang.
  rdd.filter { record =>
    val plainText = RemoveHTML(record.getContentString)
    val detectedLanguage = DetectLanguage(plainText)
    !lang.contains(detectedLanguage)
  }
}
}
}
@@ -117,6 +117,16 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
assert (r2.sameElements(r))
}

test ("discard languages") {
  // After discarding records detected as "fr", the first two URLs among the
  // valid pages must match the expected pair below.
  val expectedUrls = Array("http://www.archive.org/", "http://www.archive.org/index.php")
  val excludedLanguages: Set[String] = Set("fr")
  val validPages = RecordLoader.loadArchives(arcPath, sc)
    .keepValidPages()
  val actualUrls = validPages
    .discardLanguages(excludedLanguages)
    .map(record => record.getUrl)
    .take(2)
  assert (actualUrls.sameElements(expectedUrls))
}

test ("keep mime tika") {
val base = RecordLoader.loadArchives(arcPath, sc)
val mime = Set ("text/plain", "image/jpeg")

0 comments on commit 0284d33

Please sign in to comment.
You can’t perform that action at this time.