Permalink
Browse files

ExtractBoilerpipeText to remove headers as well. #253 (#256)

* ExtractBoilerpipeText now removes headers.
  • Loading branch information...
greebie authored and ruebot committed Aug 11, 2018
1 parent e4cf9a7 commit 84a4c094df4595ac88cfd50b5aaf2bb1f1d9cacc
@@ -28,12 +28,17 @@ object ExtractBoilerpipeText {
* @param input an HTML string, optionally preceded by HTTP headers, possibly containing boilerplate text
* @return text with boilerplate removed, or an empty string if no text remains.
*/
def apply(input: String): String = {
  // Strip the HTTP header first so that only the document body is handed to
  // the boilerplate extractor. (The previously declared `maybeInput` local was
  // never read and has been removed.)
  removeBoilerplate(RemoveHttpHeader(input))
}
private def removeBoilerplate(input: String): String = {
val maybeInput = Option(DefaultExtractor.INSTANCE
.getText(input).replaceAll("[\\r\\n]+", " ").trim())
maybeInput match {
case Some(text) =>
DefaultExtractor.INSTANCE
.getText(input).replaceAll("[\\r\\n]+", " ").trim()
text
case None =>
""
}
@@ -25,6 +25,9 @@ import org.scalatest.junit.JUnitRunner
@RunWith(classOf[JUnitRunner])
class ExtractBoilerPipeTextTest extends FunSuite {
val header = "HTTP/1.0 200 OK Content-Type: text/html;" +
"charset=UTF-8 Expires: Fri, 20 Jul 2018 19:09:28 GMT Date:" +
"Fri, 20 Jul 2018 19:09:28 GMT Cache-Control: private,;\r\n\r\n"
var text = """<p>Text with a boiler plate.<p>
<footer>Copyright 2017</footer>"""
var boiler = """Copyright 2017"""
@@ -36,4 +39,8 @@ class ExtractBoilerPipeTextTest extends FunSuite {
// scalastyle:on null
assert(ExtractBoilerpipeText("All Rights Reserved.") == "")
}
test("Removes Header information") {
assert(ExtractBoilerpipeText(header + text) == boiler)
}
}

0 comments on commit 84a4c09

Please sign in to comment.