Permalink
Browse files
ExtractBoilerpipeText to remove headers as well. #253 (#256)
* ExtractBoilerpipeText now removes headers.
- Loading branch information...
|
@@ -28,12 +28,17 @@ object ExtractBoilerpipeText { |
|
|
* @param input an HTML string possibly containing boilerplate text
|
|
|
* @return text with boilerplate removed, or an empty string if no text is extracted.
|
|
|
*/
|
|
|
|
|
|
def apply(input: String): String = {
  // Wrap the possibly-null input: a null string short-circuits to "" instead
  // of being handed to RemoveHttpHeader. Non-null input has its HTTP header
  // stripped first, then the remaining HTML goes through boilerplate removal.
  Option(input).fold("")(html => removeBoilerplate(RemoveHttpHeader(html)))
}
|
|
|
|
|
|
private def removeBoilerplate(input: String): String = {
|
|
|
val maybeInput = Option(DefaultExtractor.INSTANCE
|
|
|
.getText(input).replaceAll("[\\r\\n]+", " ").trim())
|
|
|
maybeInput match {
|
|
|
case Some(text) =>
|
|
|
DefaultExtractor.INSTANCE
|
|
|
.getText(input).replaceAll("[\\r\\n]+", " ").trim()
|
|
|
text
|
|
|
case None =>
|
|
|
""
|
|
|
}
|
|
|
|
@@ -25,6 +25,9 @@ import org.scalatest.junit.JUnitRunner |
|
|
|
|
|
@RunWith(classOf[JUnitRunner])
|
|
|
class ExtractBoilerPipeTextTest extends FunSuite {
|
|
|
val header = "HTTP/1.0 200 OK Content-Type: text/html;" +
|
|
|
"charset=UTF-8 Expires: Fri, 20 Jul 2018 19:09:28 GMT Date:" +
|
|
|
"Fri, 20 Jul 2018 19:09:28 GMT Cache-Control: private,;\r\n\r\n"
|
|
|
var text = """<p>Text with a boiler plate.<p>
|
|
|
<footer>Copyright 2017</footer>"""
|
|
|
var boiler = """Copyright 2017"""
|
|
@@ -36,4 +39,8 @@ class ExtractBoilerPipeTextTest extends FunSuite { |
|
|
// scalastyle:on null
|
|
|
assert(ExtractBoilerpipeText("All Rights Reserved.") == "")
|
|
|
}
|
|
|
|
|
|
// Feeds the extractor the HTTP header followed by the HTML fixture and
// expects exactly `boiler` to come back.
test("Removes Header information") {
  val extracted = ExtractBoilerpipeText(header + text)
  assert(extracted == boiler)
}
|
|
|
}
|
0 comments on commit
84a4c09