Skip to content
Please note that GitHub no longer supports your web browser.

We recommend upgrading to the latest Google Chrome or Firefox.

Learn more
Permalink
Browse files

Align NER output to WANE format; addresses #297 (#361)

- Update Stanford core NLP
- Format NER output in json
- Add getPayloadDigest to ArchiveRecord
- Add test for getPayloadDigest
- Add payload digest to NER output
- Remove extractFromScrapeText
- Remove extractFromScrapeText test
- TODO: PERSON -> persons, LOCATION -> locations, ORGANIZATION -> organizations (involves writing a new class or overriding NER output 🤢)
  • Loading branch information
ruebot authored and ianmilligan1 committed Nov 5, 2019
1 parent 6686519 commit 379cc68e0adfd12e8b8ba62b20f0213f0dea261a
@@ -640,7 +640,7 @@
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.8.0</version>
<version>3.9.2</version>
<exclusions>
<exclusion>
<groupId>org.apache.commons</groupId>
@@ -149,7 +149,7 @@ public static String getWarcResponseMimeType(final byte[] contents) {
return null;
}

// Just using parseHeaders to move down input stream to body
// Just using parseHeaders to move down input stream to body.
HttpParser.parseHeaders(record, WARC_HEADER_ENCODING);
record.dump(baos);
return baos.toByteArray();
@@ -18,9 +18,10 @@ package io.archivesunleashed

import java.text.SimpleDateFormat
import java.io.ByteArrayInputStream
import java.security.MessageDigest

import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable}
import io.archivesunleashed.matchbox.{ExtractDate, ExtractDomain, RemoveHttpHeader}
import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDate, ExtractDomain, RemoveHttpHeader}
import org.apache.spark.SerializableWritable
import org.archive.io.arc.ARCRecord
import org.archive.io.warc.WARCRecord
@@ -60,6 +61,8 @@ trait ArchiveRecord extends Serializable {
/** Returns the http status of the crawl. */
def getHttpStatus: String

/** Returns payload digest (SHA1). */
def getPayloadDigest: String
}

/** Default implementation of a record in a web archive.
@@ -158,4 +161,12 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
getContentBytes
}
}

/** Payload digest of the record, prefixed with "sha1:".
 *
 * For ARC records there is no stored digest, so it is computed here:
 * SHA-1 over getContentBytes, rendered as lowercase hex.
 * For WARC records the value is read verbatim from the record's
 * WARC-Payload-Digest header; it may be null when that header is absent
 * (e.g. non-response records) — callers must handle null.
 * NOTE(review): the two branches therefore differ in digest encoding
 * (computed lowercase hex vs. header value as written by the crawler).
 */
val getPayloadDigest: String = {
if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC){
"sha1:" + MessageDigest.getInstance("SHA1").digest(getContentBytes).map("%02x".format(_)).mkString
} else {
r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getHeaderValue("WARC-Payload-Digest").asInstanceOf[String]
}
}
}
@@ -16,7 +16,7 @@
package io.archivesunleashed.app

import io.archivesunleashed.RecordLoader
import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
import io.archivesunleashed.matchbox.{ComputeMD5, NERClassifier, RemoveHTML}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

@@ -36,45 +36,35 @@ object ExtractEntities {
* @param sc the Apache Spark context
* @return an rdd with classification entities.
*/
def extractFromRecords(iNerClassifierFile: String, inputRecordFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
def extractFromRecords(iNerClassifierFile: String, inputRecordFile: String,
outputFile: String,
sc: SparkContext): RDD[(String, String, String, String)] = {
val rdd = RecordLoader.loadArchives(inputRecordFile, sc)
.map(r => (r.getCrawlDate, r.getUrl, RemoveHTML(r.getContentString)))
extractAndOutput(iNerClassifierFile, rdd, outputFile)
}

/** Extracts named entities from tuple-formatted derivatives scraped from a website.
*
* @param iNerClassifierFile path of classifier file
* @param inputFile path of file containing tuples (date: String, url: String, content: String)
* from which to extract entities
* @param outputFile path of output directory
* @return an rdd with classification entities.
*/
def extractFromScrapeText(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext): RDD[(String, String, String)] = {
val rdd = sc.textFile(inputFile)
.map(line => {
val ind1 = line.indexOf(",")
val ind2 = line.indexOf(",", ind1 + 1)
(line.substring(1, ind1),
line.substring(ind1 + 1, ind2),
line.substring(ind2 + 1, line.length - 1))
})
.keepValidPages()
.map(r => (("\"timestamp\":\"" + r.getCrawlDate + "\""),
("\"url\":\"" + r.getUrl + "\""),
(RemoveHTML(r.getContentString)),
("\"digest\":\"" + r.getPayloadDigest + "\"")))
extractAndOutput(iNerClassifierFile, rdd, outputFile)
}

/** Saves the NER output to file from a given RDD.
*
* @param iNerClassifierFile path of classifier file
* @param rdd with values (date, url, content)
* @param rdd with values (date, url, content, content digest)
* @param outputFile path of output directory
* @return an rdd of tuples with classification entities extracted.
* @return a json object with classification entities extracted.
*/
def extractAndOutput(iNerClassifierFile: String, rdd: RDD[(String, String, String)], outputFile: String): RDD[(String, String, String)] = {
def extractAndOutput(iNerClassifierFile: String,
rdd: RDD[(String, String, String, String)],
outputFile: String): RDD[(String, String, String, String)] = {
val r = rdd.mapPartitions(iter => {
NERClassifier.apply(iNerClassifierFile)
iter.map(r => (r._1, r._2, NERClassifier.classify(r._3)))
iter.map(r => (("{" + r._1), r._2,
("\"named_entities\":" + NERClassifier.classify(r._3)), (r._4 + "}")))
})
r.saveAsTextFile(outputFile)
r.map(r => r._1 + "," + r._2 + "," + r._3 + "," + r._4)
.saveAsTextFile(outputFile)
r
}
}
@@ -114,7 +114,7 @@ class NERCombinedJson extends Serializable {
iter.map(r => {
val nerRec = new NerRecord(r._1._1, r._1._2)
r._2.foreach(entityMap => {
// e.g., entityMap = "PERSON" -> List(("Jack", 1), ("Diane", 3))
// e.g., entityMap = "persons" -> List(("Jack", 1), ("Diane", 3))
val ec = new EntityCounts(entityMap._1)
entityMap._2.foreach(e => {
ec.entities += new Entity(e._1, e._2)
@@ -54,10 +54,10 @@ object NERClassifier {
/** Performs NER classification based on NER Classifier.
*
* @param input
* @return json string containing lists of people, organizations and locations.
* @return json string containing lists of persons, organizations, and locations.
*/
def classify(input: String): String = {
val emptyString: String = "{\"PERSON\":[],\"ORGANIZATION\"=[],\"LOCATION\"=[]}"
val emptyString: String = "\"persons\":[],\"organizations\":[],\"locations\":[]"
val entitiesByType = mutable.LinkedHashMap[NERClassType.Value, mutable.Seq[String]]()
for (t <- NERClassType.values) {
if (t != NERClassType.O) { entitiesByType.put(t, mutable.Seq()) }
@@ -114,6 +114,19 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
exampleStatusCode2).deep)
}

// Checks getPayloadDigest on the first three records of the sample ARC and
// WARC archives. ARC digests are computed by the library (lowercase hex);
// WARC digests are taken from the WARC-Payload-Digest header, so the first
// WARC record (which lacks that header) yields null.
test("Get Payload Digest") {
val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
.map(x => x.getPayloadDigest).take(3)
val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
.map(x => x.getPayloadDigest).take(3)
// Expected values are fixed properties of the bundled test fixtures.
assert (textSampleArc.deep == Array("sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601",
"sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4",
"sha1:ede22581685942721c7b9743dced317633d00e33").deep)
assert (textSampleWarc.deep == Array(null,
"sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U",
"sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV").deep)
}

after {
if (sc != null) {
sc.stop()
@@ -51,21 +51,6 @@ class ExtractEntitiesTest extends FunSuite with BeforeAndAfter {
LOG.info("Output can be found in " + tempDir.getPath)
}

test("extract entities") {
val e = ExtractEntities.extractFromScrapeText(iNerClassifierFile, scrapePath, tempDir + "/scrapeTextEntities", sc).take(3).last
val expectedEntityMap = mutable.Map[NERClassType.Value, List[String]]()
expectedEntityMap.put(NERClassType.PERSON, List())
expectedEntityMap.put(NERClassType.LOCATION, List("Teoma"))
expectedEntityMap.put(NERClassType.ORGANIZATION, List())
assert(e._1 == "20080430")
assert(e._2 == "http://www.archive.org/robots.txt")
val actual = mapper.readValue(e._3, classOf[Map[String, List[String]]])

expectedEntityMap.toStream.foreach(f => {
assert(f._2 == actual.get(f._1.toString).get)
})
}

test("Extract from Record") {
val e = ExtractEntities.extractFromRecords(iNerClassifierFile, archivePath, tempDir + "/scrapeArcEntities", sc).take(3).last
assert(e._1 == "hello")

0 comments on commit 379cc68

Please sign in to comment.
You can’t perform that action at this time.