Inline NER as an experiment

pudo committed Mar 13, 2019
1 parent a2abe5d commit 63a13dd72dff7ceb895439bee9a94b629ae3af27
Showing with 36 additions and 60 deletions.
  1. +2 −0 Dockerfile
  2. +10 −6 aleph/logic/collections.py
  3. +24 −35 aleph/logic/extractors/extract.py
  4. +0 −8 docker-compose.dev.yml
  5. +0 −11 docker-compose.yml
Dockerfile
@@ -1,6 +1,8 @@
 FROM alephdata/aleph-base:8
 
 # Install Python dependencies
+RUN pip3 install spacy-nightly
+RUN python3 -m spacy download xx
 COPY requirements-generic.txt /tmp/
 RUN pip3 install --no-cache-dir -r /tmp/requirements-generic.txt
 COPY requirements-toolkit.txt /tmp/
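The two added RUN lines bake the nightly spaCy build and its multilingual `xx` model into the image, so entity extraction no longer depends on a separate service at runtime. As a rough smoke test (not part of the commit), the model can be checked inside the built image roughly like this:

    # Hypothetical check, e.g. in `python3` inside the built container:
    import spacy

    # Confirm the model downloaded in the Dockerfile resolves under 'xx'.
    nlp = spacy.load('xx')
    print(nlp.meta.get('name'), nlp.meta.get('version'))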
aleph/logic/collections.py
@@ -38,16 +38,20 @@ def refresh_collection(collection_id, sync=False):
     cache.kv.delete(cache.object_key(Collection, collection_id))
 
 
+def index_collection(collection, entities=False, refresh=False):
+    log.info("Index [%s]: %s", collection.id, collection.label)
+    if entities and collection.deleted_at is None:
+        index_collection_entities.delay(collection_id=collection.id)
+    if refresh:
+        refresh_collection(collection.id)
+    index.index_collection(collection)
+
+
 def index_collections(entities=False, refresh=False):
     q = Collection.all(deleted=True)
     q = q.order_by(Collection.updated_at.desc())
     for collection in q:
-        log.info("Index [%s]: %s", collection.id, collection.label)
-        if entities and collection.deleted_at is None:
-            index_collection_entities.delay(collection_id=collection.id)
-        if refresh:
-            refresh_collection(collection.id)
-        index.index_collection(collection)
+        index_collection(collection, entities=entities, refresh=refresh)
 
 
 def delete_collection(collection, sync=False):
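Pulling the per-collection logic out into index_collection() means a single collection can now be (re)indexed without looping over all of them. A hypothetical usage sketch (not part of this commit), assuming Collection.by_id() is available as it is elsewhere in the aleph codebase:

    from aleph.model import Collection
    from aleph.logic.collections import index_collection

    # 42 is a made-up collection id, used only for illustration.
    collection = Collection.by_id(42)
    if collection is not None:
        # Queue entity indexing and refresh the cached collection record.
        index_collection(collection, entities=True, refresh=True)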
aleph/logic/extractors/extract.py
@@ -1,43 +1,32 @@
+import spacy
 import logging
-import textwrap
-from servicelayer.rpc import ExtractedEntity
-from servicelayer.rpc import EntityExtractService
 
 from aleph import settings
-from aleph.tracing import trace_function
-from aleph.logic.extractors.result import PersonResult, LocationResult
-from aleph.logic.extractors.result import OrganizationResult, LanguageResult
+from aleph.logic.extractors.result import PersonResult
+from aleph.logic.extractors.result import LocationResult
+from aleph.logic.extractors.result import OrganizationResult
 
 log = logging.getLogger(__name__)
-
-
-class NERService(EntityExtractService):
-    MIN_LENGTH = 60
-    MAX_LENGTH = 100000
-    TYPES = {
-        ExtractedEntity.ORGANIZATION: OrganizationResult,
-        ExtractedEntity.PERSON: PersonResult,
-        ExtractedEntity.LOCATION: LocationResult,
-        ExtractedEntity.LANGUAGE: LanguageResult
-    }
-
-    @trace_function(span_name='NER')
-    def extract_all(self, text, languages):
-        if text is None or len(text) < self.MIN_LENGTH:
-            return
-        if len(text) > self.MAX_LENGTH:
-            texts = textwrap.wrap(text, self.MAX_LENGTH)
-        else:
-            texts = [text]
-        for text in texts:
-            for res in self.Extract(text, languages):
-                clazz = self.TYPES.get(res.type)
-                yield (res.text, clazz, res.start, res.end)
+MIN_LENGTH = 60
+MAX_LENGTH = 100000
+# https://spacy.io/api/annotation#named-entities
+SPACY_TYPES = {
+    'PER': PersonResult,
+    'PERSON': PersonResult,
+    'ORG': OrganizationResult,
+    'LOC': LocationResult,
+    'GPE': LocationResult
+}
 
 
 def extract_entities(ctx, text, languages):
-    if not hasattr(settings, '_ner_service'):
-        settings._ner_service = NERService()
-    entities = settings._ner_service.extract_all(text, languages=languages)
-    for (text, clazz, start, end) in entities:
-        yield clazz.create(ctx, text, start, end)
+    if text is None or len(text) < MIN_LENGTH:
+        return
+    if not hasattr(settings, '_nlp'):
+        settings._nlp = spacy.load('xx')
+    doc = settings._nlp(text)
+    for ent in doc.ents:
+        clazz = SPACY_TYPES.get(ent.label_)
+        label = ent.text.strip()
+        if clazz is not None and len(label):
+            yield clazz.create(ctx, label, ent.start, ent.end)
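The gRPC-based NERService is replaced by an in-process spaCy pipeline: the multilingual `xx` model is loaded lazily onto settings, and spaCy entity labels are mapped onto aleph's result classes via SPACY_TYPES. Note that spaCy's ent.start/ent.end are token offsets into the document; character offsets would be ent.start_char/ent.end_char. A minimal standalone sketch of the spaCy calls the new extract_entities() relies on (the example sentence and printed output are illustrative only; the entities returned depend on the model):

    import spacy

    # Same load pattern as the new code, minus the aleph settings object.
    nlp = spacy.load('xx')
    doc = nlp("Angela Merkel met Siemens AG executives in Munich.")
    for ent in doc.ents:
        # ent.label_ is the type string keyed in SPACY_TYPES ('PER', 'ORG', ...);
        # ent.start/ent.end are token indices, not character offsets.
        print(ent.text.strip(), ent.label_, ent.start, ent.end)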
docker-compose.dev.yml
@@ -46,12 +46,6 @@ services:
     restart: on-failure
     expose:
       - 50000
 
-  extract-entities:
-    image: alephdata/extract-entities:1.2.0
-    restart: on-failure
-    expose:
-      - 50000
-
   app:
     image: alephdata/aleph
@@ -63,7 +57,6 @@ services:
       - redis
       - convert-document
       - recognize-text
-      - extract-entities
     tmpfs: /tmp
     volumes:
       - archive-data:/data
@@ -96,7 +89,6 @@ services:
       - redis
       - convert-document
       - recognize-text
-      - extract-entities
     tmpfs: /tmp
     volumes:
       - archive-data:/data
docker-compose.yml
@@ -44,13 +44,6 @@ services:
     expose:
       - 50000
 
-  extract-entities:
-    image: alephdata/extract-entities:1.2.0
-    restart: on-failure
-    mem_limit: 4g
-    expose:
-      - 50000
-
   worker:
     image: alephdata/aleph
     command: celery -A aleph.queues -B -c 4 -l INFO worker
@@ -62,7 +55,6 @@ services:
       - redis
       - convert-document
       - recognize-text
-      - extract-entities
     tmpfs:
       - /tmp
     volumes:
@@ -81,7 +73,6 @@ services:
       - redis
       - convert-document
      - recognize-text
-      - extract-entities
       - worker
     tmpfs:
       - /tmp
@@ -103,7 +94,6 @@ services:
       - worker
       - convert-document
       - recognize-text
-      - extract-entities
     tmpfs:
       - /tmp
     volumes:
@@ -124,7 +114,6 @@ services:
       - worker
       - convert-document
       - recognize-text
-      - extract-entities
     tmpfs:
       - /tmp
     volumes:
