update

Yaleesa · Aug 13, 2019 · c16a246ae211b62a7e4f60a79e69a7aa2941be70 · c16a246
1 parent a17745e
commit c16a246ae211b62a7e4f60a79e69a7aa2941be70
diff --git a/.gitignore b/.gitignore
@@ -1,12 +1,13 @@
 snippets.py
 .DS_Store
-  
-services/elastic-stack/.esdata/
-  
-trained_models/
+  
+elastic-stack/.esdata/
+  
+trained_models/*
 images/
-  
-pipeline-reports/
+  
+pipeline-reports/*
 #VSCode
 .vscode/
+  
+.env
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -0,0 +1,60 @@
+  
+<h1 align="center">
+  
+  <img src="media/bee.png" alt="Logo" />
+  
+</h1>
+  
+<h2 align="center">
+  
+</h2>
+  
+
+  
+<div align="center">
+  
+
+  
+![img](https://img.shields.io/badge/Python-3.7-blue.svg?style=popout&logo=python)
+  
+![img](https://img.shields.io/badge/ElasticSearch-7.0.0-purple.svg?style=popout&logo=Elasticsearch)
+  
+![img](https://img.shields.io/badge/Kibana-7.0.0-purple.svg?style=popout&logo=Kibana)
+  
+![img](https://img.shields.io/badge/Jupyter-notebook-orange.svg?style=popout&logo=Jupyter)
+  
+![img](https://img.shields.io/badge/Docker-compose-blue.svg?style=popout&logo=docker)
+  
+</div>
+  
+
+  
+### Overview
+  
+
+  
+
+  
+### Infra & Services
+  
+<div align="center">
+  
+<img  src="media/overview.png" alt="Logo"/>
+  
+</div>
+  
+
+  
+
+  
+
+  
+### Resources
+  
+##### Python modules:
+  
+`joblib`,
+  
+`matplotlib`,
+  
+`numpy`,
+  
+`pandas`,
+  
+`requests`,
+  
+`requests`,
+  
+`scikit-learn`,
+  
+`seaborn`,
+  
+`nltk`,
+  
+`fasttext`,
+  
+`elasticsearch`,
+  
+`xmltodict`,
+  
+
+  
+##### Other:
+  
+1. Notebook: https://jupyter-docker-stacks.readthedocs.io/en/latest/index.html
+  
+2. Elastic: https://www.elastic.co/products/elastic-stack
+  
+
+  
+### Containers:
+  
+| name of container         | needed for |
+  
+| ------------------      | ----------     |
+  
+| `elasticsearch`   | indexing vacancy data |
+  
+| `kibana` | Dashboard for metrics on the vacancy data |
+  
+| `scipy-notebook`  | notebook with scipy tools and access to the ML pipeline modules |
+  
+
+  
+### Data Collection:
+  
+ /link/to/projectmonarch
+  
+
+  
+### Requirements & Setup
+  
+
+  
+1. Docker
+  
+2. `pip install -r requirements.txt`
+  
+
+  
+
diff --git a/app/elasticer.py b/app/elasticer.py
@@ -1,10 +1,16 @@
+  
+'''
+  
+Author: Yaleesa Borgman
+  
+Date: 8-8-2019
+  
+GET and POST for elasticsearch
+  
+'''
 from elasticsearch import Elasticsearch, helpers
 import json
 class Elasticer:
    def __init__(self):
        self.es = Elasticsearch(host="127.0.0.1")
+  
+        #self.es = Elasticsearch(host="elasticsearch")
    def import_dataset(self, indexname, include_list):
        '''
@@ -17,7 +23,7 @@ def import_dataset(self, indexname, include_list):
        return documents
-  
-    def to_elastic(self, indexname, data):
+  
+    def dict_to_elastic(self, indexname, data):
        actions = [
            {
            "_index" : indexname,
@@ -37,6 +43,4 @@ def list_to_elastic(self, indexname, data):
            }
        for record in data
        ]
-  
-        actions = json.dumps(actions)
-  
-        actions = json.loads(actions)
        helpers.bulk(self.es,actions, index=indexname)
diff --git a/app/explorer.py b/app/explorer.py
@@ -1,3 +1,8 @@
+  
+'''
+  
+Author: Yaleesa Borgman
+  
+Date: 8-8-2019
+  
+explorer.py - exploration of data
+  
+'''
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 # Pandas & Numpy & vis

diff --git a/app/fasttexter.py b/app/fasttexter.py
@@ -1,25 +1,32 @@
+  
+'''
+  
+Author: Yaleesa Borgman
+  
+Date: 8-8-2019
+  
+fasttexter.py - handles the fastText Python library and returns scoring
+  
+'''
 from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, StratifiedKFold, GridSearchCV
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+  
+import pandas as pd
+  
+import numpy as np
+  
+import fasttext
+  
+import nltk
+  
+from nltk import word_tokenize
 from elasticer import Elasticer
 from explorer import DataExploration
 from preprocessor import DataPreProcessor, DataCleaner
 from predicter import xmlRemapper
 from reporter import ClassificationReports
-  
-import pandas as pd
-  
-import numpy as np
-  
-import fasttext
-  
-import nltk
-  
-from nltk import word_tokenize
-  
-import os, csv
-  
-from datetime import datetime
 now = datetime.now()
 timestamp = now.strftime("%d%m%Y-%H:%M")
-  
-
-  
-
 class FastTexter:
+  
+    '''
+  
+    fastText class. 
+  
+    - name & filepath
+  
+    - trainfile & testfile
+  
+    returns self.model
+  
+    '''
    def __init__(self, name):
        self.name = name
        self.filepath = '../data/trained_models/fasttext_models'
@@ -65,29 +72,29 @@ def classification(self, trainfile, testfile, ngrams=1):
        validation = self.model.test(testfile)
        score = {ngrams:{"N":int(validation[0]), "P@1": "{0:.3f}".format(validation[1]),"R@1": "{0:.3f}".format(validation[2])}}
-  
-        #self.print_results(*self.model.test(testfile))
-  
-
        words, freq = self.model.get_words(include_freq=True)
        return score
-  
-    def compress_save(self):
-  
-        self.model.quantize(input=self.trainfile, retrain=True)
+  
+    def compress_save(self, trainfile):
+  
+        self.model.quantize(input=trainfile, retrain=True)
        modelname = f"{self.filepath}/model_fasttext_{self.name}-{timestamp}.ftz"
        self.model.save_model(modelname)
        return modelname
 class FastTextPipeline:
-  
-    def __init__(self, name, lowercase=True, stopw=True, report='score'):
+  
+    '''
+  
+    pre-set pipeline, uses the other modules for cleaning etc in pipeline
+  
+    '''
+  
+    def __init__(self, name, lowercase=True, stopw=True, report='full'):
        self.name = name
        self.lowercase = lowercase
        self.stopw = stopw
        self.report = report
        self.modelname = ''
        self.fasttexter = FastTexter(name=self.name)
        self.explorer = DataExploration(filepath='../data/fasttext-report')
-  
-        self.reporter = ClassificationReports()
+  
+        self.reporter = ClassificationReports(title=self.name)
-  
-
    def preprocessing(self, data, missing='Unknown'):
        Processor = DataPreProcessor(data)
        Cleaner = DataCleaner()
@@ -111,7 +118,7 @@ def prepare_files(self, data):
    def train_model(self, ngrams, save_model=False):
        score = self.fasttexter.classification(self.trainfile, self.testfile, ngrams)
        if save_model:
-  
-            self.modelname = self.fasttexter.compress_save()  
+  
+            self.modelname = self.fasttexter.compress_save(self.trainfile)  
        return score
    def ngrams_performance(self):
@@ -122,22 +129,6 @@ def ngrams_performance(self):
            score_dict.update(score)
        return score_dict
-  
-    def file_predictions(self):
-  
-        model = fasttext.load_model(self.modelname)
-  
-
-  
-
-  
-        label_scores = model.test_label(self.testfile)
-  
-        dataframe = pd.DataFrame(label_scores)
-  
-
-  
-        if self.report == 'full':
-  
-            print(f'''
-  
-            Trained model tested on testfile.txt data:\n\n
-  
-            -> {self.name}\n
-  
-            \tclassifier: fasttext \n\n
-  
-            \t{self.fasttexter.print_results(*self.fasttexter.model.test(testfile))}\n\n
-  
-            \t{dataframe.T}\n
-  
-            ''')
-  
-
    def scoring(self, y, y_pred):
        accuracy = 'accuracy %s' % accuracy_score(y_pred, y)
        columns = np.unique(y)
@@ -146,15 +137,29 @@ def scoring(self, y, y_pred):
    def scoring_report(self, title, y, y_pred):
        accuracy, report = self.scoring(y, y_pred)
-  
-        print(f'''
+  
+        return f'''
        Trained model tested on test data:\n\n
        -> {title}\n
        \tclassifier: fasttext \n
        \t{accuracy}\n
        \n{report}
-  
-        ''')
-  
-        self.reporter.confusion_matrix_vis(y=y, y_pred=y_pred, filepath='../data/fasttext-reports',title=f'fasttext-{title}')
-  
-        #self.DataExploration.kijkdoos(X, y, y_pred, 'location')
+  
+        '''
+  
+        self.reporter.confusion_matrix_vis(y=y, y_pred=y_pred, filepath='../data/fasttext-reports',title=f'fasttext{self.name}', figsize=(11,13))
+  
+        self.reporter.kijkdoos(X, y, y_pred, 'location')
+  
+
+  
+    def file_predictions(self):
+  
+        model = fasttext.load_model(self.modelname)
+  
+        label_scores = model.test_label(self.testfile)
+  
+        dataframe = pd.DataFrame(label_scores)
+  
+
+  
+        if self.report == 'full':
+  
+            print(f'''
+  
+            Trained model tested on testfile.txt data:\n\n
+  
+            \t{self.name}\n
+  
+            \tclassifier: fasttext \n\n
+  
+            \t{self.fasttexter.print_results(*self.fasttexter.model.test(testfile))}\n\n
+  
+            \t{dataframe.T}\n
+  
+            ''')
    def dataframe_predictions(self, modelname, dataframe):
        model = fasttext.load_model(modelname)
@@ -177,38 +182,8 @@ def dataframe_predictions(self, modelname, dataframe):
        accuracy, report = self.scoring(y, y_pred)
        if self.report == 'full':
-  
-            self.scoring_report(self.name,y, y_pred)
-  
-            self.reporter.confusion_matrix_vis(y=y, y_pred=y_pred, filepath='../data/fasttext-reports',title=f'fasttext-{self.name}')
+  
+            score = self.scoring_report(self.name,y, y_pred)
+  
+            print(score)
        return accuracy
-  
-if __name__ == '__main__':
-  
-        include = ['company_name', 'introduction', 'location', 'vacancy_title', 'description', 'job_category', 'contract_type']
-  
-        scrp_dataset = Elasticer().import_dataset('scrapy_test-early_mornin_4', include)
-  
-        #xml_data = xmlRemapper().get_dataframe()
-  
-        #print(xml_data[xml_data['label'] == 'introduction'])
-  
-
-  
-        #modelname = 'model_fasttext_07082019-01:29.ftz'
-  
-        #modelname_low = 'model_fasttext_scrp_lowercased-07082019-06:17.ftz'
-  
-        testfile = '../data/unseen_test.txt.txt'
-  
-
-  
-        # original = FastTextPipeline('original_dataset', lowercase=False, stopw=False)
-  
-        # original.prepare_files(scrp_dataset)
-  
-        # score = original.ngrams_performance()
-  
-        # df = pd.DataFrame(score)
-  
-        # print(df)
-  
-
-  
-        # low_case = FastTextPipeline('scrp_lowercased', stopw=False)
-  
-        # low_case.prepare_files(scrp_dataset)
-  
-        # score = low_case.ngrams_performance()
-  
-        # df = pd.DataFrame(score)
-  
-        # print(df)
-  
-
-  
-        low_case_minstop = FastTextPipeline('scrp_lowercased_minstop')
-  
-        low_case_minstop.prepare_files(scrp_dataset)
-  
-        low_case_minstop.train_model(1, save_model=True)
-  
-        # score = low_case_minstop.ngrams_performance()
-  
-        # df = pd.DataFrame(score)
-  
-        # print(df)
-  
-        low_case_minstop.file_predictions()
-  
-        #FastTextPipeline('scrp_to_xml').dataframe_predictions(modelname, xml_data)