
Python formatting, and gitignore additions. (#326)

- Run black and isort on Python files.
- Move Spark config to example file.
- Update gitignore for 7a61f0e additions.
ruebot authored and ianmilligan1 committed Jul 18, 2019
1 parent f35d54e commit bd5ef14abd990c707a00b2f4df79756e73200718
@@ -13,3 +13,8 @@ workbench.xmi
build
derby.log
metastore_db
+__pycache__/
+src/main/python/tf/model.zip
+src/main/python/tf/util/spark.conf
+src/main/python/tf/model/graph/
+src/main/python/tf/model/category/
@@ -1,5 +1,4 @@
from aut.common import WebArchive
from aut.udfs import extract_domain

-__all__ = ['WebArchive', 'extract_domain']
-
+__all__ = ["WebArchive", "extract_domain"]
@@ -1,5 +1,6 @@
from pyspark.sql import DataFrame

+
class WebArchive:
    def __init__(self, sc, sqlContext, path):
        self.sc = sc
@@ -12,4 +13,3 @@ def pages(self):

    def links(self):
        return DataFrame(self.loader.extractHyperlinks(self.path), self.sqlContext)
-
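For context, a minimal usage sketch of the WebArchive wrapper above (not part of this commit); the jar and WARC paths are placeholders, and the aut fatjar must be visible to Spark:

from pyspark import SparkConf, SparkContext, SQLContext

from aut.common import WebArchive

conf = SparkConf().set("spark.jars", "/path/to/aut-fatjar.jar")  # placeholder path
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

arc = WebArchive(sc, sql_context, "/path/to/warcs")  # placeholder WARC directory
arc.pages().printSchema()  # DataFrame of archived pages
arc.links().printSchema()  # DataFrame of extracted hyperlinks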
@@ -1,11 +1,13 @@
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

+
def extract_domain_func(url):
-    url = url.replace('http://', '').replace('https://', '')
-    if '/' in url:
-        return url.split('/')[0].replace('www.', '')
+    url = url.replace("http://", "").replace("https://", "")
+    if "/" in url:
+        return url.split("/")[0].replace("www.", "")
    else:
-        return url.replace('www.', '')
+        return url.replace("www.", "")

+
extract_domain = udf(extract_domain_func, StringType())
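For context, a small illustration of the extract_domain UDF above (not part of this commit); the SparkSession setup and sample URL are invented:

from pyspark.sql import SparkSession

from aut.udfs import extract_domain

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("https://www.example.com/about.html",)], ["url"])

# The UDF strips the scheme, any path, and a leading "www."
df.select(extract_domain("url").alias("domain")).show()
# expected: a single row containing "example.com"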
@@ -1,13 +1,15 @@
import os
import sys
-from util.init import *
+
+from pyspark.sql import DataFrame
+
from model.object_detection import *
+from util.init import *

PYAUT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(PYAUT_DIR)

from aut.common import WebArchive
-from pyspark.sql import DataFrame
-
+
if __name__ == "__main__":
    # initialization
@@ -23,11 +25,13 @@
    arc = WebArchive(sc, sql_context, args.web_archive)
    df = DataFrame(arc.loader.extractImages(arc.path), sql_context)
    filter_size = tuple(args.filter_size)
-    print("height >= %d and width >= %d"%filter_size)
-    preprocessed = df.filter("height >= %d and width >= %d"%filter_size)
+    print("height >= %d and width >= %d" % filter_size)
+    preprocessed = df.filter("height >= %d and width >= %d" % filter_size)

    # detection
    model_broadcast = detector.broadcast()
    detect_udf = detector.get_detect_udf(model_broadcast)
-    res = preprocessed.select("url", detect_udf(col("bytes")).alias("prediction"), "bytes")
+    res = preprocessed.select(
+        "url", detect_udf(col("bytes")).alias("prediction"), "bytes"
+    )
    res.write.json(args.output_path)
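For context, each row written by res.write.json(...) above carries the image url, the raw bytes, and a prediction made of two parallel float arrays (detection scores and class ids) produced by the pandas UDF. An illustrative way to read one output line back, with invented values:

import json

line = '{"url": "http://example.com/img.jpg", "prediction": [[0.91, 0.12], [18.0, 3.0]], "bytes": "..."}'
rec = json.loads(line)
scores, classes = rec["prediction"]
print(scores[0], classes[0])  # score and class id of the first detection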
@@ -1,13 +1,19 @@
-import numpy as np
import argparse

+import numpy as np
+
from model.object_detection import SSDExtractor

+
def get_args():
-    parser = argparse.ArgumentParser(description='Extracting images from model output.')
-    parser.add_argument('--res_dir', help='Path of result (model output) directory.')
-    parser.add_argument('--output_dir', help='Path of extracted image file output directory.')
-    parser.add_argument('--threshold', type=float, help='Threshold of detection confidence scores.')
+    parser = argparse.ArgumentParser(description="Extracting images from model output.")
+    parser.add_argument("--res_dir", help="Path of result (model output) directory.")
+    parser.add_argument(
+        "--output_dir", help="Path of extracted image file output directory."
+    )
+    parser.add_argument(
+        "--threshold", type=float, help="Threshold of detection confidence scores."
+    )
    return parser.parse_args()


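For context, a hypothetical sketch of how the parsed arguments above could drive the extractor that appears later in this diff; the flag values are placeholders:

if __name__ == "__main__":
    # e.g. run with --res_dir warc_res --output_dir extracted --threshold 0.5
    args = get_args()
    extractor = SSDExtractor(args.res_dir, args.output_dir)
    extractor.extract_and_save(class_ids="all", threshold=args.threshold)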
@@ -17,23 +17,21 @@ def __init__(self, res_dir, output_dir):
        self.res_dir = res_dir
        self.output_dir = output_dir

-
    def _extract_and_save(self, rec, class_ids, threshold):
        raise NotImplementedError("Please overwrite this method.")

-
    def extract_and_save(self, class_ids, threshold):
        if class_ids == "all":
            class_ids = list(self.cate_dict.keys())

        for idx in class_ids:
            cls = self.cate_dict[idx]
-            check_dir(self.output_dir + "/%s/"%cls, create=True)
+            check_dir(self.output_dir + "/%s/" % cls, create=True)

        for fname in os.listdir(self.res_dir):
            if fname.startswith("part-"):
-                print("Extracting:", self.res_dir+"/"+fname)
-                with open(self.res_dir+"/"+fname) as f:
+                print("Extracting:", self.res_dir + "/" + fname)
+                with open(self.res_dir + "/" + fname) as f:
                    for line in f:
                        rec = json.loads(line)
                        self._extract_and_save(rec, class_ids, threshold)
@@ -43,47 +41,56 @@ class SSD:
    def __init__(self, sc, sql_context, args):
        self.sc = sc
        self.sql_context = sql_context
-        self.category = load_cate_dict_from_pbtxt("%s/category/mscoco_label_map.pbtxt"%PKG_DIR)
-        self.checkpoint = "%s/graph/ssd_mobilenet_v1_fpn_640x640/frozen_inference_graph.pb"%PKG_DIR
+        self.category = load_cate_dict_from_pbtxt(
+            "%s/category/mscoco_label_map.pbtxt" % PKG_DIR
+        )
+        self.checkpoint = (
+            "%s/graph/ssd_mobilenet_v1_fpn_640x640/frozen_inference_graph.pb" % PKG_DIR
+        )
        self.args = args
-        with tf.io.gfile.GFile(self.checkpoint, 'rb') as f:
+        with tf.io.gfile.GFile(self.checkpoint, "rb") as f:
            model_params = f.read()
        self.model_params = model_params

-
    def broadcast(self):
        return self.sc.broadcast(self.model_params)

-
    def get_detect_udf(self, model_broadcast):
        def batch_proc(bytes_batch):
            with tf.Graph().as_default() as g:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(model_broadcast.value)
-                tf.import_graph_def(graph_def, name='')
-                image_tensor = g.get_tensor_by_name('image_tensor:0')
-                detection_scores = g.get_tensor_by_name('detection_scores:0')
-                detection_classes = g.get_tensor_by_name('detection_classes:0')
+                tf.import_graph_def(graph_def, name="")
+                image_tensor = g.get_tensor_by_name("image_tensor:0")
+                detection_scores = g.get_tensor_by_name("detection_scores:0")
+                detection_classes = g.get_tensor_by_name("detection_classes:0")

                with tf.Session().as_default() as sess:
                    result = []
                    image_size = (640, 640)
                    images = np.array([img2np(b, image_size) for b in bytes_batch])
-                    res = sess.run([detection_scores, detection_classes], feed_dict={image_tensor: images})
+                    res = sess.run(
+                        [detection_scores, detection_classes],
+                        feed_dict={image_tensor: images},
+                    )
                    for i in range(res[0].shape[0]):
                        result.append([res[0][i], res[1][i]])
                    return pd.Series(result)
-        return pandas_udf(ArrayType(ArrayType(FloatType())), PandasUDFType.SCALAR)(batch_proc)
+
+        return pandas_udf(ArrayType(ArrayType(FloatType())), PandasUDFType.SCALAR)(
+            batch_proc
+        )


class SSDExtractor(ImageExtractor):
    def __init__(self, res_dir, output_dir):
        super().__init__(res_dir, output_dir)
-        self.cate_dict = load_cate_dict_from_pbtxt("%s/category/mscoco_label_map.pbtxt"%PKG_DIR)
-
+        self.cate_dict = load_cate_dict_from_pbtxt(
+            "%s/category/mscoco_label_map.pbtxt" % PKG_DIR
+        )

    def _extract_and_save(self, rec, class_ids, threshold):
-        pred = rec['prediction']
+        pred = rec["prediction"]
        scores = np.array(pred[0])
        classes = np.array(pred[1])
        valid_classes = np.unique(classes[scores >= threshold])
@@ -102,8 +109,7 @@ def _extract_and_save(self, rec, class_ids, threshold):
            cls = self.cate_dict[cls_idx]
            try:
                img = str2img(rec["bytes"])
-                img.save(self.output_dir+ "/%s/"%cls + url_parse(rec["url"]))
+                img.save(self.output_dir + "/%s/" % cls + url_parse(rec["url"]))
            except:
-                fname = self.output_dir+ "/%s/"%cls + url_parse(rec["url"])
+                fname = self.output_dir + "/%s/" % cls + url_parse(rec["url"])
                print("Failing to save:", fname)
-
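For context, ImageExtractor above acts as a base class: extract_and_save walks the part-* result files and hands each JSON record to _extract_and_save, which SSDExtractor overrides. A minimal hypothetical subclass (not in the repository) needs that override plus a cate_dict mapping class ids to names:

class LoggingExtractor(ImageExtractor):
    """Hypothetical subclass that reports hits instead of saving image files."""

    def __init__(self, res_dir, output_dir, cate_dict):
        super().__init__(res_dir, output_dir)
        self.cate_dict = cate_dict  # extract_and_save expects this mapping

    def _extract_and_save(self, rec, class_ids, threshold):
        scores, classes = rec["prediction"]
        if any(s >= threshold for s in scores):
            print("Detection above threshold:", rec["url"])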
@@ -7,7 +7,7 @@


def str2img(byte_str):
-    return Image.open(io.BytesIO(base64.b64decode(bytes(byte_str, 'utf-8'))))
+    return Image.open(io.BytesIO(base64.b64decode(bytes(byte_str, "utf-8"))))


def img2np(byte_str, resize=None):
@@ -22,7 +22,7 @@ def img2np(byte_str, resize=None):
    if len(img_shape) == 2:
        img = np.stack([img, img, img], axis=-1)
    elif img_shape[-1] >= 3:
-        img = img[:,:,:3]
+        img = img[:, :, :3]

    return img

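For context, a hypothetical round trip through the helpers above: the toolkit passes image bytes around as base64 text, which str2img turns into a PIL image and img2np into a 3-channel numpy array sized for the detector. The file path is a placeholder:

import base64

with open("sample.jpg", "rb") as f:  # placeholder image file
    byte_str = base64.b64encode(f.read()).decode("utf-8")

img = str2img(byte_str)                    # PIL.Image.Image
arr = img2np(byte_str, resize=(640, 640))  # 3-channel array for the 640x640 detector
print(img.size, arr.shape)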
@@ -58,4 +58,3 @@ def load_cate_dict_from_pbtxt(path, key="id", value="display_name"):
        cur_cate = re.findall(r'"(.*?)"', entry[1])[0]
        cate_dict[cur_key] = cur_cate
    return cate_dict
-
@@ -1,14 +1,15 @@
import argparse
+import os
+import re
import zipfile

from pyspark import SparkConf, SparkContext, SQLContext
-import re
-import os

+
def init_spark(master, aut_jar):
    conf = SparkConf()
    conf.set("spark.jars", aut_jar)
-    conf_path = os.path.dirname(os.path.abspath(__file__))+"/spark.conf"
+    conf_path = os.path.dirname(os.path.abspath(__file__)) + "/spark.conf"
    conf_dict = read_conf(conf_path)
    for item, value in conf_dict.items():
        conf.set(item, value)
@@ -18,29 +19,63 @@ def init_spark(master, aut_jar):


def get_args():
-    parser = argparse.ArgumentParser(description='PySpark for Web Archive Image Retrieval.')
-    parser.add_argument('--web_archive', help='Path to warcs.', default='/tuna1/scratch/nruest/geocites/warcs')
-    parser.add_argument('--aut_jar', help='Path to compiled aut jar.', default='aut/target/aut-0.17.1-SNAPSHOT-fatjar.jar')
-    parser.add_argument('--spark', help='Path to Apache Spark.', default='spark-2.3.2-bin-hadoop2.7/bin')
-    parser.add_argument('--master', help='Apache Spark master IP address and port.', default='spark://127.0.1.1:7077')
-    parser.add_argument('--img_model', help='Model for image processing.', default='ssd')
-    parser.add_argument('--filter_size', nargs='+', type=int, help='Filter out images smaller than filter_size', default=[640, 640])
-    parser.add_argument('--output_path', help='Path to image model output.', default='warc_res')
+    parser = argparse.ArgumentParser(
+        description="PySpark for Web Archive Image Retrieval."
+    )
+    parser.add_argument(
+        "--web_archive",
+        help="Path to warcs.",
+        default="/tuna1/scratch/nruest/geocites/warcs",
+    )
+    parser.add_argument(
+        "--aut_jar",
+        help="Path to compiled aut jar.",
+        default="aut/target/aut-0.17.1-SNAPSHOT-fatjar.jar",
+    )
+    parser.add_argument(
+        "--spark", help="Path to Apache Spark.", default="spark-2.3.2-bin-hadoop2.7/bin"
+    )
+    parser.add_argument(
+        "--master",
+        help="Apache Spark master IP address and port.",
+        default="spark://127.0.1.1:7077",
+    )
+    parser.add_argument(
+        "--img_model", help="Model for image processing.", default="ssd"
+    )
+    parser.add_argument(
+        "--filter_size",
+        nargs="+",
+        type=int,
+        help="Filter out images smaller than filter_size",
+        default=[640, 640],
+    )
+    parser.add_argument(
+        "--output_path", help="Path to image model output.", default="warc_res"
+    )
    return parser.parse_args()


def zip_model_module(PYAUT_DIR):
    zip = zipfile.ZipFile(os.path.join(PYAUT_DIR, "tf", "model.zip"), "w")
-    zip.write(os.path.join(PYAUT_DIR, "tf", "model", "__init__.py"), os.path.join("model", "__init__.py"))
-    zip.write(os.path.join(PYAUT_DIR, "tf", "model", "object_detection.py"), os.path.join("model", "object_detection.py"))
-    zip.write(os.path.join(PYAUT_DIR, "tf", "model", "preprocess.py"), os.path.join("model", "preprocess.py"))
+    zip.write(
+        os.path.join(PYAUT_DIR, "tf", "model", "__init__.py"),
+        os.path.join("model", "__init__.py"),
+    )
+    zip.write(
+        os.path.join(PYAUT_DIR, "tf", "model", "object_detection.py"),
+        os.path.join("model", "object_detection.py"),
+    )
+    zip.write(
+        os.path.join(PYAUT_DIR, "tf", "model", "preprocess.py"),
+        os.path.join("model", "preprocess.py"),
+    )


def read_conf(conf_path):
    conf_dict = {}
    with open(conf_path) as f:
        for line in f:
-            conf = re.findall(r'\S+', line.strip())
+            conf = re.findall(r"\S+", line.strip())
            conf_dict[conf[0]] = conf[1]
    return conf_dict
-
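For context, read_conf above expects the whitespace-delimited key/value layout of a plain spark.conf, which this commit moves to an example file and adds to .gitignore. A small illustration; the property names, values, and path are examples only:

example = "spark.executor.memory 8g\nspark.driver.memory 4g\n"

with open("/tmp/spark.conf", "w") as f:  # placeholder path
    f.write(example)

print(read_conf("/tmp/spark.conf"))
# {'spark.executor.memory': '8g', 'spark.driver.memory': '4g'}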
File renamed without changes.
