Supervisor WIP

2018-07-25 17:29:17 +02:00 · 2018-07-25 17:29:17 +02:00 · 578249f644
commit 578249f644
parent ad68cafe1e
3 changed files with 487 additions and 33 deletions
--- a/Serialization.ipynb
+++ b/Serialization.ipynb
@ -8,11 +8,11 @@
    "\n",
    "- [X] Read a YAML recipe\n",
    "- [X] Brew recipe\n",
-    "- [] Compute hashes\n",
+    "- [X] Compute hashes\n",
-    "- [] Write hashes\n",
+    "- [X] Write hashes\n",
-    "- [] Time metrics\n",
+    "- [X] Time metrics\n",
-    "- [] Result metrics\n",
+    "- [X] Result metrics\n",
-    "- [] Write metrics\n",
+    "- [X] Write metrics\n",
    "- [] Write/move results\n",
    "- [] Watch folder\n",
    "- [] Main loop\n",
@ -34,6 +34,13 @@
    "import numpy as np\n",
    "import importlib\n",
    "import sys\n",
    "import hashlib\n",
    "from collections import OrderedDict\n",
    "import time\n",
    "import os\n",
    "import datetime\n",
    "from sklearn import metrics\n",
    "from pathlib import Path\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
@ -45,6 +52,21 @@
    "import triskele"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Keep yaml ordered\n",
    "\n",
    "def setup_yaml():\n",
    "  \"\"\" https://stackoverflow.com/a/8661021 \"\"\"\n",
    "  represent_dict_order = lambda self, data:  self.represent_mapping('tag:yaml.org,2002:map', data.items())\n",
    "  yaml.add_representer(OrderedDict, represent_dict_order)    \n",
    "setup_yaml()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -52,6 +74,16 @@
    "## Serial Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "expe_in = '../test.yml'\n",
    "expe_out = '../test_out.yml'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -60,32 +92,301 @@
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
-    "with open('../test.yml') as f:\n",
+    "with open(expe_in) as f:\n",
-    "    expe = yaml.safe_load(f)['expe']\n",
+    "    expe = OrderedDict(yaml.safe_load(f)['expe'])\n",
-    "display(expe)\n",
+    "display(expe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute hashes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_hashes(expe):\n",
    "    glob = hashlib.sha1()\n",
    "\n",
-    "# Ground truth\n",
+    "    expe_hashes = OrderedDict()\n",
    "gt = triskele.read(expe['ground_truth'])\n",
    "\n",
-    "# Descriptors\n",
+    "    for k in ['ground_truth', 'descriptors_script', 'cross_validation', 'classifier']:\n",
-    "script = expe['descriptors_script']\n",
+    "        v = str(expe[k]).encode('utf-8')\n",
-    "desc = importlib.import_module(script['name'], package=Descriptors)\n",
+    "        expe_hashes[k] = hashlib.sha1(v).hexdigest()\n",
-    "importlib.reload(Descriptors)\n",
+    "        glob.update(v)\n",
-    "att = desc.run(**script['parameters'])\n",
+    "    expe_hashes['global'] = glob.hexdigest()\n",
    "    return expe_hashes\n",
    "\n",
-    "# CrossVal and ML\n",
+    "expe_hashes = compute_hashes(expe)\n",
-    "cv = expe['cross_validation']\n",
+    "expe_hashes"
-    "cl = expe['classifier']\n",
+   ]
-    "\n",
+  },
-    "prediction = np.zeros_like(gt)\n",
+  {
-    "\n",
+   "cell_type": "markdown",
-    "for xt, xv, yt, yv, ti in APsCVG(gt, att, **cv['parameters']):\n",
+   "metadata": {},
-    "    rfc = RandomForestClassifier(**cl['parameters'])\n",
+   "source": [
-    "    rfc.fit(xt, yt)\n",
+    "### Write hashes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(expe_out, 'w') as of:\n",
    "    yaml.dump({'expe': expe, 'expe_hashes': expe_hashes}, of, default_flow_style=False, encoding=None, allow_unicode=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Keep track of time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Kronos(object):\n",
    "    def __init__(self):\n",
    "        self._pt = time.process_time()\n",
    "        self._times = OrderedDict()\n",
    "        \n",
    "    def time(self, name):\n",
    "        self._times[name + '_process_time'] = time.process_time() - self._pt\n",
    "        self._pt = time.process_time()\n",
    "        \n",
    "    def get_times(self):\n",
    "        return self._times\n",
    "    \n",
-    "    ypred = rfc.predict(xv)\n",
+    "kronos = Kronos()\n",
    "start_time = time.time()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute descriptors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_descriptors(expe):\n",
    "    \"\"\"Compute descriptors from a standard expe recipe\"\"\"\n",
    "    script = expe['descriptors_script']\n",
    "    desc = importlib.import_module(script['name'], package=Descriptors)\n",
    "    #importlib.reload(Descriptors)\n",
    "    att = desc.run(**script['parameters'])\n",
    "    \n",
-    "    prediction[ti] = ypred"
+    "    return att\n",
    "\n",
    "att = compute_descriptors(expe)\n",
    "kronos.time('description')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compute classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_classification(expe, att):\n",
    "    \"\"\"Read a standard expe recipe and attributes, return the result classification\"\"\"\n",
    "    # Ground truth\n",
    "    gt = triskele.read(expe['ground_truth'])\n",
    "\n",
    "\n",
    "    # CrossVal and ML\n",
    "    cv = expe['cross_validation']\n",
    "    cl = expe['classifier']\n",
    "\n",
    "    prediction = np.zeros_like(gt)\n",
    "\n",
    "    for xt, xv, yt, yv, ti in APsCVG(gt, att, **cv['parameters']):\n",
    "        rfc = RandomForestClassifier(**cl['parameters'])\n",
    "        rfc.fit(xt, yt)\n",
    "\n",
    "        ypred = rfc.predict(xv)\n",
    "\n",
    "        prediction[ti] = ypred\n",
    "        \n",
    "    return prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classification = compute_classification(expe, att)\n",
    "kronos.time('classification')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_metrics(ground_truth, classication):\n",
    "    \"\"\"Return dict of metrics for ground_truth and classification prediction in parameters\"\"\"\n",
    "    f = np.nonzero(classification)\n",
    "    pred = classification[f].ravel()\n",
    "    gt = ground_truth[f].ravel()\n",
    "    \n",
    "    results = OrderedDict() \n",
    "    results['overall_accuracy'] = float(metrics.accuracy_score(gt, pred))\n",
    "    results['cohen_kappa'] = float(metrics.cohen_kappa_score(gt, pred))\n",
    "    \n",
    "    return results\n",
    "\n",
    "def run_metrics(expe, classification):\n",
    "    \"\"\"Compute the metrics from a standard expe recipe and an given classification\"\"\"\n",
    "    \n",
    "    ### Extensible: meta-classes\n",
    "    gt = triskele.read(expe['ground_truth'])\n",
    "    return compute_metrics(gt, classification)\n",
    "\n",
    "expe_results = run_metrics(expe, classification)\n",
    "expe_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kronos.time('metrics')\n",
    "end_time = time.time()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_report(kronos):\n",
    "    expe_report = OrderedDict()\n",
    "\n",
    "    expe_report['supervisor'] = os.uname()[1]\n",
    "\n",
    "    for timev, datek in zip((start_time, end_time), ('start_date', 'end_date')):\n",
    "        expe_report[datek] = datetime.datetime.fromtimestamp(timev).strftime('Le %d/%m/%Y à %H:%M:%S')\n",
    "\n",
    "    ressources = kronos.get_times()\n",
    "    ressources['ram'] = None\n",
    "\n",
    "    expe_report['ressources'] = ressources\n",
    "    return expe_report\n",
    "\n",
    "expe_report = create_report(kronos)\n",
    "expe_report"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**TODO**\n",
    "\n",
    "améliorer kronos pour le start et le end time (build and get_times)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Name and write prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "oname = '{}_{}'.format(Path(expe_in).stem, expe_hashes['global'][:6])\n",
    "oname_tif = oname + '.tif'\n",
    "oname_yml = oname + '.yml'\n",
    "\n",
    "triskele.write(oname_tif, classification)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write report and results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(oname_yml, 'w') as of:\n",
    "    yaml.dump(OrderedDict({'expe': expe, \n",
    "               'expe_hashes': expe_hashes, \n",
    "               'expe_report': expe_report,\n",
    "               'expe_classification': oname_tif,\n",
    "               'expe_results': expe_results}), of, default_flow_style=False, encoding=None, allow_unicode=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "att.dtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import watchdog"
   ]
  },
  {
--- a/supervisor.py
+++ b/supervisor.py
@ -0,0 +1,156 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 # \file supervisor.py
 # \brief TODO
 # \author Florent Guiotte <florent.guiotte@gmail.com>
 # \version 0.1
 # \date 25 juil. 2018
 #
 # TODO details
 import yaml
 import numpy as np
 import importlib
 import sys
 import hashlib
 from collections import OrderedDict
 import time
 import os
 import datetime
 from sklearn import metrics
 from pathlib import Path
 from sklearn.ensemble import RandomForestClassifier
 #sys.path.append('.')
 import Descriptors
 from CrossValidationGenerator import APsCVG
 sys.path.append('./triskele/python')
 import triskele
 ### Keep yaml ordered
 def setup_yaml():
    """Register a PyYAML representer that dumps an OrderedDict as a plain mapping.

    The items are handed to the representer in the OrderedDict's own order.
    Recipe taken from https://stackoverflow.com/a/8661021
    """
    def represent_dict_order(self, data):
        # Emit the entries under the standard YAML map tag
        return self.represent_mapping('tag:yaml.org,2002:map', data.items())
    yaml.add_representer(OrderedDict, represent_dict_order)
 # Install the representer once, when the module is loaded
 setup_yaml()
 def run(expe_file):
    """Run the experiment described by a YAML recipe and return its outcome.

    The recipe is hashed, then descriptors, classification and metrics are
    computed in turn; the process time of each of these three stages is
    recorded, together with the wall-clock start and end of the run.

    expe_file -- path of a YAML file holding the recipe under the top-level
        'expe' key.

    Returns an OrderedDict with the keys 'expe' (the recipe), 'expe_hashes',
    'expe_report' and 'expe_results'.
    """
    with open(expe_file) as f:
        expe = OrderedDict(yaml.safe_load(f)['expe'])
    ### Compute hashes
    expe_hashes = compute_hashes(expe)
    ### Keep track of time
    kronos = Kronos()
    start_time = time.time()
    ### Compute descriptors
    descriptors = compute_descriptors(expe)
    kronos.time('description')
    ### Compute classification
    classification = compute_classification(expe, descriptors)
    kronos.time('classification')
    ### Metrics
    # Deliberately not named `metrics`: that would shadow the sklearn module
    # imported at file level.
    expe_results = run_metrics(expe, classification)
    kronos.time('metrics')
    end_time = time.time()
    ### Create report
    expe_report = _create_report(kronos, start_time, end_time)
    # TODO: write the classification raster and the YAML report to disk (WIP).
    outcome = OrderedDict()
    outcome['expe'] = expe
    outcome['expe_hashes'] = expe_hashes
    outcome['expe_report'] = expe_report
    outcome['expe_results'] = expe_results
    return outcome
 def _create_report(kronos, start_time, end_time):
    """Build the run report: host name, formatted start/end dates and timings.

    kronos -- the Kronos instance holding the recorded stage timings
    start_time, end_time -- time.time() timestamps bounding the run
    """
    expe_report = OrderedDict()
    expe_report['supervisor'] = os.uname()[1]
    for timev, datek in zip((start_time, end_time), ('start_date', 'end_date')):
        expe_report[datek] = datetime.datetime.fromtimestamp(timev).strftime('Le %d/%m/%Y à %H:%M:%S')
    # Copy the timings so that adding the 'ram' entry does not alter the
    # mapping owned by the Kronos instance.
    ressources = OrderedDict(kronos.get_times())
    # 'ram' is only a placeholder for now: no memory figure is collected.
    ressources['ram'] = None
    expe_report['ressources'] = ressources
    return expe_report
 def compute_hashes(expe):
    """Return the SHA-1 hex digests identifying an expe recipe.

    Each of the four recipe sections is hashed on the UTF-8 encoding of its
    str() form; the 'global' entry is the digest of those four encodings
    taken one after the other. Raises KeyError if a section is missing.
    """
    sections = ('ground_truth', 'descriptors_script', 'cross_validation', 'classifier')
    payloads = [(name, str(expe[name]).encode('utf-8')) for name in sections]
    digests = OrderedDict(
        (name, hashlib.sha1(raw).hexdigest()) for name, raw in payloads
    )
    # Hashing the concatenation is the same as feeding the pieces in sequence
    digests['global'] = hashlib.sha1(b''.join(raw for _, raw in payloads)).hexdigest()
    return digests
 def compute_descriptors(expe):
    """Compute descriptors from a standard expe recipe.

    The module named by expe['descriptors_script']['name'] is imported and
    its run() function is called with expe['descriptors_script']['parameters']
    as keyword arguments. Whatever run() returns is returned unchanged.
    """
    script = expe['descriptors_script']
    # import_module takes the anchor package as a *name* (a string), and only
    # consults it when script['name'] is a relative name starting with a dot.
    desc = importlib.import_module(script['name'], package=Descriptors.__name__)
    return desc.run(**script['parameters'])
 def compute_classification(expe, descriptors):
    """Return the cross-validated random forest prediction for an expe recipe.

    The ground truth is read from expe['ground_truth']. For every split
    produced by APsCVG, a fresh RandomForestClassifier is fitted and its
    prediction is stored at the indices supplied with that split. Positions
    that no split writes to stay at zero.
    """
    # Reference labels
    labels = triskele.read(expe['ground_truth'])
    # Cross-validation and classifier sections of the recipe
    cv_conf = expe['cross_validation']
    clf_conf = expe['classifier']
    predicted = np.zeros_like(labels)
    splits = APsCVG(labels, descriptors, **cv_conf['parameters'])
    for x_fit, x_eval, y_fit, _y_eval, target_idx in splits:
        forest = RandomForestClassifier(**clf_conf['parameters'])
        forest.fit(x_fit, y_fit)
        predicted[target_idx] = forest.predict(x_eval)
    return predicted
 def compute_metrics(ground_truth, classication):
    """Return an OrderedDict of metrics comparing a prediction with the ground truth.

    Only the positions where the prediction is non-zero are scored.

    ground_truth -- array of reference labels
    classication -- array of predicted labels, indexed like ground_truth
        (the misspelt parameter name is kept so keyword callers keep working)

    The result holds 'overall_accuracy' and 'cohen_kappa', both as floats.
    """
    # Bind the correctly spelt local to the parameter: the computation must use
    # the argument it was given, not a module-level name.
    classification = classication
    f = np.nonzero(classification)
    pred = classification[f].ravel()
    gt = ground_truth[f].ravel()
    results = OrderedDict()
    results['overall_accuracy'] = float(metrics.accuracy_score(gt, pred))
    results['cohen_kappa'] = float(metrics.cohen_kappa_score(gt, pred))
    return results
 def run_metrics(expe, classification):
    """Compute the metrics of a given classification for a standard expe recipe.

    The ground truth is read from expe['ground_truth'] and compared with the
    classification by compute_metrics, whose result is returned.
    """
    # Extension point noted by the author: meta-classes
    reference = triskele.read(expe['ground_truth'])
    scores = compute_metrics(reference, classification)
    return scores
 class Kronos(object):
    """Stopwatch recording the process time spent between successive laps."""
    def __init__(self):
        # Process-time reference from which the next lap is measured
        self._pt = time.process_time()
        # Lap durations in recording order, keyed '<name>_process_time'
        self._times = OrderedDict()
    def time(self, name):
        """Store the process time elapsed since the previous lap (or since construction) under name."""
        # Read the clock once, so the next lap starts exactly where this one
        # ends and no process time falls between two consecutive laps.
        now = time.process_time()
        self._times[name + '_process_time'] = now - self._pt
        self._pt = now
    def get_times(self):
        """Return the OrderedDict of recorded laps (the internal mapping itself, not a copy)."""
        return self._times
--- a/test.yml
+++ b/test.yml
@ -10,21 +10,18 @@ expe:
            rasters:
                - '../Data/phase1_rasters/DEM+B_C123/UH17_GEM051_TR.tif'
                - '../Data/phase1_rasters/DEM_C123_3msr/UH17_GEG051_TR.tif'
-            areas:
+            areas: [100, 1000]
-                - 10
+            moi: [.5, .9]
                - 100
                - 1e4
            moi: [.5, .7, .9]
    cross_validation:
        name: CrossValidationGenerator.APsCVG
        parameters:
-            n_test: 5
+            n_test: 2
    classifier:
        name: sklearn.ensemble.RandomForestClassifier
        parameters:
            n_jobs: -1
            random_state: 0
-            n_estimators: 100
+            n_estimators: 50
            min_samples_leaf: 10
 expe_hashes:
    ground_truth: XXX