minigrida/minigrida/supervisor.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# \file supervisor.py
# \brief TODO
# \author Florent Guiotte <florent.guiotte@gmail.com>
# \version 2.1
# \date 07 sept. 2018
#
# TODO details

import importlib
import time
import os
from datetime import datetime
import traceback
import logging
import logger
from protocols.protocol import TestError
import database
from multiprocessing import Process
import json
import argparse

host = os.uname()[1]
log = logging.getLogger('Supervisor [{}]'.format(host))

parser = argparse.ArgumentParser(description='Run minigrida supervisor')
parser.add_argument('--conf',
                    metavar='config',
                    default='config.json',
                    type=str,
                    help='the path to the supervisor\'s config file.')
parser.add_argument('--cred',
                    metavar='credentials',
                    default='credentials.json',
                    type=str,
                    help='the path to the DB credentials file.')
args = parser.parse_args()

def run(expe, hostpid=host):
    database.update_experiment(expe,
                               worker=hostpid,
                               status='running',
                               start_date=datetime.now())

    # Load protocol
    log.info('Load protocol {}'.format(expe.protocol))
    protocol_module = importlib.import_module('protocols')
    importlib.reload(protocol_module)
    protocol = getattr(protocol_module, expe.protocol)
    test = protocol(expe.expe)

    # Run test
    try:
        test.run()
    except Exception as e:
        err = 'Experience error'
        log.warning(err)
        report = {'error': {'name': str(err),
                            'trace': traceback.format_exc()}}
        database.update_experiment(expe, report=report, status='error')
        raise TestError(err)

    # Write report
    log.info('Write report')
    database.update_experiment(expe,
                               end_date=datetime.now(),
                               oa=test.oa,
                               aa=test.aa,
                               k=test.k,
                               report=test.get_results(),
                               ressources={
                                   'process_time': test.get_process_time()
                                   },
                               status='complete')

    # End of test
    log.info('Expe {} complete'.format(expe.expe_hash))


def main(pid=None):
    hostpid = host + '_' + str(pid) if pid is not None else host
    log.name = 'Supervisor [{}]'.format(hostpid)

    log.info('Connecting to database')
    database.connect(args.cred)

    while(True):
        if not database.pending_experiments():
            log.info('No pending experiments, waiting...')
            time.sleep(30)

        else:
            log.info('Loading next experiment')
            expe = database.next_experiment()
            if not expe:
                continue
            log.info('Expe {} loaded'.format(expe.expe_hash))
            try:
                run(expe, hostpid)
            except Exception:
                log.exception('Error occured on expe {}'.format(expe.id))
                time.sleep(60)


if __name__ == '__main__':
    logger.setup_logging()
    log.info('Starting supervisor')
    try:
        with open(args.conf) as f:
            config = json.load(f)
            process_count = config['process_count']
    except Exception as e:
        log.warning(e)
        process_count = 1

    for i in range(process_count):
        Process(target=main, args=(i,)).start()
        time.sleep(1)