Add insight function

2019-03-26 17:31:09 +01:00 · 2019-03-26 17:31:09 +01:00 · 4f629c9be1
commit 4f629c9be1
parent 32f4bb28ff
2 changed files with 113 additions and 1 deletions
--- a/idefix/vxl.py
+++ b/idefix/vxl.py
@ -10,6 +10,7 @@ General functions to transform point clouds to voxels compatible with numpy.

 import logging
 import numpy as np
+import humanize
 from .utils import bbox
 import ipdb

@ -153,7 +154,93 @@ def _bin_mode(grid, spatial, feature):
        winner = score > max_score
        max_score[winner] = score[winner]
        max_value[winner] = value
-        del score, winner
+        del score, winner, mask

    return np.ma.masked_array(max_value, max_score == 0)

+def _bin_insight(grid):
+    '''Return the predicted number of cells contained in grid.
+
+    An axis of `grid` holding `k` entries contributes `k - 1` cells; the
+    total is the product of these counts over all axes.
+
+    The product is accumulated with Python integers instead of `np.prod`:
+    NumPy multiplies in fixed-width integers and silently wraps around on
+    very large grids, which is exactly the situation this prediction is
+    meant to detect.
+
+    Parameters
+    ----------
+    grid : array of array (n,)
+        One array per axis; only the `size` of each array is read.
+
+    Returns
+    -------
+    cells : int
+        Number of cells of the grid (1 for an empty `grid`).
+    '''
+    from functools import reduce
+    from operator import mul
+
+    return reduce(mul, (int(x.size) - 1 for x in grid), 1)
+
+def _bin_density_insight(grid, dtype=float):
+    '''Return the predicted memory usage (bytes) of the density binning.
+
+    Per cell, the estimate counts one `dtype` value for the density, one
+    `dtype` value for the result data and one boolean for the result mask.
+
+    The builtin `float` and `bool` replace the `np.float` / `np.bool`
+    aliases, which were plain aliases of those builtins and have been
+    removed from recent NumPy releases.
+
+    Parameters
+    ----------
+    grid : array of array (n,)
+        Grid to bin spatial data.
+    dtype : data-type
+        Type of the values stored per cell. Anything accepted by
+        `np.dtype`.
+
+    Returns
+    -------
+    mem_usage : int
+        Predicted number of bytes.
+    '''
+    density = np.dtype(dtype).itemsize
+    res_data = density
+    res_mask = np.dtype(bool).itemsize
+    return _bin_insight(grid) * (density + res_data + res_mask)
+
+def _bin_mean_insight(grid, feature=None):
+    '''Return the predicted memory usage (bytes) of the mean binning.
+
+    Per cell, the estimate counts float values for the density, the weight
+    and the result data, plus booleans for the working mask and the result
+    mask. The result mask is sized as a boolean, as in the other insight
+    estimators (it was previously counted as a float).
+
+    Parameters
+    ----------
+    grid : array of array (n,)
+        Grid to bin spatial data.
+    feature : array (m)
+        Currently unused; float values are assumed.
+
+    Returns
+    -------
+    mem_usage : int
+        Predicted number of bytes.
+    '''
+    float_size = np.dtype(float).itemsize
+    bool_size = np.dtype(bool).itemsize
+
+    density = float_size
+    weight = float_size
+    mask = bool_size
+    res_data = float_size
+    res_mask = bool_size
+    return (density + weight + mask + res_data + res_mask) * _bin_insight(grid)
+
+def _bin_mode_insight(grid, feature=None):
+    '''Return the predicted memory usage (bytes) of the mode binning.
+
+    Per cell, `max_score` and `max_value` are always counted. On top of
+    them, the larger of the two following groups is counted: the `score`
+    and `winner` temporaries, or the result data and result mask.
+
+    The builtin `float` and `bool` replace the `np.float` / `np.bool`
+    aliases, which have been removed from recent NumPy releases.
+
+    Parameters
+    ----------
+    grid : array of array (n,)
+        Grid to bin spatial data.
+    feature : array (m)
+        Currently unused; float values are assumed.
+
+    Returns
+    -------
+    mem_usage : int
+        Predicted number of bytes.
+    '''
+    # NOTE(review): `_bin_mode` also releases a `mask` temporary in its loop
+    # which is not accounted for here -- confirm whether it should be.
+    float_size = np.dtype(float).itemsize
+    bool_size = np.dtype(bool).itemsize
+
+    max_score = float_size
+    max_value = float_size
+    score = float_size
+    winner = bool_size
+    res_data = float_size
+    res_mask = bool_size
+    return _bin_insight(grid) * (max_score + max_value + max(score + winner, res_data + res_mask))
+
+def insight(grid, feature=None, method='density', mem_limit=None):
+    '''Display memory usage of binning process.
+
+    Display in the logs (INFO level) the predicted memory usage needed by the
+    binning process. If `mem_limit` is set, then the method will throw an
+    exception (MemoryError) if the prediction exceeds the limit.
+
+    Parameters
+    ----------
+    grid : array of array (n,)
+        Grid to bin spatial data.
+    feature : array (m)
+        Point feature to represent in the bins. If None, default float values
+        are assumed.
+    method : str
+        Method to synthesize the point features in the grid. One of
+        'density', 'mean' or 'mode'.
+    mem_limit : number, str
+        The limit allowed to further process the grid. If the insight
+        prediction exceeds this limit a MemoryError is raised. If the
+        parameter is a string, it can be set with human readable memory size
+        (e.g. '3GB'). The default is bytes. A limit of None or 0 is reported
+        as 'Not set' and never raises.
+
+    Returns
+    -------
+    mem_usage : number
+        The future RAM usage required to further process the data binning.
+
+    Raises
+    ------
+    IOError
+        If `method` is unknown, or if a string `mem_limit` carries an
+        unrecognised unit.
+    MemoryError
+        If `mem_limit` is set and the predicted usage is above it.
+    '''
+    if isinstance(mem_limit, str):
+        mem_limit = _human_to_bytes(mem_limit)
+
+    if method == 'density':
+        mem_usage = _bin_density_insight(grid)
+    elif method == 'mean':
+        mem_usage = _bin_mean_insight(grid, feature)
+    elif method == 'mode':
+        mem_usage = _bin_mode_insight(grid, feature)
+    else:
+        raise IOError('Wrong method: \'{}\''.format(method))
+
+    log.info('--- GRID INSIGHT ---')
+    log.info('Grid size:     \t{}'.format([x.size - 1 for x in grid]))
+    log.info('Number of cells:\t{}'.format(humanize.intword(_bin_insight(grid))))
+    log.info('Predicted RAM usage:\t{}'.format(humanize.naturalsize(mem_usage, binary=True)))
+    log.info('Allowed max RAM usage:\t{}'.format(humanize.naturalsize(mem_limit, binary=True) if mem_limit else 'Not set'))
+    log.info('--------------------')
+
+    if mem_limit and mem_usage > mem_limit:
+        msg = 'The memory requirement is higher than allowed memory'
+        log.error(msg)
+        raise MemoryError(msg)
+
+    # The docstring has always promised the prediction as return value.
+    return mem_usage
+
+def _human_to_bytes(human_size):
+    '''Convert a human readable memory size (e.g. '3GB') to bytes.
+
+    The size is a number followed by one of the units 'KB', 'MB', 'GB' or
+    'TB' (powers of 1024). The unit is case-insensitive and whitespace
+    around the number or the unit is ignored.
+
+    The unit is removed by slicing rather than `str.strip(unit)`, which
+    strips a *set of characters* from both ends and not a suffix.
+
+    Parameters
+    ----------
+    human_size : str
+        Memory size with its unit.
+
+    Returns
+    -------
+    size : float
+        The size in bytes.
+
+    Raises
+    ------
+    IOError
+        If `human_size` does not end with a known unit.
+    ValueError
+        If the part before the unit is not a valid number.
+    '''
+    exponents = {'KB': 1, 'MB': 2, 'GB': 3, 'TB': 4}
+    text = human_size.strip()
+    number, unit = text[:-2], text[-2:].upper()
+    if unit not in exponents:
+        raise IOError('Did not understand size: \'{}\''.format(human_size))
+    return float(number) * 1024 ** exponents[unit]
--- a/test/test_vxl.py
+++ b/test/test_vxl.py
@ -132,3 +132,28 @@ def test_bin(datadir, grid_id, set_id, method):
    assert test.shape == tuple([x.size - 1 for x in grid]), 'Voxel grid shape and test grid missmatch'
    assert (test.mask == truth.mask).all(), 'The returned mask is different from test truth'
    assert np.allclose(test.compressed(), truth.compressed()), 'The returned values are different from test truth'
+
+@pytest.mark.parametrize('set_id, grid_id, cells', [
+    ('0', '1', 1000),
+    ('0', '2', 125),
+    ('0', '0_1', 753571),
+    ('0', '0_15', 226981),
+])
+def test__bin_insight(datadir, set_id, grid_id, cells):
+    # The predicted cell count of each reference grid must match the
+    # expected count given in the parameters.
+    predicted = vxl._bin_insight(data_grid(datadir, set_id, grid_id))
+    assert predicted is not None, 'Tested function did not return anything :('
+    assert predicted == cells, 'Private insight function did not return the correct number of cells for grid'
+
+@pytest.mark.parametrize('method', ['density', 'mean', 'mode'])
+def test_insight(method):
+    # Three axes of about 90000 entries each: a grid far too large for any
+    # of the limits below, whether given as a string or as a byte count.
+    axis = np.arange(1, 10, .0001)
+    huge_grid = [axis, axis, axis]
+    for limit in ('3GB', '300 MB', '3KB', 3000):
+        with pytest.raises(MemoryError):
+            vxl.insight(huge_grid, method=method, mem_limit=limit)