Add insight function

2019-03-26 17:31:09 +01:00 · 2019-03-26 17:31:09 +01:00 · 4f629c9be1
commit 4f629c9be1
parent 32f4bb28ff
2 changed files with 113 additions and 1 deletions
--- a/idefix/vxl.py
+++ b/idefix/vxl.py
@ -10,6 +10,7 @@ General functions to transform point clouds to voxels compatible with numpy.
 import logging
 import numpy as np
 import humanize
 from .utils import bbox
 import ipdb
@ -153,7 +154,93 @@ def _bin_mode(grid, spatial, feature):
        winner = score > max_score
        max_score[winner] = score[winner]
        max_value[winner] = value
-        del score, winner
+        del score, winner, mask
    return np.ma.masked_array(max_value, max_score == 0)
 def _bin_insight(grid):
    '''Return the predicted number of cells contained in grid.
    '''
    return np.prod([x.size - 1 for x in grid])
 def _bin_density_insight(grid, dtype=float):
     '''Return the predicted memory usage in bytes of the 'density' binning.

     The per-cell cost is the sum of one `dtype` item for the density, one
     item of the same size for the result data and one boolean for the result
     mask. It is multiplied by the number of cells of `grid`.

     Parameters
     ----------
     grid : array of array (n,)
         Grid to bin spatial data.
     dtype : data-type
         Type used to size the density and result values. The default is the
         builtin `float`, which is what the removed `np.float` alias pointed
         to.

     Return
     ------
     mem_usage : number
         Predicted number of bytes.
     '''
     density = np.dtype(dtype).itemsize
     res_data = density
     # `np.bool` was removed in NumPy 1.24; `np.bool_` is the boolean scalar type.
     res_mask = np.dtype(np.bool_).itemsize

     return _bin_insight(grid) * (density + res_data + res_mask)
 def _bin_mean_insight(grid, feature=None):
     '''Return the predicted memory usage in bytes of the 'mean' binning.

     The per-cell cost is the sum of three floats (density, weight and result
     data) and two booleans (working mask and result mask). It is multiplied
     by the number of cells of `grid`.

     Parameters
     ----------
     grid : array of array (n,)
         Grid to bin spatial data.
     feature : array (m)
         Currently ignored: float values are always assumed.

     Return
     ------
     mem_usage : number
         Predicted number of bytes.
     '''
     # The builtin `float` and `np.bool_` replace the `np.float` / `np.bool`
     # aliases removed in NumPy 1.24.
     float_size = np.dtype(float).itemsize
     bool_size = np.dtype(np.bool_).itemsize

     density = float_size
     weight = float_size
     mask = bool_size
     res_data = float_size
     # Sized as a boolean like in the density and mode estimators; it was
     # previously counted as a float, over-estimating by 7 bytes per cell.
     res_mask = bool_size

     return (density + weight + mask + res_data + res_mask) * _bin_insight(grid)
 def _bin_mode_insight(grid, feature=None):
     '''Return the predicted memory usage in bytes of the 'mode' binning.

     The per-cell cost counts `max_score` and `max_value` (one float each)
     plus the larger of two pairs: `score` and `winner` on one side, the
     result data and its boolean mask on the other. Taking the maximum
     presumably reflects that the two pairs are not alive at the same time
     (the binning routine is not fully visible here -- TODO confirm).

     Parameters
     ----------
     grid : array of array (n,)
         Grid to bin spatial data.
     feature : array (m)
         Currently ignored: float values are always assumed.

     Return
     ------
     mem_usage : number
         Predicted number of bytes.
     '''
     # The builtin `float` and `np.bool_` replace the `np.float` / `np.bool`
     # aliases removed in NumPy 1.24.
     float_size = np.dtype(float).itemsize
     bool_size = np.dtype(np.bool_).itemsize

     max_score = float_size
     max_value = float_size
     score = float_size
     winner = bool_size
     res_data = float_size
     res_mask = bool_size

     peak_extra = max(score + winner, res_data + res_mask)
     return _bin_insight(grid) * (max_score + max_value + peak_extra)
 def insight(grid, feature=None, method='density', mem_limit=None):
     '''Display memory usage of binning process.

     Display in the logs (INFO level) the predicted memory usage needed by the
     binning process. If `mem_limit` is set, then the method will throw an
     exception (MemoryError) if the prediction exceeds the limit.

     Parameters
     ----------
     grid : array of array (n,)
         Grid to bin spatial data.
     feature : array (m)
         Point feature to represent in the bins. If None, default float values
         are assumed.
     method : str
         Method to synthesize the point features in the grid: 'density',
         'mean' or 'mode'.
     mem_limit : number, str
         The limit allowed to further process the grid. If the insight
         prediction exceeds this limit a MemoryError is raised. If the
         parameter is a string, it can be set with human readable memory size
         (e.g. '3GB'). The default unit is bytes. A falsy limit (None or 0)
         disables the check.

     Return
     ------
     mem_usage : number
         The future RAM usage required to further process the data binning.

     Raises
     ------
     IOError
         If `method` is not one of the supported methods.
     MemoryError
         If `mem_limit` is set and the prediction is higher than it.
     '''
     # Numeric limits are already in bytes; only strings need parsing.
     if isinstance(mem_limit, str):
         mem_limit = _human_to_bytes(mem_limit)

     if method == 'density':
         mem_usage = _bin_density_insight(grid)
     elif method == 'mean':
         mem_usage = _bin_mean_insight(grid, feature)
     elif method == 'mode':
         mem_usage = _bin_mode_insight(grid, feature)
     else:
         raise IOError('Wrong method: \'{}\''.format(method))

     log.info('--- GRID INSIGHT ---')
     log.info('Grid size:     \t{}'.format([x.size - 1 for x in grid]))
     log.info('Number of cells:\t{}'.format(humanize.intword(_bin_insight(grid))))
     log.info('Predicted RAM usage:\t{}'.format(humanize.naturalsize(mem_usage, binary=True)))
     log.info('Allowed max RAM usage:\t{}'.format(humanize.naturalsize(mem_limit, binary=True) if mem_limit else 'Not set'))
     log.info('--------------------')

     if mem_limit and mem_usage > mem_limit:
         msg = 'The memory requirement is higher than allowed memory'
         log.error(msg)
         raise MemoryError(msg)

     # The prediction is handed back to the caller, as documented above.
     return mem_usage
 def _human_to_bytes(human_size):
    bytes_count = {'KB': 1, 'MB': 2, 'GB': 3}
    for k, v in bytes_count.items():
        if human_size.endswith(k):
            return float(human_size.strip(k)) * 1024 ** v
    raise IOError('Did not understand size: \'{}\''.format(human_size))
--- a/test/test_vxl.py
+++ b/test/test_vxl.py
@ -132,3 +132,28 @@ def test_bin(datadir, grid_id, set_id, method):
    assert test.shape == tuple([x.size - 1 for x in grid]), 'Voxel grid shape and test grid missmatch'
    assert (test.mask == truth.mask).all(), 'The returned mask is different from test truth'
    assert np.allclose(test.compressed(), truth.compressed()), 'The returned values are different from test truth'
@pytest.mark.parametrize('set_id, grid_id, cells', [
    ('0', '1', 1000),
    ('0', '2', 125),
    ('0', '0_1', 753571),
    ('0', '0_15', 226981),
])
def test__bin_insight(datadir, set_id, grid_id, cells):
    '''Check the private cell counter against the known size of each grid.'''
    predicted = vxl._bin_insight(data_grid(datadir, set_id, grid_id))

    assert predicted is not None, 'Tested function did not return anything :('
    assert predicted == cells, 'Private insight function did not return the correct number of cells for grid'
@pytest.mark.parametrize('method', ['density', 'mean', 'mode'])
def test_insight(method):
    '''Check that every limit, string or number, is exceeded by a huge grid.'''
    # One finely sampled axis reused three times makes a huge grid
    axis = np.arange(1, 10, .0001)
    grid = [axis, axis, axis]

    for limit in ('3GB', '300 MB', '3KB', 3000):
        with pytest.raises(MemoryError):
            vxl.insight(grid, method=method, mem_limit=limit)