Binning is passing tests

2019-03-22 17:16:39 +01:00 · 2019-03-22 17:16:39 +01:00 · 32f4bb28ff
commit 32f4bb28ff
parent dae16f62c7
3 changed files with 188 additions and 31 deletions
--- a/idefix/vxl.py
+++ b/idefix/vxl.py
@ -15,28 +15,35 @@ import ipdb

 log = logging.getLogger(__name__)

-def _ui_step(step):
+def _ui_step(step, spatial):
    '''User input management for step (number or array)
    '''
    try:
        iter(step)
-        if len(step) != 3:
-            msg = 'Wrong steps input, 3 steps expected in step = \'{}\''.format(step)
+        if len(step) != spatial.shape[-1]:  # one step per spatial dimension (last axis of spatial)
+            msg = 'Missmatch between steps count and spatial dimensions, {} step(s) expected while step = \'{}\''.format(spatial.shape[-1], step)
            log.error(msg)
-            raise IOError(msg)
+            raise ValueError(msg)
+        out_step = step
    except TypeError:
-        step = [step] * 3
-    return step
+        out_step = [step] * spatial.shape[-1]  # non-iterable step: same value on every axis
+
+    for s in out_step:  # every per-axis step must be strictly positive
+        if s <= 0:
+            msg = 'Step should be greater than 0, steps = \'{}\''.format(step)
+            log.error(msg)
+            raise ValueError(msg)
+    return out_step

 def get_grid(spatial, step):
    '''Return grid bins.

-    Compute the grid bins of a spatial point cloud or corresponding bounding
-    box according to given step (or steps for anisotropic grid).
+    Compute the grid bins of a point cloud or the corresponding bounding box
+    according to given step (or steps for anisotropic grid).

    Parameters
    ----------
-    spatial : array (n, 3)
+    spatial : array (m, n)
        The spatial point cloud or the corresponding bounding box to grid.
    step : number or array or tuple
        The step of the grid, can be a number to get an isotropic grid, or an
@ -44,18 +51,109 @@ def get_grid(spatial, step):

    Returns
    -------
-    grid : array of array (3,)
+    grid : array of array (n,)
        Grid of spatial given step. Return three arrays (not necessarily of the
        same size) defining the bins of axis `x`, `y` and `z`.
    '''
+    spatial = np.array(spatial)
    bb = bbox(spatial)
-    step = _ui_step(step)
+    step = _ui_step(step, spatial)

-    #ipdb.set_trace()
    grid = []
    for a_min, a_max, a_s in zip(bb[0], bb[1], step):
+        # Beware of float rounding in the division before truncation
        bins = np.trunc((a_max - a_min) / a_s).astype(int) + 1
        grid += [np.linspace(a_min, a_min + bins * a_s, bins + 1)]

    return grid

+def bin(grid, spatial, feature=None, method='density'):
+    '''Bin spatial data in a grid.
+
+    Return a voxel grid representing the binned point cloud defined by point
+    positions in `spatial`. The point cloud can be valued with the `feature`
+    attribute.
+
+    Parameters
+    ----------
+    grid : array of array (n,)
+        Grid to bin spatial data.
+    spatial : array (m, n)
+        Spatial position of the points in R^n.
+    feature : array (m)
+        Point feature to represent in the bins. If None, density method is
+        mandatory. Default is None.
+    method : str
+        Method to synthesize the point features in the grid. If the method is
+        density, then the feature values are ignored. Implemented methods are:
+        - 'density': The density of point in each cell.
+        - 'mean': The mean of feature value in each cell.
+        - 'mode': The modal (most common) feature value in each cell. Designed
+          for labels on point cloud, can be long with rich spectral data. On
+          a tie between values, the smallest one is returned.
+        The default is 'density'.
+
+    Returns
+    -------
+    binned_pc : masked array (i, j, k)
+        The binned point cloud, "No data" are masked.
+    '''
+    log.info('Bining point cloud in grid...')
+
+    if method == 'density':
+        return _bin_density(grid, spatial)  # feature is not used by the density method
+    else:
+        if feature is None:  # checked before the method name, so it wins over an unknown method
+            msg = 'Missing required argument : \'feature\''
+            log.error(msg)
+            raise ValueError(msg)
+    if method == 'mean':
+        return _bin_mean(grid, spatial, feature)
+    if method == 'mode':
+        return _bin_mode(grid, spatial, feature)
+
+    msg = 'Method \'{}\' does not exist.'.format(method)
+    log.error(msg)
+    raise NotImplementedError(msg)
+
+def _bin_density(grid, spatial):
+    '''Count the points of spatial in each cell of grid, masking empty cells.
+    '''
+    density, edge = np.histogramdd(spatial, grid)  # edge (the bin edges) is not used
+    vxl = np.ma.masked_array(density, density == 0)  # cells with a zero count are masked
+    return vxl
+
+def _bin_mean(grid, spatial, feature):
+    '''Return the mean of feature in each cell of grid, masking empty cells.
+    '''
+    density, _ = np.histogramdd(spatial, grid)  # number of points per cell
+    weightd, _ = np.histogramdd(spatial, grid, weights=feature)  # sum of feature per cell
+    mask = density == 0  # empty cells have no mean; explicit out= keeps their data at 0 instead of uninitialised memory
+    return np.ma.masked_array(np.divide(weightd, density, out=np.zeros_like(weightd), where=~mask), mask)
+
+def _bin_mode(grid, spatial, feature):
+    '''Bin spatial in a grid, mode method.
+
+    This function aims for efficiency with ndarray but its run time grows
+    linearly with the number of unique values in feature.
+    '''
+    log.info('Mode binning...')
+    values = np.unique(feature)
+
+    if values.size > 10:
+        log.warning('Mode called on data with {} unique values, processing may be long.'.format(values.size))
+
+    # Best count seen so far in each cell, and the value that reached it
+    max_score = np.zeros([len(x) - 1 for x in grid])
+    max_value = np.zeros_like(max_score, dtype=feature.dtype)
+    for i, value in enumerate(values):
+        log.info('Processing value {}/{}'.format(i + 1, values.size))
+        mask = np.argwhere(feature == value).reshape(-1)
+        score = _bin_density(grid, spatial[mask])
+        winner = score > max_score  # strict '>' keeps the smaller value on ties (values is sorted)
+        max_score[winner] = score[winner]
+        max_value[winner] = value
+        del score, winner
+
+    return np.ma.masked_array(max_value, max_score == 0)
+
--- a/test/test_vxl.py
+++ b/test/test_vxl.py
@ -22,14 +22,31 @@ def data_pc(datadir, set_id):
    data = np.loadtxt(path)
    return Pcloud(data[:,:3], data[:,3])

-#def data_vxl(datadir, set_id, step, method):
-#    pass
+def data_vxl(datadir, set_id, grid_id, method):  # ground-truth voxel grid of one method, as a masked array
+    def _unpack_vxl(spatial, feature):
+        coords = tuple([spatial[:,i] for i in range(3)])

-@pytest.fixture
-def data_0_vxl():
-    def _data_0_vxl(method, resolution):
-        if method == 'mean':
-            pass
+        vxld = np.zeros(spatial.max(axis=0) + 1)
+        vxld[coords] = feature
+
+        vxlm = np.ones_like(vxld, dtype=bool)  # builtin bool: the np.bool alias was removed from NumPy
+        vxlm[coords] = False
+
+        return np.ma.masked_array(vxld, vxlm)
+
+    def _load_vxl(fname, feature_name):
+        fields = ('x', 'y', 'z', 'density', 'mean', 'mode')
+
+        i = fields.index(feature_name)
+
+        data = np.loadtxt(str(fname))  # read the requested file, not a hard-coded path
+        spatial = data[:,:3].astype(np.intp)
+        feature = data[:,i]
+
+        return _unpack_vxl(spatial, feature)
+
+    path = datadir.join('pc{}_vxl_s{}.txt'.format(set_id, grid_id))
+    return _load_vxl(path, method)

 def data_grid(datadir, set_id, step_id):
    def _read(fname):
@ -41,10 +58,15 @@ def data_grid(datadir, set_id, step_id):

    path = datadir.join('pc{}_grid_s{}.txt'.format(set_id, step_id))
    return _read(path)
-    
+
@pytest.mark.parametrize('set_id, step, grid_id', [
    ('0', 1., '1'),
+    ('0', 2., '2'),
    ('0', .1, '0_1'),
+    ('0', .6, '0_6'),
+    ('0', .7, '0_7'),
+    ('0', .15, '0_15'),
+    ('0', [1.,1.,2.] , '1-1-2'),
 ])
 def test_get_grid(datadir, set_id, step, grid_id):
    spatial = data_pc(datadir, set_id).spatial
@ -62,14 +84,51 @@ def test_get_grid(datadir, set_id, step, grid_id):

    for axis_test, axis_truth in zip(test, res):
        assert axis_test.size == axis_truth.size, 'Wrong size for axis'
-        assert (axis_test == axis_truth).all(), 'Axis inequality between truth and test'
-                         
-def test_grid():
-    """
-    - dtype
-    - method
-    - mask
-    - data
-    """
-    pass
+        assert np.allclose(axis_test, axis_truth), 'Axis inequality between truth and test'
+        #assert (axis_test - axis_truth == 0).all(), 'Float overflow in tested grid'

+def test_get_grid_ui():  # invalid step inputs must raise ValueError
+    np.random.seed(0)
+    spatial_2D = np.random.random((100,2))
+    spatial_3D = np.random.random((100,3))
+
+    with pytest.raises(ValueError):  # negative scalar step
+        vxl.get_grid(spatial_3D, -1)
+
+    with pytest.raises(ValueError):  # one negative step among valid ones
+        vxl.get_grid(spatial_3D, [1., -1., 1.])
+
+    with pytest.raises(ValueError):  # fewer steps than spatial dimensions
+        vxl.get_grid(spatial_3D, [1., 1.])
+
+    with pytest.raises(ValueError):  # more steps than spatial dimensions
+        vxl.get_grid(spatial_2D, [1., 1., 1.])
+
+def test_bin_ui():  # argument validation of vxl.bin
+    spatial = np.random.random((10,3))
+    feature = np.random.random((10))
+    grid    = [np.arange(0,1,.1)] * 3
+
+    with pytest.raises(ValueError) as e_info:  # 'mean' without feature
+        vxl.bin(grid, spatial, method='mean')
+
+    with pytest.raises(NotImplementedError) as e_info:  # unknown method name
+        vxl.bin(grid, spatial, feature, method='🍆')
+
+@pytest.mark.parametrize('set_id, grid_id, method', [
+    ('0', '1', 'density'),
+    ('0', '1', 'mean'),
+    ('0', '1', 'mode'),
+])
+def test_bin(datadir, grid_id, set_id, method):  # compare vxl.bin output with the stored truth for each method
+    data = data_pc(datadir, set_id)
+    grid = data_grid(datadir, set_id, grid_id)
+    truth = data_vxl(datadir, set_id, grid_id, method)
+
+    test = vxl.bin(grid, data.spatial, data.feature, method)
+
+    assert test is not None, 'Tested function did not return anything :('
+    assert hasattr(test, 'mask'), 'The array is not masked!'
+    assert test.shape == tuple([x.size - 1 for x in grid]), 'Voxel grid shape and test grid missmatch'
+    assert (test.mask == truth.mask).all(), 'The returned mask is different from test truth'
+    assert np.allclose(test.compressed(), truth.compressed()), 'The returned values are different from test truth'  # unmasked values only
--- a/test/test_vxl/pc0_vxl_s1.txt
+++ b/test/test_vxl/pc0_vxl_s1.txt
@ -1,4 +1,4 @@
-# x y z density mean mode 
+# x y z density mean mode
 0 0 0 1 2 2
 0 2 1 4 10 5
 9 9 9 1 1 1