Binning is passing tests

2019-03-22 17:16:39 +01:00 · 2019-03-22 17:16:39 +01:00 · 32f4bb28ff
commit 32f4bb28ff
parent dae16f62c7
3 changed files with 188 additions and 31 deletions
--- a/idefix/vxl.py
+++ b/idefix/vxl.py
@ -15,28 +15,35 @@ import ipdb
 log = logging.getLogger(__name__)
-def _ui_step(step):
+def _ui_step(step, spatial):
    '''User input management for step (number or array)
    '''
    try:
        iter(step)
-        if len(step) != 3:
+        if len(step) != spatial.shape[-1]:
-            msg = 'Wrong steps input, 3 steps expected in step = \'{}\''.format(step)
+            msg = 'Missmatch between steps count and spatial dimensions, {} step(s) expected while step = \'{}\''.format(spatial.shape[-1], step)
            log.error(msg)
-            raise IOError(msg)
+            raise ValueError(msg)
        out_step = step
    except TypeError:
-        step = [step] * 3
+        out_step = [step] * spatial.shape[-1]
-    return step
+
    for s in out_step:
        if s <= 0:
            msg = 'Step should be greater than 0, steps = \'{}\''.format(step)
            log.error(msg)
            raise ValueError(msg)
    return out_step
 def get_grid(spatial, step):
    '''Return grid bins.
-    Compute the grid bins of a spatial point cloud or corresponding bounding
+    Compute the grid bins of a point cloud or the corresponding bounding box
-    box according to given step (or steps for anisotropic grid).
+    according to given step (or steps for anisotropic grid).
    Parameters
    ----------
-    spatial : array (n, 3)
+    spatial : array (m, n)
        The spatial point cloud or the corresponding bounding box to grid.
    step : number or array or tuple
        The step of the grid, can be a number to get an isotropic grid, or an
@ -44,18 +51,109 @@ def get_grid(spatial, step):
    Returns
    -------
-    grid : array of array (3,)
+    grid : array of array (n,)
        Grid of spatial given step. Return one array per spatial dimension
        (not necessarily of the same size) defining the bins along each axis.
    '''
    spatial = np.array(spatial)
    bb = bbox(spatial)
-    step = _ui_step(step)
+    step = _ui_step(step, spatial)
    #ipdb.set_trace()
    grid = []
    for a_min, a_max, a_s in zip(bb[0], bb[1], step):
        # Beware of float underflow
        bins = np.trunc((a_max - a_min) / a_s).astype(int) + 1
        grid += [np.linspace(a_min, a_min + bins * a_s, bins + 1)]
    return grid
 def bin(grid, spatial, feature=None, method='density'):
    '''Voxelize a point cloud on a grid.

    Build the voxel grid of the points located at `spatial`, optionally
    summarizing a per-point `feature` inside each cell.

    Parameters
    ----------
    grid : array of array (n,)
        Bin edges along each axis.
    spatial : array (m, n)
        Positions of the m points in R^n.
    feature : array (m)
        Per-point value to aggregate. Not used by the 'density' method,
        required by every other method. Default is None.
    method : str
        Aggregation applied in each cell:

        - 'density': the number of points in the cell.
        - 'mean': the mean feature value in the cell.
        - 'mode': the most common feature value in the cell; when several
          values are equally common the smallest is returned.

        The default is 'density'.

    Returns
    -------
    binned_pc : masked array
        One value per grid cell, cells without data are masked.

    Raises
    ------
    ValueError
        If `feature` is None while `method` is not 'density'.
    NotImplementedError
        If `method` is not one of the implemented methods.
    '''
    log.info('Bining point cloud in grid...')

    # Density is the only method that does not need feature values.
    if method == 'density':
        return _bin_density(grid, spatial)

    if feature is None:
        msg = 'Missing required argument : \'feature\''
        log.error(msg)
        raise ValueError(msg)

    for name, binner in (('mean', _bin_mean), ('mode', _bin_mode)):
        if method == name:
            return binner(grid, spatial, feature)

    msg = 'Method \'{}\' does not exist.'.format(method)
    log.error(msg)
    raise NotImplementedError(msg)
 def _bin_density(grid, spatial):
    '''Bin spatial in a grid, density method.
    '''
    density, edge = np.histogramdd(spatial, grid)
    vxl = np.ma.masked_array(density, density == 0)
    return vxl
 def _bin_mean(grid, spatial, feature):
    '''Bin spatial in a grid, mean method.
    '''
    density, edge = np.histogramdd(spatial, grid)
    weightd, edge = np.histogramdd(spatial, grid, weights=feature)
    mask = density == 0
    return np.ma.masked_array(np.divide(weightd, density, where=~mask), mask)
 def _bin_mode(grid, spatial, feature):
    '''Bin spatial in a grid, mode method.

    Each cell holds the most common feature value among the points it
    contains; when several values are equally common the smallest one is
    kept. Cells without any point are masked.

    This function aims for efficiency with ndarray but its run time grows
    linearly with the number of unique values in feature.
    '''
    log.info('Mode binning...')

    # Accept any array-like input: `.dtype` and fancy indexing are used below.
    spatial = np.asarray(spatial)
    feature = np.asarray(feature)

    values = np.unique(feature)
    if values.size > 10:
        log.warning('Mode called on data with {} unique values, processing may be long.'.format(values.size))

    # Best count seen so far in each cell, and the value that reached it.
    max_score = np.zeros([len(x) - 1 for x in grid])
    max_value = np.zeros_like(max_score, dtype=feature.dtype)

    # np.unique returns sorted values and the comparison below is strict, so
    # the smallest value is kept when several values share the best count.
    for i, value in enumerate(values, start=1):
        log.info('Processing value {}/{}'.format(i, values.size))
        selected = np.flatnonzero(feature == value)
        score, _ = np.histogramdd(spatial[selected], grid)
        winner = score > max_score
        max_score[winner] = score[winner]
        max_value[winner] = value

    return np.ma.masked_array(max_value, max_score == 0)
--- a/test/test_vxl.py
+++ b/test/test_vxl.py
@ -22,14 +22,31 @@ def data_pc(datadir, set_id):
    data = np.loadtxt(path)
    return Pcloud(data[:,:3], data[:,3])
-#def data_vxl(datadir, set_id, step, method):
+def data_vxl(datadir, set_id, grid_id, method):
-#    pass
+    def _unpack_vxl(spatial, feature):
        coords = tuple([spatial[:,i] for i in range(3)])
-@pytest.fixture
+        vxld = np.zeros(spatial.max(axis=0) + 1)
-def data_0_vxl():
+        vxld[coords] = feature
-    def _data_0_vxl(method, resolution):
+
-        if method == 'mean':
+        vxlm = np.ones_like(vxld, dtype=np.bool)
-            pass
+        vxlm[coords] = False
        return np.ma.masked_array(vxld, vxlm)
    def _load_vxl(fname, feature_name):
        '''Load the reference voxels stored in `fname`.

        The file holds one voxel per row with the columns
        `x y z density mean mode`; `feature_name` selects which of these
        columns is used as voxel value.
        '''
        fields = ('x', 'y', 'z', 'density', 'mean', 'mode')
        i = fields.index(feature_name)
        # Read the requested file instead of a hard-coded path relative to the
        # current working directory.
        data = np.loadtxt(str(fname))
        spatial = data[:,:3].astype(np.intp)
        feature = data[:,i]
        return _unpack_vxl(spatial, feature)
    path = datadir.join('pc{}_vxl_s{}.txt'.format(set_id, grid_id))
    return _load_vxl(path, method)
 def data_grid(datadir, set_id, step_id):
    def _read(fname):
@ -41,10 +58,15 @@ def data_grid(datadir, set_id, step_id):
    path = datadir.join('pc{}_grid_s{}.txt'.format(set_id, step_id))
    return _read(path)
-    
+
@pytest.mark.parametrize('set_id, step, grid_id', [
    ('0', 1., '1'),
    ('0', 2., '2'),
    ('0', .1, '0_1'),
    ('0', .6, '0_6'),
    ('0', .7, '0_7'),
    ('0', .15, '0_15'),
    ('0', [1.,1.,2.] , '1-1-2'),
 ])
 def test_get_grid(datadir, set_id, step, grid_id):
    spatial = data_pc(datadir, set_id).spatial
@ -62,14 +84,51 @@ def test_get_grid(datadir, set_id, step, grid_id):
    for axis_test, axis_truth in zip(test, res):
        assert axis_test.size == axis_truth.size, 'Wrong size for axis'
-        assert (axis_test == axis_truth).all(), 'Axis inequality between truth and test'
+        assert np.allclose(axis_test, axis_truth), 'Axis inequality between truth and test'
-                         
+        #assert (axis_test - axis_truth == 0).all(), 'Float overflow in tested grid'
 def test_grid():
    '''Placeholder test, nothing is checked yet.

    Checklist kept from the original placeholder:
    - dtype
    - method
    - mask
    - data
    '''
 def test_get_grid_ui():
    '''Check that `vxl.get_grid` rejects invalid `step` inputs.'''
    np.random.seed(0)
    spatial_2D = np.random.random((100,2))
    spatial_3D = np.random.random((100,3))

    # Negative step, given as a scalar or inside a sequence.
    with pytest.raises(ValueError):
        vxl.get_grid(spatial_3D, -1)
    with pytest.raises(ValueError):
        vxl.get_grid(spatial_3D, [1., -1., 1.])

    # Number of steps different from the number of spatial dimensions.
    with pytest.raises(ValueError):
        vxl.get_grid(spatial_3D, [1., 1.])
    with pytest.raises(ValueError):
        vxl.get_grid(spatial_2D, [1., 1., 1.])
 def test_bin_ui():
    '''Check the argument validation of `vxl.bin`.'''
    points = np.random.random((10,3))
    values = np.random.random((10))
    axis_bins = np.arange(0,1,.1)
    grid = [axis_bins] * 3

    # A method other than density cannot run without feature values.
    with pytest.raises(ValueError):
        vxl.bin(grid, points, method='mean')

    # Unknown method names are rejected.
    with pytest.raises(NotImplementedError):
        vxl.bin(grid, points, values, method='🍆')
@pytest.mark.parametrize('set_id, grid_id, method', [
    ('0', '1', 'density'),
    ('0', '1', 'mean'),
    ('0', '1', 'mode'),
 ])
 def test_bin(datadir, grid_id, set_id, method):
    '''Compare the output of `vxl.bin` with a reference voxel grid.

    Runs once per parametrized (set_id, grid_id, method) combination.
    '''
    # Point cloud, grid and expected voxels are all built by the data_*
    # helpers from `datadir` and the parametrized ids.
    data = data_pc(datadir, set_id)
    grid = data_grid(datadir, set_id, grid_id)
    truth = data_vxl(datadir, set_id, grid_id, method)
    test = vxl.bin(grid, data.spatial, data.feature, method)
    assert test is not None, 'Tested function did not return anything :('
    assert hasattr(test, 'mask'), 'The array is not masked!'
    # A grid axis with k edges defines k - 1 cells.
    assert test.shape == tuple([x.size - 1 for x in grid]), 'Voxel grid shape and test grid missmatch'
    assert (test.mask == truth.mask).all(), 'The returned mask is different from test truth'
    # Only the unmasked values are compared, with a floating point tolerance.
    assert np.allclose(test.compressed(), truth.compressed()), 'The returned values are different from test truth'
--- a/test/test_vxl/pc0_vxl_s1.txt
+++ b/test/test_vxl/pc0_vxl_s1.txt
@ -1,4 +1,4 @@
-# x y z density mean mode 
+# x y z density mean mode
 0 0 0 1 2 2
 0 2 1 4 10 5
 9 9 9 1 1 1