diff --git a/doc/api/index.rst b/doc/api/index.rst
index 39a5b530a..9a2b3b1a6 100644
--- a/doc/api/index.rst
+++ b/doc/api/index.rst
@@ -45,6 +45,7 @@ Model Selection
 
     train_test_split
     cross_val_score
+    BlockShuffleSplit
 
 Coordinate Manipulation
 -----------------------
diff --git a/doc/references.rst b/doc/references.rst
index 38a4bbcf1..10ae0bc7e 100644
--- a/doc/references.rst
+++ b/doc/references.rst
@@ -1,5 +1,7 @@
 References
 ==========
 
+.. [Roberts_etal2017] Roberts, D. R., Bahn, V., Ciuti, S., Boyce, M. S., Elith, J., Guillera-Arroita, G., et al. (2017). Cross-validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure. Ecography, 40(8), 913–929. https://doi.org/10.1111/ecog.02881
 .. [Sandwell1987] Sandwell, D. T. (1987). Biharmonic spline interpolation of GEOS-3 and SEASAT altimeter data. Geophysical Research Letters, 14(2), 139–142. https://doi.org/10.1029/GL014i002p00139
 .. [SandwellWessel2016] Sandwell, D. T., & Wessel, P. (2016). Interpolation of 2-D vector data using constraints from elasticity. Geophysical Research Letters, 43(20), 2016GL070340. https://doi.org/10.1002/2016GL070340
+.. [Valavi_etal2019] Valavi, R., Elith, J., Lahoz-Monfort, J. J., & Guillera-Arroita, G. (2019). blockCV: An R package for generating spatially or environmentally separated folds for k-fold cross-validation of species distribution models. Methods in Ecology and Evolution, 10(2), 225–232. https://doi.org/10.1111/2041-210X.13107
diff --git a/verde/__init__.py b/verde/__init__.py
index c0b5cf5bb..0fa4bee34 100644
--- a/verde/__init__.py
+++ b/verde/__init__.py
@@ -23,7 +23,7 @@
 from .trend import Trend
 from .chain import Chain
 from .spline import Spline, SplineCV
-from .model_selection import cross_val_score, train_test_split
+from .model_selection import cross_val_score, train_test_split, BlockShuffleSplit
 from .vector import Vector, VectorSpline2D
 from .projections import project_region, project_grid
diff --git a/verde/model_selection.py b/verde/model_selection.py
index bc4c3958d..d692c5537 100644
--- a/verde/model_selection.py
+++ b/verde/model_selection.py
@@ -1,16 +1,14 @@
 """
 Functions for automating model selection through cross-validation.
-
-Supports using a dask.distributed.Client object for parallelism. The
-DummyClient is used as a serial version of the parallel client.
 """
 import warnings
 
 import numpy as np
-from sklearn.model_selection import KFold, ShuffleSplit
+from sklearn.model_selection import KFold, ShuffleSplit, BaseCrossValidator
 from sklearn.base import clone
 
-from .base import check_fit_input
+from .base import check_fit_input, n_1d_arrays
+from .coordinates import block_split
 from .utils import dispatch
 
@@ -18,6 +16,268 @@
 warnings.simplefilter("default")
 
 
+# Pylint doesn't like X, y scikit-learn argument names.
+# pylint: disable=invalid-name,unused-argument
+
+
+class BlockShuffleSplit(BaseCrossValidator):
+    """
+    Random permutation of spatial blocks cross-validator.
+
+    Yields indices to split data into training and test sets. Data are first
+    grouped into rectangular blocks of size given by the *spacing* argument.
+    Alternatively, blocks can be defined by the number of blocks in each
+    dimension using the *shape* argument instead of *spacing*. The blocks are
+    then split into testing and training sets randomly.
+
+    The proportion of blocks assigned to each set is controlled by *test_size*
+    and/or *train_size*.
+    However, the total amount of actual data points in each set could be
+    different from these values since blocks can have a different number of
+    data points inside them. To guarantee that the proportion of actual data
+    is as close as possible to the proportion of blocks, this cross-validator
+    generates extra splits and selects the one whose proportion of data
+    points in each set is closest to the desired amount [Valavi_etal2019]_.
+    The number of balancing splits per iteration is controlled by the
+    *balancing* argument.
+
+    This cross-validator is preferred over
+    :class:`sklearn.model_selection.ShuffleSplit` for spatial data to avoid
+    overestimating cross-validation scores. This can happen because of the
+    inherent autocorrelation that is usually associated with this type of
+    data (points that are close together are more likely to have similar
+    values). See [Roberts_etal2017]_ for an overview of this topic.
+
+    .. note::
+
+        Like :class:`sklearn.model_selection.ShuffleSplit`, this
+        cross-validator cannot guarantee that all folds will be different,
+        although this is still very likely for sizeable datasets.
+
+    Parameters
+    ----------
+    spacing : float, tuple = (s_north, s_east), or None
+        The block size in the South-North and West-East directions,
+        respectively. A single value means that the spacing is equal in both
+        directions. If None, then *shape* **must be provided**.
+    shape : tuple = (n_north, n_east) or None
+        The number of blocks in the South-North and West-East directions,
+        respectively. If None, then *spacing* **must be provided**.
+    n_splits : int, default=10
+        Number of re-shuffling & splitting iterations.
+    test_size : float, int, or None, default=0.1
+        If float, should be between 0.0 and 1.0 and represent the proportion
+        of the dataset to include in the test split. If int, represents the
+        absolute number of test samples. If None, the value is set to the
+        complement of the train size.
+    train_size : float, int, or None, default=None
+        If float, should be between 0.0 and 1.0 and represent the
+        proportion of the dataset to include in the train split. If
+        int, represents the absolute number of train samples. If None,
+        the value is automatically set to the complement of the test size.
+    random_state : int, RandomState instance, or None, default=None
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+    balancing : int, default=10
+        The number of splits generated per iteration to try to balance the
+        amount of data in each set so that *test_size* and *train_size* are
+        respected. If 1, then no extra splits are generated (essentially
+        disabling the balancing). Must be >= 1.
+
+    See also
+    --------
+    train_test_split : Split a dataset into a training and a testing set.
+    cross_val_score : Score an estimator/gridder using cross-validation.
+
+    Examples
+    --------
+
+    >>> from verde import grid_coordinates
+    >>> import numpy as np
+    >>> # Make a regular grid of data points
+    >>> coords = grid_coordinates(region=(0, 3, -10, -7), spacing=1)
+    >>> # Need to convert the coordinates into a feature matrix
+    >>> X = np.transpose([i.ravel() for i in coords])
+    >>> shuffle = BlockShuffleSplit(spacing=1.5, n_splits=3, random_state=0)
+    >>> # These are the 1D indices of the points belonging to each set
+    >>> for train, test in shuffle.split(X):
+    ...     print("Train: {} Test: {}".format(train, test))
+    Train: [ 0  1  2  3  4  5  6  7 10 11 14 15] Test: [ 8  9 12 13]
+    Train: [ 2  3  6  7  8  9 10 11 12 13 14 15] Test: [0 1 4 5]
+    Train: [ 0  1  4  5  8  9 10 11 12 13 14 15] Test: [2 3 6 7]
+    >>> # A better way to visualize this is to create a 2D array and put
+    >>> # "train" or "test" in the corresponding locations.
+    >>> shape = coords[0].shape
+    >>> mask = np.full(shape=shape, fill_value="     ")
+    >>> for iteration, (train, test) in enumerate(shuffle.split(X)):
+    ...     # The index needs to be converted to 2D so we can index our matrix.
+    ...     mask[np.unravel_index(train, shape)] = "train"
+    ...     mask[np.unravel_index(test, shape)] = " test"
+    ...     print("Iteration {}:".format(iteration))
+    ...     print(mask)
+    Iteration 0:
+    [['train' 'train' 'train' 'train']
+     ['train' 'train' 'train' 'train']
+     [' test' ' test' 'train' 'train']
+     [' test' ' test' 'train' 'train']]
+    Iteration 1:
+    [[' test' ' test' 'train' 'train']
+     [' test' ' test' 'train' 'train']
+     ['train' 'train' 'train' 'train']
+     ['train' 'train' 'train' 'train']]
+    Iteration 2:
+    [['train' 'train' ' test' ' test']
+     ['train' 'train' ' test' ' test']
+     ['train' 'train' 'train' 'train']
+     ['train' 'train' 'train' 'train']]
+
+    """
+
+    def __init__(
+        self,
+        spacing=None,
+        shape=None,
+        n_splits=10,
+        test_size=0.1,
+        train_size=None,
+        random_state=None,
+        balancing=10,
+    ):
+        if spacing is None and shape is None:
+            raise ValueError("Either 'spacing' or 'shape' must be provided.")
+        if balancing < 1:
+            raise ValueError(
+                "The *balancing* argument must be >= 1. To disable balancing, use 1."
+            )
+        self.spacing = spacing
+        self.shape = shape
+        self.n_splits = n_splits
+        self.test_size = test_size
+        self.train_size = train_size
+        self.random_state = random_state
+        self.balancing = balancing
+
+    def _iter_test_indices(self, X=None, y=None, groups=None):
+        """
+        Generates integer indices corresponding to test sets.
+
+        Runs several iterations until a split is found that yields blocks
+        with approximately the right amount of data points in them.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, 2)
+            Columns should be the easting and northing coordinates of data
+            points, respectively.
+        y : array-like, shape (n_samples,)
+            The target variable for supervised learning problems. Always
+            ignored.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset
+            into train/test set. Always ignored.
+
+        Yields
+        ------
+        test : ndarray
+            The testing set indices for that split.
+
+        """
+        labels = block_split(
+            coordinates=(X[:, 0], X[:, 1]),
+            spacing=self.spacing,
+            shape=self.shape,
+            region=None,
+            adjust="spacing",
+        )[1]
+        block_ids = np.unique(labels)
+        # Generate many more splits so that we can pick and choose the ones
+        # that have the right balance of training and testing data.
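+        # For example, n_splits=10 with balancing=10 draws 100 candidate
+        # splits and keeps only the best balanced one out of every 10
+        # candidates in the loop below.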
+        shuffle = ShuffleSplit(
+            n_splits=self.n_splits * self.balancing,
+            test_size=self.test_size,
+            train_size=self.train_size,
+            random_state=self.random_state,
+        ).split(block_ids)
+        for _ in range(self.n_splits):
+            test_sets, balance = [], []
+            for _ in range(self.balancing):
+                # This is a false positive in pylint which is why the warning
+                # is disabled at the top of this file:
+                # https://github.com/PyCQA/pylint/issues/1830
+                # pylint: disable=stop-iteration-return
+                train_blocks, test_blocks = next(shuffle)
+                # pylint: enable=stop-iteration-return
+                train_points = np.where(np.isin(labels, block_ids[train_blocks]))[0]
+                test_points = np.where(np.isin(labels, block_ids[test_blocks]))[0]
+                # The proportion of data points assigned to each group should
+                # be close to the proportion of blocks assigned to each group.
+                balance.append(
+                    abs(
+                        train_points.size / test_points.size
+                        - train_blocks.size / test_blocks.size
+                    )
+                )
+                test_sets.append(test_points)
+            best = np.argmin(balance)
+            yield test_sets[best]
+
+    def split(self, X, y=None, groups=None):
+        """
+        Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, 2)
+            Columns should be the easting and northing coordinates of data
+            points, respectively.
+        y : array-like, shape (n_samples,)
+            The target variable for supervised learning problems. Always
+            ignored.
+        groups : array-like, with shape (n_samples,), optional
+            Group labels for the samples used while splitting the dataset
+            into train/test set. Always ignored.
+
+        Yields
+        ------
+        train : ndarray
+            The training set indices for that split.
+        test : ndarray
+            The testing set indices for that split.
+
+        """
+        if X.shape[1] != 2:
+            raise ValueError(
+                "X must have exactly 2 columns ({} given).".format(X.shape[1])
+            )
+        for train, test in super().split(X, y, groups):
+            yield train, test
+
+    def get_n_splits(self, X=None, y=None, groups=None):
+        """
+        Returns the number of splitting iterations in the cross-validator.
+
+        Parameters
+        ----------
+        X : object
+            Always ignored, exists for compatibility.
+        y : object
+            Always ignored, exists for compatibility.
+        groups : object
+            Always ignored, exists for compatibility.
+
+        Returns
+        -------
+        n_splits : int
+            Returns the number of splitting iterations in the cross-validator.
+
+        """
+        return self.n_splits
+
+
+# pylint: enable=invalid-name,unused-argument
+
+
 def train_test_split(coordinates, data, weights=None, **kwargs):
     r"""
     Split a dataset into a training and a testing set for cross-validation.
@@ -48,6 +308,11 @@ def train_test_split(coordinates, data, weights=None, **kwargs):
         Each is a tuple = (coordinates, data, weights) generated by separating
         the input values randomly.
 
+    See also
+    --------
+    cross_val_score : Score an estimator/gridder using cross-validation.
+    BlockShuffleSplit : Random permutation of spatial blocks cross-validator.
+
     Examples
     --------
 
@@ -106,7 +371,8 @@ def cross_val_score(
 
     By default, will use :class:`sklearn.model_selection.KFold` with
     ``n_splits=5`` and ``random_state=0`` to split the dataset. Any other
-    cross-validation class can be passed in through the *cv* argument.
+    cross-validation class from scikit-learn or Verde can be passed in through
+    the *cv* argument.
 
     Can optionally run in parallel using :mod:`dask`.
     To do this, use ``delayed=True`` to dispatch computations with
     :func:`dask.delayed` instead
@@ -135,7 +401,7 @@
         one data component is provided, you must provide a weights array for
         each data component (if not none).
     cv : None or cross-validation generator
-        Any scikit-learn cross-validation generator. Defaults to
+        Any scikit-learn or Verde cross-validation generator. Defaults to
         :class:`sklearn.model_selection.KFold`.
     client : None or dask.distributed.Client
         **DEPRECATED:** This option is deprecated and will be removed in Verde
@@ -155,6 +421,11 @@
         *delayed*, will be a list of Dask delayed objects (see the *delayed*
         option). If *client* is not None, then the scores will be futures.
 
+    See also
+    --------
+    train_test_split : Split a dataset into a training and a testing set.
+    BlockShuffleSplit : Random permutation of spatial blocks cross-validator.
+
     Examples
     --------
 
@@ -186,6 +457,24 @@
     >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
     1.00, 1.00, 1.00
 
+    Often, spatial data are autocorrelated (points that are close together
+    are more likely to have similar values), which can cause cross-validation
+    with random splits to overestimate the prediction accuracy
+    [Roberts_etal2017]_. To account for the autocorrelation, we can split the
+    data in blocks rather than randomly with :class:`verde.BlockShuffleSplit`:
+
+    >>> from verde import BlockShuffleSplit
+    >>> # spacing controls the size of the spatial blocks
+    >>> cross_validator = BlockShuffleSplit(
+    ...     spacing=2, n_splits=3, random_state=0
+    ... )
+    >>> scores = cross_val_score(model, coords, data, cv=cross_validator)
+    >>> print(', '.join(['{:.2f}'.format(score) for score in scores]))
+    1.00, 1.00, 1.00
+
+    We didn't see a difference here since our model and data are perfect. See
+    :ref:`model_evaluation` for an example with real data.
+
     If using many splits, we can speed up computations by running them in
     parallel with Dask:
 
@@ -216,10 +505,10 @@
     )
     if cv is None:
         cv = KFold(shuffle=True, random_state=0, n_splits=5)
-    ndata = data[0].size
+    feature_matrix = np.transpose(n_1d_arrays(coordinates, 2))
     fit_args = (coordinates, data, weights)
     scores = []
-    for train_index, test_index in cv.split(np.arange(ndata)):
+    for train_index, test_index in cv.split(feature_matrix):
         train = tuple(select(i, train_index) for i in fit_args)
         test = tuple(select(i, test_index) for i in fit_args)
         # Clone the estimator to avoid fitting the same object simultaneously
diff --git a/verde/tests/test_model_selection.py b/verde/tests/test_model_selection.py
index 224a4993b..4093c4fca 100644
--- a/verde/tests/test_model_selection.py
+++ b/verde/tests/test_model_selection.py
@@ -1,12 +1,14 @@
 """
 Test the model selection code (cross-validation, etc).
 """
+import pytest
 from sklearn.model_selection import ShuffleSplit
+import numpy as np
 import numpy.testing as npt
 from dask.distributed import Client
 
 from .. import Trend, grid_coordinates
-from ..model_selection import cross_val_score
+from ..model_selection import cross_val_score, BlockShuffleSplit
 
 
 def test_cross_val_score_client():
@@ -22,3 +24,44 @@
     client.close()
     assert len(scores) == nsplits
     npt.assert_allclose(scores, 1)
+
+
+def test_blockshufflesplit_n_splits():
+    "Make sure get_n_splits returns the correct value"
+    cv = BlockShuffleSplit(spacing=1, n_splits=14)
+    assert cv.get_n_splits() == 14
+
+
+def test_blockshufflesplit_fails_balancing():
+    "Should raise an exception if balancing < 1."
+    with pytest.raises(ValueError):
+        BlockShuffleSplit(spacing=1, balancing=0)
+
+
+def test_blockshufflesplit_fails_spacing_shape():
+    "Should raise an exception if not given spacing or shape."
+    with pytest.raises(ValueError):
+        BlockShuffleSplit()
+
+
+def test_blockshufflesplit_fails_data_shape():
+    "Should raise an exception if the X array doesn't have 2 columns."
+    cv = BlockShuffleSplit(spacing=1)
+    with pytest.raises(ValueError):
+        next(cv.split(np.ones(shape=(10, 4))))
+    with pytest.raises(ValueError):
+        next(cv.split(np.ones(shape=(10, 1))))
+
+
+@pytest.mark.parametrize("test_size", [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.9])
+def test_blockshufflesplit_balancing(test_size):
+    "Make sure that the sets have the right number of points"
+    coords = np.random.RandomState(seed=0).multivariate_normal(
+        mean=[5, -7.5], cov=[[4, 0], [0, 9]], size=1000,
+    )
+    npoints = coords.shape[0]
+    train_size = 1 - test_size
+    cv = BlockShuffleSplit(spacing=1, random_state=0, test_size=test_size, balancing=20)
+    for train, test in cv.split(coords):
+        npt.assert_allclose(train.size / npoints, train_size, atol=0.01)
+        npt.assert_allclose(test.size / npoints, test_size, atol=0.01)
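
Below is a minimal end-to-end sketch of how the new cross-validator is meant
to be used (not part of the diff; it assumes this patch is applied, and the
synthetic data and parameter choices are illustrative only):

    import numpy as np
    import verde as vd

    # Synthetic scattered observations of a first-order trend
    rng = np.random.RandomState(0)
    east, north = rng.uniform(0, 10, size=(2, 500))
    data = 5 + 2 * east - 3 * north

    # Shuffle whole 2-unit blocks into train/test sets instead of single
    # points to reduce score inflation from spatial autocorrelation
    cv = vd.BlockShuffleSplit(spacing=2, n_splits=5, random_state=0)
    scores = vd.cross_val_score(vd.Trend(degree=1), (east, north), data, cv=cv)
    print(scores)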