# Source code for skada.datasets._samples_generator

# Author: Theo Gnassounou <theo.gnassounou@inria.fr>
#         Remi Flamary <remi.flamary@polytechnique.edu>
#         Oleksii Kachaiev <kachayev@gmail.com>
#         Bueno Ruben <ruben.bueno@polytechnique.edu>
#
# License: BSD 3-Clause

import numbers

import numpy as np
from scipy import signal
from scipy.fftpack import irfft, rfft
from scipy.stats import multivariate_normal
from sklearn.datasets import make_blobs

from ._base import DomainAwareDataset


def _generate_unif_circle(n_samples, rng):
    """Sample points uniformly inside the unit disc.

    Parameters
    ----------
    n_samples : int
        Number of points to draw.
    rng : np.random.RandomState
        Random generator; the angles are drawn first, then the radii.

    Returns
    -------
    ndarray of shape (n_samples, 2)
        Cartesian coordinates of the sampled points.
    """
    theta = rng.rand(n_samples, 1) * 2 * np.pi
    # The square root of a uniform variable gives a radius whose density is
    # proportional to r, i.e. a uniform density over the disc area.
    radius = np.sqrt(rng.rand(n_samples, 1))
    return np.hstack((radius * np.cos(theta), radius * np.sin(theta)))


def _generate_data_2d_classif(
    n_samples,
    rng,
    mu_regression=None,
    sigma_regression=None,
    regression_scaling_constant=27,
    label="binary",
):
    """Generate the 2d toy dataset: one large disc and four small blobs.

    Parameters
    ----------
    n_samples : int
        Number of points in each of the four small Gaussian blobs. The disc
        receives ``4 * n_samples`` points, so ``8 * n_samples`` points are
        returned in total.
    rng : random generator
        Generator used for dataset creation.
    mu_regression : np.array, default=None
        Mean of the Gaussian used to build regression targets. Only used
        when ``label == 'regression'``; ``None`` means ``np.array([0, 0])``.
    sigma_regression : np.array, default=None
        Covariance of the Gaussian used to build regression targets. Only
        used when ``label == 'regression'``; ``None`` means the 2x2 identity.
    regression_scaling_constant : float, default=27
        Constant by which the Gaussian density is multiplied when
        ``label == 'regression'``.
    label : str, default='binary'
        If 'binary', the disc is labelled 0 and every blob 1.
        If 'multiclass', the disc is labelled 0 and the blobs 1 to 4.
        If 'regression', targets are the scaled Gaussian density at each
        point.

    Returns
    -------
    x : ndarray of shape (8 * n_samples, 2)
        Disc samples first, followed by the four blobs in order.
    y : ndarray of shape (8 * n_samples,)
        Integer labels, or float regression targets.

    Raises
    ------
    ValueError
        If ``label`` is not one of the three supported values.
    """
    reg_mean = np.array([0, 0]) if mu_regression is None else mu_regression
    reg_cov = (
        np.array([[1, 0], [0, 1]]) if sigma_regression is None else sigma_regression
    )

    n_blob = n_samples
    n_disc = n_blob * 4

    # Large cluster: uniform disc pushed through a linear map and translated.
    disc_transform = np.array([[2, -0.5], [-0.5, 2]])
    disc_center = np.array([2, 2])
    disc = _generate_unif_circle(n_disc, rng).dot(disc_transform) + disc_center[None, :]

    # Four small Gaussian blobs given as (linear map, center). The order
    # fixes the order in which random numbers are consumed.
    blob_specs = (
        ([[0.15, 0], [0, 0.3]], [-1.5, 3]),
        ([[0.2, -0.1], [-0.1, 0.2]], [-0.5, 1]),
        ([[0.17, -0.05], [-0.05, 0.17]], [1, -0.4]),
        ([[0.3, -0.0], [-0.0, 0.15]], [3, -1]),
    )
    blobs = []
    for transform, center in blob_specs:
        blob = rng.randn(n_blob, 2).dot(np.array(transform))
        blobs.append(blob + np.array(center)[None, :])

    x = np.concatenate([disc, *blobs], 0)

    if label == "binary":
        y = np.repeat([0, 1], [n_disc, 4 * n_blob])
    elif label == "multiclass":
        y = np.repeat(np.arange(5), [n_disc] + [n_blob] * 4)
    elif label == "regression":
        # Targets follow a (scaled) Gaussian density evaluated at each point.
        density = multivariate_normal(reg_mean, reg_cov)
        y = density.pdf(x) * regression_scaling_constant
    else:
        raise ValueError(
            f"Invalid label value: {label}. The label should either be "
            "'binary', 'multiclass' or 'regression'"
        )
    return x, y


def _generate_data_2d_classif_subspace(
    n_samples,
    rng,
    mu_regression=None,
    sigma_regression=None,
    regression_scaling_constant=27,
    label="binary",
):
    """Generate the 2d toy dataset used for the 'subspace' shift.

    Three isotropic Gaussian clusters are drawn: one large cluster with
    ``2 * n_samples`` points and two small clusters with ``n_samples`` points
    each.

    Parameters
    ----------
    n_samples : int
        Number of points in each small cluster. ``4 * n_samples`` points are
        returned in total.
    rng : random generator
        Generator used for dataset creation.
    mu_regression : float, default=None
        Mean of the 1d Gaussian used to build regression targets. Only used
        when ``label == 'regression'``; ``None`` means 0.
    sigma_regression : float, default=None
        Variance passed to the 1d Gaussian used to build regression targets.
        Only used when ``label == 'regression'``; ``None`` means 1.
    regression_scaling_constant : float, default=27
        Constant by which the Gaussian density is multiplied when
        ``label == 'regression'``.
    label : str, default='binary'
        If 'binary', the large cluster is labelled 0 and the rest 1.
        If 'multiclass', the first ``n1 + n1 % 4`` points (``n1`` being
        ``2 * n_samples``) are labelled 0 and the remaining points are split
        into four consecutive groups labelled 1 to 4.
        If 'regression', targets are the scaled 1d Gaussian density of
        ``(x0 + x1) / 2``.

    Returns
    -------
    x : ndarray of shape (4 * n_samples, 2)
        Large cluster first, then the two small clusters.
    y : ndarray of shape (4 * n_samples,)
        Integer labels, or float regression targets.

    Raises
    ------
    ValueError
        If ``label`` is not one of the three supported values.
    """
    reg_mean = 0 if mu_regression is None else mu_regression
    reg_var = 1 if sigma_regression is None else sigma_regression

    n_minor = n_samples
    n_major = n_minor * 2

    # (diagonal scale, center, size) of each cluster. The order fixes the
    # order in which random numbers are consumed.
    cluster_specs = (
        (0.5, [-1, 1], n_major),
        (0.1, [1.5, 0.5], n_minor),
        (0.2, [-0.5, -1.5], n_minor),
    )
    clusters = []
    for scale, center, size in cluster_specs:
        transform = np.array([[scale, 0], [0, scale]])
        clusters.append(rng.randn(size, 2).dot(transform) + np.array(center)[None, :])
    x = np.concatenate(clusters, 0)

    if label == "binary":
        y = np.repeat([0, 1], [n_major, 2 * n_minor])
    elif label == "multiclass":
        n_groups = 4
        group_size = n_major // n_groups
        # The remainder of the split is absorbed by label 0 so that the
        # number of labels always equals the number of points.
        counts = [n_major + n_major % n_groups] + [group_size] * n_groups
        y = np.repeat(np.arange(n_groups + 1), counts)
    elif label == "regression":
        density = multivariate_normal(reg_mean, reg_var)
        # Reduce each point to one coordinate along the direction y = x.
        along_diagonal = (x[:, 0] + x[:, 1]) / 2
        y = density.pdf(along_diagonal) * regression_scaling_constant
    else:
        raise ValueError(
            f"Invalid label value: {label}. The label should either be "
            "'binary', 'multiclass' or 'regression'"
        )
    return x, y


def _generate_data_from_moons(n_samples, index, rng):
    """Generate two Gaussian clusters whose centers are drawn from two moons.

    Each moon is discretised into 100 points. ``index`` selects the same
    position on both moons, and an isotropic Gaussian cluster (covariance
    ``0.01 * I``) is sampled around each selected point.

    Parameters
    ----------
    n_samples : int
        Number of points in each of the two clusters.
    index : float
        Position of the centers along the moons, in ``[0, 1]``. ``0`` is the
        first discretised point and ``1`` the last one.
    rng : random generator
        Generator used for dataset creation.

    Returns
    -------
    X : ndarray of shape (2 * n_samples, 2)
        Samples of the outer-moon cluster followed by the inner-moon cluster.
    y : ndarray of shape (2 * n_samples,)
        ``0.`` for the outer-moon cluster and ``1.`` for the inner-moon one.
    """
    n_samples_circ = 100
    theta = np.linspace(0, np.pi, n_samples_circ)
    outer_circ_x = np.cos(theta)
    outer_circ_y = np.sin(theta)
    inner_circ_x = 1 - np.cos(theta)
    inner_circ_y = 1 - np.sin(theta) - 0.5

    position = int(index * n_samples_circ)
    # ``index == 1`` designates the far end of the moons. Without this clamp
    # it maps one past the last discretised point and raises an IndexError.
    if position == n_samples_circ:
        position = n_samples_circ - 1

    cov = [[0.01, 0], [0, 0.01]]
    center1 = np.array([outer_circ_x[position], outer_circ_y[position]])
    center2 = np.array([inner_circ_x[position], inner_circ_y[position]])

    X = np.concatenate(
        [
            rng.multivariate_normal(center1, cov, size=n_samples),
            rng.multivariate_normal(center2, cov, size=n_samples),
        ]
    )
    y = np.concatenate([np.zeros(n_samples), np.ones(n_samples)])

    return X, y


def _generate_signal_with_peak_frequency(
    n_samples, n_channels, input_size, frequencies, band_size, sigma_freq, fs, rng
):
    """Generate band-limited noise signals, one frequency profile per class.

    For every class and every frequency of that class, white Gaussian noise
    is band-pass filtered around a randomly jittered frequency, scaled by a
    random per-channel weight and accumulated into the sample.

    Parameters
    ----------
    n_samples : int
        Number of signals generated per class.
    n_channels : int
        Number of channels of each signal.
    input_size : int
        Number of time samples of each signal.
    frequencies : ndarray of shape (n_classes, n_frequencies)
        Nominal lower edge of each pass band, per class.
    band_size : float
        Width of each pass band.
    sigma_freq : float
        The lower band edge is drawn from a normal distribution with standard
        deviation ``sigma_freq * band_size`` around the nominal frequency.
    fs : float
        Sampling frequency handed to the filter design.
    rng : random generator
        Generator used for dataset creation.

    Returns
    -------
    X : ndarray of shape (n_classes * n_samples, n_channels, input_size)
        Generated signals, grouped by class.
    y : ndarray of shape (n_classes * n_samples,)
        Class index of each signal.
    """
    X = []
    y = []
    n_classes, n_frequencies = frequencies.shape
    for n_class in range(n_classes):
        X_new = np.zeros((n_samples, n_channels, input_size))
        for n_frequency in range(n_frequencies):
            channel_weights = rng.uniform(0.5, 1, size=(n_channels))
            X_random = rng.normal(0, 1, size=(n_samples, n_channels, input_size))
            for i in range(n_samples):
                # Jitter the band edge; the small offset and the absolute
                # value keep it strictly positive for the filter design.
                frequency = abs(
                    rng.normal(
                        frequencies[n_class, n_frequency], sigma_freq * band_size
                    )
                    + 1e-5
                )
                sos = signal.butter(
                    10,
                    [frequency, frequency + band_size],
                    "bandpass",
                    fs=fs,
                    output="sos",
                )

                # X_random[i] is (n_channels, input_size); filtering runs
                # along the last (time) axis.
                X_filtered = signal.sosfilt(sos, X_random[i])

                # Weight each channel's spectrum by its own weight. The
                # transform must run along the time axis of each channel:
                # indexing ``X_filtered[:, j]`` would select time sample j
                # across channels and leave the channels unweighted.
                X_fft = rfft(X_filtered, axis=-1)
                X_new[i] += irfft(X_fft * channel_weights[:, None], axis=-1)
        X.append(X_new)
        y.append(np.full(n_samples, n_class, dtype=int))
    X = np.concatenate(X)
    y = np.concatenate(y)
    return X, y


def make_shifted_blobs(
    n_samples=100,
    n_features=2,
    shift=0.10,
    noise=None,
    centers=None,
    cluster_std=1.0,
    random_state=None,
    return_X_y=True,
    return_dataset=False,
):
    """Generate source and shifted target isotropic Gaussian blobs.

    The source domain is produced by :func:`sklearn.datasets.make_blobs`;
    the target domain is the same set of points translated by ``shift``.

    Parameters
    ----------
    n_samples : int, default=100
        It is the total number of points equally divided among clusters.
    n_features : int, default=2
        The number of features for each sample.
    shift : float or array like, default=0.10
        If float, it is the value of the translation for every target feature.
        If array_like, each element of the sequence indicates the value of
        the translation for each target features.
    noise : float or array_like, default=None
        If float, standard deviation of Gaussian noise added to the data.
        If array-like, ``noise[0]`` is the standard deviation of the noise
        added to the source data and ``noise[1]`` the one added to the
        target data.
    centers : int or ndarray of shape (n_centers, n_features), default=None
        The number of centers to generate, or the fixed center locations.
        Forwarded to :func:`sklearn.datasets.make_blobs`.
    cluster_std : float or array-like of float, default=1.0
        The standard deviation of the clusters.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    return_X_y : boolean, optional (default=True)
        Forwarded to ``DomainAwareDataset.pack`` when ``return_dataset`` is
        False; see Returns.
    return_dataset : boolean, optional (default=False)
        When set to `True`, the function returns
        :class:`~skada.datasets.DomainAwareDataset` object.

    Returns
    -------
    (X, y, sample_domain) : tuple if `return_X_y=True`
        Tuple of (data, target, sample_domain), see the description below.

    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        X: ndarray
            Samples from all sources and all targets given.
        y : ndarray
            Labels from all sources and all targets.
        sample_domain : ndarray
            The integer label for domain the sample was taken from.
            By convention, source domains have non-negative labels,
            and target domain label is always < 0.
        domain_names : dict
            The names of domains and associated domain labels.

    dataset : :class:`~skada.datasets.DomainAwareDataset`
        Dataset object, returned when ``return_dataset=True``.
    """
    # ``np.random.RandomState(instance)`` raises, so an existing generator
    # must be used as is to honour the documented ``random_state`` contract.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    X_source, y_source = make_blobs(
        n_samples=n_samples,
        centers=centers,
        n_features=n_features,
        random_state=random_state,
        cluster_std=cluster_std,
    )

    # The addition allocates a new array, so the in-place noise added to the
    # source below does not leak into the target.
    X_target = X_source + shift
    y_target = y_source

    if isinstance(noise, numbers.Real):
        X_source += rng.normal(scale=noise, size=X_source.shape)
        X_target += rng.normal(scale=noise, size=X_target.shape)
    elif noise is not None:
        X_source += rng.normal(scale=noise[0], size=X_source.shape)
        X_target += rng.normal(scale=noise[1], size=X_target.shape)

    dataset = DomainAwareDataset(
        domains=[
            (X_source, y_source, "s"),
            (X_target, y_target, "t"),
        ]
    )
    if return_dataset:
        return dataset
    return dataset.pack(as_sources=["s"], as_targets=["t"], return_X_y=return_X_y)


def make_shifted_datasets(
    n_samples_source=100,
    n_samples_target=100,
    shift="covariate_shift",
    noise=None,
    label="binary",
    ratio=0.9,
    mean=1,
    sigma=0.7,
    gamma=2,
    mu_regression=None,
    sigma_regression=None,
    regression_scaling_constant=27,
    center=((0, 2)),
    center_cov_shift=((0, 2)),
    random_state=None,
    return_X_y=True,
    return_dataset=False,
):
    """Generate source and shift target.

    Parameters
    ----------
    n_samples_source : int, default=100
        Number of points in one small source cluster. The source domain
        holds ``8 * n_samples_source`` points (``4 * n_samples_source`` when
        ``shift == 'subspace'``).
    n_samples_target : int, default=100
        Number of points in one small target cluster. The target domain
        holds ``8 * n_samples_target`` points (``4 * n_samples_target`` when
        ``shift == 'subspace'``).
    shift : str, default='covariate_shift'
        Choose the nature of the shift.
        If 'covariate_shift', use covariate shift.
        If 'target_shift', use target shift.
        If 'concept_drift', use concept drift.
        If 'subspace', a subspace where the classes are separable
        independently of the domains exists.
        See detailed description of each shift in [1]_.
    noise : float or array_like, default=None
        If float, standard deviation of Gaussian noise added to the data.
        If array-like, ``noise[0]`` is the standard deviation of the noise
        added to the source data and ``noise[1]`` the one added to the
        target data.
    label : str, default='binary'
        If 'binary', generates binary class labels.
        If 'multiclass', generates multiclass labels.
        If 'regression', generates regression's y-values.
    ratio : float, default=0.9
        Only used by 'target_shift': fraction of the target points drawn
        from the first half of the generated pool, i.e. from the large
        cluster (label 0 in the classification modes).
    mean : float, default=1
        Value of the translation in the concept drift.
    sigma : float, default=0.7
        Multiplicative value of the concept drift.
    gamma : float, default=2
        Parameter of the RBF kernel used to weight the candidate target
        points in the covariate shift.
    mu_regression : np.array|float, default=None
        Will only be used if label=='regression'.
        Should be a 2x1 matrix when shift != 'subspace' and a scalar when
        shift == 'subspace'.
    sigma_regression : np.array|float, default=None
        Will only be used if label=='regression'.
        Should be a 2x2 matrix when shift != 'subspace' and a scalar when
        shift == 'subspace'.
    regression_scaling_constant : float, default=27
        Constant by which we multiply the y-value when label=='regression'.
    center : array-like, default=(0, 2)
        Center of the distribution. Currently not used by the
        implementation; kept for backward compatibility.
    center_cov_shift : array-like, default=(0, 2)
        Center of the covariate shift: target points close to it are more
        likely to be selected.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    return_X_y : boolean, optional (default=True)
        Forwarded to ``DomainAwareDataset.pack`` when ``return_dataset`` is
        False; see Returns.
    return_dataset : boolean, optional (default=False)
        When set to `True`, the function returns
        :class:`~skada.datasets.DomainAwareDataset` object.

    Returns
    -------
    (X, y, sample_domain) : tuple if `return_X_y=True`
        Tuple of (data, target, sample_domain), see the description below.

    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        X: ndarray
            Samples from all sources and all targets given.
        y : ndarray
            Labels from all sources and all targets.
        sample_domain : ndarray
            The integer label for domain the sample was taken from.
            By convention, source domains have non-negative labels,
            and target domain label is always < 0.
        domain_names : dict
            The names of domains and associated domain labels.

    dataset : :class:`~skada.datasets.DomainAwareDataset`
        Dataset object, returned when ``return_dataset=True``.

    Raises
    ------
    ValueError
        If ``shift`` or ``label`` is not one of the supported values.

    References
    ----------
    .. [1]  Moreno-Torres, J. G., Raeder, T., Alaiz-Rodriguez,
            R., Chawla, N. V., and Herrera, F. (2012).
            A unifying view on dataset shift in classification.
            Pattern recognition, 45(1):521-530.
    """
    # ``np.random.RandomState(instance)`` raises, so an existing generator
    # must be used as is to honour the documented ``random_state`` contract.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    label_args = (
        mu_regression,
        sigma_regression,
        regression_scaling_constant,
        label,
    )

    if shift == "subspace":
        # Historically a generic source sample was drawn and then discarded
        # before the subspace data. The draw is kept so that a given
        # ``random_state`` yields the same dataset as before, but with the
        # default label settings: the scalar regression parameters meant for
        # the subspace generator are not valid for the 2d one.
        _generate_data_2d_classif(n_samples_source, rng)
        X_source, y_source = _generate_data_2d_classif_subspace(
            n_samples_source, rng, *label_args
        )
        X_target, y_target = _generate_data_2d_classif_subspace(
            n_samples_target, rng, *label_args
        )
        X_target *= -1
    else:
        X_source, y_source = _generate_data_2d_classif(
            n_samples_source, rng, *label_args
        )

        if shift == "covariate_shift":
            # Oversample candidates, then keep 8 * n_samples_target of them
            # with a probability that decays with the distance to
            # ``center_cov_shift``.
            n_samples_target_temp = n_samples_target * 100
            X_target, y_target = _generate_data_2d_classif(
                n_samples_target_temp, rng, *label_args
            )
            w = np.exp(
                -gamma * np.sum((X_target - np.array(center_cov_shift)) ** 2, 1)
            )
            w /= w.sum()
            isel = rng.choice(
                len(w), size=(8 * n_samples_target,), replace=False, p=w
            )
            X_target = X_target[isel]
            y_target = y_target[isel]

        elif shift == "target_shift":
            n_samples_target_temp = n_samples_target * 3
            X_target, y_target = _generate_data_2d_classif(
                n_samples_target_temp, rng, *label_args
            )
            n_samples1 = int(8 * n_samples_target * ratio)
            n_samples2 = 8 * n_samples_target - n_samples1
            # The first half of the pool is the large cluster, the second
            # half the four small ones.
            half = 8 * n_samples_target_temp // 2
            isel1 = rng.choice(half, size=(n_samples1,), replace=False)
            isel2 = rng.choice(half, size=(n_samples2,), replace=False) + half
            isel = np.concatenate((isel1, isel2))
            X_target = X_target[isel]
            y_target = y_target[isel]

        elif shift == "concept_drift":
            X_target, y_target = _generate_data_2d_classif(
                n_samples_target, rng, *label_args
            )
            X_target = X_target * sigma + mean

        else:
            raise ValueError(
                f"Invalid shift value: {shift}. The shift should either be "
                "'covariate_shift', 'target_shift', 'concept_drift' "
                "or 'subspace'"
            )

    if isinstance(noise, numbers.Real):
        X_source += rng.normal(scale=noise, size=X_source.shape)
        X_target += rng.normal(scale=noise, size=X_target.shape)
    elif noise is not None:
        X_source += rng.normal(scale=noise[0], size=X_source.shape)
        X_target += rng.normal(scale=noise[1], size=X_target.shape)

    dataset = DomainAwareDataset(
        domains=[
            (X_source, y_source, "s"),
            (X_target, y_target, "t"),
        ]
    )
    if return_dataset:
        return dataset
    return dataset.pack(as_sources=["s"], as_targets=["t"], return_X_y=return_X_y)


def _add_moons_domains(dataset, positions, n_samples, noise_scale, prefix, rng):
    """Generate moons domains, add them to ``dataset`` and return their names.

    A scalar ``positions`` yields a single domain named ``prefix``; an
    iterable yields one domain per element, named ``prefix0``, ``prefix1``...
    Gaussian noise of standard deviation ``noise_scale`` is added to the
    samples unless ``noise_scale`` is None.
    """
    if isinstance(positions, numbers.Real):
        named_positions = [(prefix, positions)]
    else:
        named_positions = [(f"{prefix}{i}", pos) for i, pos in enumerate(positions)]

    names = []
    for domain_name, pos in named_positions:
        X, y = _generate_data_from_moons(n_samples, pos, rng)
        if noise_scale is not None:
            X += rng.normal(scale=noise_scale, size=X.shape)
        dataset.add_domain(X, y, domain_name)
        names.append(domain_name)
    return names


def make_dataset_from_moons_distribution(
    n_samples_source=10,
    n_samples_target=10,
    noise=None,
    pos_source=0.1,
    pos_target=0.2,
    random_state=None,
    return_X_y=True,
    return_dataset=False,
):
    """Make dataset from moons.

    Parameters
    ----------
    n_samples_source : int, default=10
        It is the total number of points among one source cluster.
    n_samples_target : int, default=10
        It is the total number of points among one target cluster.
    noise : float or array_like, default=None
        If float, standard deviation of Gaussian noise added to the data.
        If array-like, ``noise[0]`` is the standard deviation of the noise
        added to the source data and ``noise[1]`` the one added to the
        target data.
    pos_source : float or array-like, default=0.1
        If float, indicate the center of the source cluster.
        If array-like, each element of the sequence indicates the position
        of the center of each source cluster; one source domain is created
        per element.
    pos_target : float or array-like, default=0.2
        If float, indicate the center of the target cluster.
        If array-like, each element of the sequence indicates the position
        of the center of each target cluster; one target domain is created
        per element.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    return_X_y : boolean, optional (default=True)
        Forwarded to ``DomainAwareDataset.pack`` when ``return_dataset`` is
        False; see Returns.
    return_dataset : boolean, optional (default=False)
        When set to `True`, the function returns
        :class:`~skada.datasets.DomainAwareDataset` object.

    Returns
    -------
    (X, y, sample_domain) : tuple if `return_X_y=True`
        Tuple of (data, target, sample_domain), see the description below.

    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        X: ndarray
            Samples from all sources and all targets given.
        y : ndarray
            Labels from all sources and all targets.
        sample_domain : ndarray
            The integer label for domain the sample was taken from.
            By convention, source domains have non-negative labels,
            and target domain label is always < 0.
        domain_names : dict
            The names of domains and associated domain labels.

    dataset : :class:`~skada.datasets.DomainAwareDataset`
        Dataset object, returned when ``return_dataset=True``.
    """
    # ``np.random.RandomState(instance)`` raises, so an existing generator
    # must be used as is to honour the documented ``random_state`` contract.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    if isinstance(noise, numbers.Real):
        noise_source = noise_target = noise
    elif noise is not None:
        noise_source, noise_target = noise[0], noise[1]
    else:
        noise_source = noise_target = None

    dataset = DomainAwareDataset(domains=[])
    # Sources are generated before targets so that the random stream is
    # consumed in a fixed order.
    sources = _add_moons_domains(
        dataset, pos_source, n_samples_source, noise_source, "s", rng
    )
    targets = _add_moons_domains(
        dataset, pos_target, n_samples_target, noise_target, "t", rng
    )

    if return_dataset:
        return dataset
    return dataset.pack(as_sources=sources, as_targets=targets, return_X_y=return_X_y)


def make_variable_frequency_dataset(
    n_samples_source=10,
    n_samples_target=10,
    n_channels=1,
    n_frequencies=1,
    n_classes=3,
    delta_f=1,
    band_size=1,
    sigma_freq=0.25,
    sigma_ch=1,
    noise=None,
    random_state=None,
    return_X_y=True,
    return_dataset=False,
):
    """Make dataset with different peak frequency.

    Signals have 3000 time samples and are generated with a sampling
    frequency of 100. Each class is associated with ``n_frequencies``
    distinct integer frequencies drawn below 15; the target domain uses the
    same frequencies shifted by ``delta_f``.

    Parameters
    ----------
    n_samples_source : int, default=10
        Number of source signals generated per class.
    n_samples_target : int, default=10
        Number of target signals generated per class.
    n_channels : int, default=1
        Number of channels in the signal.
    n_frequencies : int, default=1
        Number of frequency bands associated with each class.
        ``n_classes * n_frequencies`` distinct frequencies are drawn out of
        15 candidates, so the product cannot exceed 15.
    n_classes : int, default=3
        Number of classes in the signals. One class correspond to a
        specific frequency band.
    delta_f : float, default=1
        Band frequency shift of the target data.
    band_size : float, default=1
        Size of the frequency band.
    sigma_freq : float, default=0.25
        Relative jitter of the band position: the band edge of each signal
        is drawn with standard deviation ``sigma_freq * band_size``.
    sigma_ch : float, default=1
        Std for the gaussian on the channels. Currently not used by the
        implementation; kept for backward compatibility.
    noise : float or array_like, default=None
        If float, standard deviation of Gaussian noise added to the data.
        If array-like, ``noise[0]`` is the standard deviation of the noise
        added to the source data and ``noise[1]`` the one added to the
        target data.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int
        for reproducible output across multiple function calls.
    return_X_y : boolean, optional (default=True)
        Forwarded to ``DomainAwareDataset.pack`` when ``return_dataset`` is
        False; see Returns.
    return_dataset : boolean, optional (default=False)
        When set to `True`, the function returns
        :class:`~skada.datasets.DomainAwareDataset` object.

    Returns
    -------
    (X, y, sample_domain) : tuple if `return_X_y=True`
        Tuple of (data, target, sample_domain), see the description below.

    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        X: ndarray
            Samples from all sources and all targets given.
        y : ndarray
            Labels from all sources and all targets.
        sample_domain : ndarray
            The integer label for domain the sample was taken from.
            By convention, source domains have non-negative labels,
            and target domain label is always < 0.
        domain_names : dict
            The names of domains and associated domain labels.

    dataset : :class:`~skada.datasets.DomainAwareDataset`
        Dataset object, returned when ``return_dataset=True``.
    """
    # ``np.random.RandomState(instance)`` raises, so an existing generator
    # must be used as is to honour the documented ``random_state`` contract.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    input_size = 3000
    fs = 100
    highest_frequency = 15

    # Sampling without replacement gives every (class, band) pair a distinct
    # nominal frequency.
    frequencies = rng.choice(
        highest_frequency, size=(n_classes, n_frequencies), replace=False
    )

    X_source, y_source = _generate_signal_with_peak_frequency(
        n_samples_source,
        n_channels,
        input_size,
        frequencies,
        band_size,
        sigma_freq,
        fs,
        rng,
    )

    X_target, y_target = _generate_signal_with_peak_frequency(
        n_samples_target,
        n_channels,
        input_size,
        frequencies + delta_f,
        band_size,
        sigma_freq,
        fs,
        rng,
    )

    if isinstance(noise, numbers.Real):
        X_source += rng.normal(scale=noise, size=X_source.shape)
        X_target += rng.normal(scale=noise, size=X_target.shape)
    elif noise is not None:
        X_source += rng.normal(scale=noise[0], size=X_source.shape)
        X_target += rng.normal(scale=noise[1], size=X_target.shape)

    dataset = DomainAwareDataset(
        domains=[
            (X_source, y_source, "s"),
            (X_target, y_target, "t"),
        ]
    )
    if return_dataset:
        return dataset
    return dataset.pack(as_sources=["s"], as_targets=["t"], return_X_y=return_X_y)