# Source code for skada._reweight

# Author: Theo Gnassounou <theo.gnassounou@inria.fr>
#         Remi Flamary <remi.flamary@polytechnique.edu>
#         Oleksii Kachaiev <kachayev@gmail.com>
#         Bueno Ruben <ruben.bueno@polytechnique.edu>
#         Antoine Collas <contact@antoinecollas.fr>
#         Yanis Lalou <yanis.lalou@polytechnique.edu>
#
# License: BSD 3-Clause

import warnings
from abc import abstractmethod

import numpy as np
import scipy.sparse as sp
from scipy.stats import multivariate_normal
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import KERNEL_PARAMS, pairwise_distances, pairwise_kernels
from sklearn.model_selection import check_cv
from sklearn.neighbors import KernelDensity, KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted

from ._pipeline import make_da_pipeline
from ._utils import Y_Type, _estimate_covariance, _find_y_type
from .base import BaseAdapter, clone
from .utils import (
    check_X_domain,
    check_X_y_domain,
    extract_source_indices,
    qp_solve,
    source_target_split,
)

EPS = np.finfo(float).eps


class BaseReweightAdapter(BaseAdapter):
    """Base class for the adapter that yields weights for samples.

    Specific implementations should provide `fit` and `compute_weights`.
    The base class takes care of the rest of the machinery to make the
    adapter fully compatible with DA pipelines.
    """

    @abstractmethod
    def fit(self, X, y=None, *, sample_domain=None):
        pass

    @abstractmethod
    def compute_weights(self, X, y=None, *, sample_domain=None, **params) -> np.ndarray:
        pass

    def fit_transform(self, X, y=None, *, sample_domain=None, **params):
        """Predict adaptation weights and returns them as an additional
        parameters for the pipeline to propagate them into the estimator.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        X : array-like, shape (n_samples, n_features)
            The data (unchanged).
        sample_weight : array-like, shape (n_samples,)
            The weights of the samples.
        """
        self.fit(X, y=y, sample_domain=sample_domain, **params)
        weights = self.compute_weights(X, y=y, sample_domain=sample_domain, **params)
        return X, dict(sample_weight=weights)
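
# A minimal custom adapter sketch (illustrative, not part of the original
# module): subclasses only need `fit` and `compute_weights`; the base class
# turns the returned weights into a `sample_weight` fit parameter that the DA
# pipeline routes into the downstream estimator.
#
#     class UniformReweightAdapter(BaseReweightAdapter):
#         def fit(self, X, y=None, *, sample_domain=None):
#             return self
#
#         def compute_weights(self, X, y=None, *, sample_domain=None, **params):
#             return np.ones(X.shape[0])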


class DensityReweightAdapter(BaseReweightAdapter):
    """Adapter based on re-weighting samples using density estimation.

    Parameters
    ----------
    weight_estimator : estimator object, optional
        The estimator to use to estimate the densities of source and
        target observations. If None, a KernelDensity estimator is used.

    Attributes
    ----------
    weight_estimator_source_ : object
        The estimator object fitted on the source data.
    weight_estimator_target_ : object
        The estimator object fitted on the target data.
    """

    def __init__(self, weight_estimator=None):
        super().__init__()
        self.weight_estimator = weight_estimator or KernelDensity()

    def fit(self, X, y=None, *, sample_domain=None):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(X, sample_domain)
        X_source, X_target = source_target_split(X, sample_domain=sample_domain)
        source_estimator = clone(self.weight_estimator)
        source_estimator.fit(X_source)
        target_estimator = clone(self.weight_estimator)
        target_estimator.fit(X_target)
        self.weight_estimator_source_ = source_estimator
        self.weight_estimator_target_ = target_estimator
        return self

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, sample_domain = check_X_domain(X, sample_domain, allow_source=True)
        source_idx = extract_source_indices(sample_domain)
        (source_idx,) = np.where(source_idx)
        # `score_samples` returns log-densities, so the weight is the density
        # ratio p_target(x) / p_source(x)
        ws = self.weight_estimator_source_.score_samples(X[source_idx])
        wt = self.weight_estimator_target_.score_samples(X[source_idx])
        source_weights = np.exp(wt - ws)
        if source_weights.mean() != 0:
            source_weights /= source_weights.mean()
        else:
            warnings.warn("All weights are zero. Using uniform weights.")
            source_weights = np.ones_like(source_weights) / len(source_weights)
        weights = np.zeros(X.shape[0], dtype=source_weights.dtype)
        weights[source_idx] = source_weights
        return weights


def DensityReweight(
    base_estimator=None,
    weight_estimator=None,
):
    """Density re-weighting pipeline adapter and estimator.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, a LogisticRegression is used.
    weight_estimator : estimator object, optional
        The estimator to use to estimate the densities of source and
        target observations. If None, a KernelDensity estimator is used.

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the DensityReweight adapter and the base
        estimator.
    """
    if base_estimator is None:
        base_estimator = LogisticRegression().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        DensityReweightAdapter(weight_estimator=weight_estimator),
        base_estimator,
    )
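
# Example usage (a minimal sketch, not part of the original module; it assumes
# skada's `sample_domain` convention of positive values for source samples and
# negative values for target samples, with target labels masked as -1):
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     X_source = rng.normal(0.0, 1.0, size=(100, 2))
#     X_target = rng.normal(0.5, 1.0, size=(50, 2))  # covariate-shifted
#     X = np.concatenate([X_source, X_target])
#     y = np.concatenate([(X_source[:, 0] > 0).astype(int),
#                         -np.ones(50, dtype=int)])  # target labels masked
#     sample_domain = np.concatenate([np.ones(100), -np.ones(50)])
#     pipe = DensityReweight()
#     pipe.fit(X, y, sample_domain=sample_domain)
#     y_pred = pipe.predict(X_target)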


class GaussianReweightAdapter(BaseReweightAdapter):
    """Gaussian approximation re-weighting method.

    See [1]_ for details.

    Parameters
    ----------
    reg : 'auto' or float, default="auto"
        The regularization parameter of the covariance estimator.
        Possible values:

          - None: no shrinkage.
          - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.
          - float between 0 and 1: fixed shrinkage parameter.

    Attributes
    ----------
    `mean_source_` : array-like, shape (n_features,)
        Mean of the source data.
    `cov_source_` : array-like, shape (n_features, n_features)
        Covariance of the source data.
    `mean_target_` : array-like, shape (n_features,)
        Mean of the target data.
    `cov_target_` : array-like, shape (n_features, n_features)
        Covariance of the target data.

    References
    ----------
    .. [1] Hidetoshi Shimodaira. Improving predictive inference under
           covariate shift by weighting the log-likelihood function.
           In Journal of Statistical Planning and Inference, 2000.
    """

    def __init__(self, reg="auto"):
        super().__init__()
        self.reg = reg

    def fit(self, X, y=None, *, sample_domain=None):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(X, sample_domain)
        X_source, X_target = source_target_split(X, sample_domain=sample_domain)
        self.mean_source_ = X_source.mean(axis=0)
        self.cov_source_ = _estimate_covariance(X_source, shrinkage=self.reg)
        self.mean_target_ = X_target.mean(axis=0)
        self.cov_target_ = _estimate_covariance(X_target, shrinkage=self.reg)
        return self

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, sample_domain = check_X_domain(X, sample_domain, allow_source=True)
        source_idx = extract_source_indices(sample_domain)
        (source_idx,) = np.where(source_idx)
        gaussian_target = multivariate_normal.pdf(
            X[source_idx], self.mean_target_, self.cov_target_
        )
        gaussian_source = multivariate_normal.pdf(
            X[source_idx], self.mean_source_, self.cov_source_
        )
        # importance weight = ratio of fitted target to source Gaussian densities
        source_weights = gaussian_target / gaussian_source
        weights = np.zeros(X.shape[0], dtype=source_weights.dtype)
        weights[source_idx] = source_weights
        return weights


def GaussianReweight(
    base_estimator=None,
    reg="auto",
):
    """Gaussian approximation re-weighting pipeline adapter and estimator.

    See [1]_ for details.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, a LogisticRegression is used.
    reg : 'auto' or float, default="auto"
        The regularization parameter of the covariance estimator.
        Possible values:

          - None: no shrinkage.
          - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.
          - float between 0 and 1: fixed shrinkage parameter.

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the GaussianReweight adapter and the base
        estimator.

    References
    ----------
    .. [1] Hidetoshi Shimodaira. Improving predictive inference under
           covariate shift by weighting the log-likelihood function.
           In Journal of Statistical Planning and Inference, 2000.
    """
    if base_estimator is None:
        base_estimator = LogisticRegression().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        GaussianReweightAdapter(reg=reg),
        base_estimator,
    )
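
# Example usage (sketch, reusing X, y, sample_domain from the DensityReweight
# example above; `reg` is the shrinkage parameter of the covariance estimate):
#
#     pipe = GaussianReweight(reg=0.1)  # fixed shrinkage coefficient
#     pipe.fit(X, y, sample_domain=sample_domain)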


class DiscriminatorReweightAdapter(BaseReweightAdapter):
    """Discriminator re-weighting method.

    See [1]_ for details.

    Parameters
    ----------
    domain_classifier : sklearn classifier, optional
        Classifier used to predict the domains. If None, a
        LogisticRegression is used.

    Attributes
    ----------
    `domain_classifier_` : object
        The classifier object fitted on the source and target data.

    References
    ----------
    .. [1] Hidetoshi Shimodaira. Improving predictive inference under
           covariate shift by weighting the log-likelihood function.
           In Journal of Statistical Planning and Inference, 2000.
    """

    def __init__(self, domain_classifier=None):
        super().__init__()
        self.domain_classifier = domain_classifier or LogisticRegression()

    def fit(self, X, y=None, *, sample_domain=None):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(X, sample_domain)
        source_idx = extract_source_indices(sample_domain)
        (source_idx,) = np.where(source_idx)
        # domain labels for the discriminator: 0 for source, 1 for target
        y_domain = np.ones(X.shape[0], dtype=np.int32)
        y_domain[source_idx] = 0
        domain_classifier = clone(self.domain_classifier)
        domain_classifier.fit(X, y_domain)
        self.domain_classifier_ = domain_classifier
        return self

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, sample_domain = check_X_domain(X, sample_domain, allow_source=True)
        source_idx = extract_source_indices(sample_domain)
        # xxx(okachaiev): it seems to me that would work without np.where
        # as we only use this array for indexing
        (source_idx,) = np.where(source_idx)
        # P(target | x) for each source sample; the domain classifier was
        # fitted with label 0 for source and 1 for target
        probas = self.domain_classifier_.predict_proba(X[source_idx])[:, 1]
        probas = np.clip(probas, EPS, 1 - EPS)
        # importance weight w(x) = p_t(x) / p_s(x), proportional to
        # P(target | x) / P(source | x)
        source_weights = probas / (1 - probas)
        source_weights /= source_weights.mean()
        weights = np.zeros(X.shape[0], dtype=source_weights.dtype)
        weights[source_idx] = source_weights
        return weights


def DiscriminatorReweight(base_estimator=None, domain_classifier=None):
    """Discriminator re-weighting pipeline adapter and estimator.

    See [1]_ for details.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, a LogisticRegression is used.
    domain_classifier : sklearn classifier, optional
        Classifier used to predict the domains. If None, a
        LogisticRegression is used.

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the DiscriminatorReweight adapter and the base
        estimator.

    References
    ----------
    .. [1] Hidetoshi Shimodaira. Improving predictive inference under
           covariate shift by weighting the log-likelihood function.
           In Journal of Statistical Planning and Inference, 2000.
    """
    if base_estimator is None:
        base_estimator = LogisticRegression().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        DiscriminatorReweightAdapter(domain_classifier=domain_classifier),
        base_estimator,
    )
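
# Example usage (sketch): any classifier exposing `predict_proba` can serve as
# the domain discriminator, e.g. a random forest instead of the default
# LogisticRegression:
#
#     from sklearn.ensemble import RandomForestClassifier
#     pipe = DiscriminatorReweight(
#         domain_classifier=RandomForestClassifier(n_estimators=100)
#     )
#     pipe.fit(X, y, sample_domain=sample_domain)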


class KLIEPReweightAdapter(BaseReweightAdapter):
    """Kullback-Leibler Importance Estimation Procedure (KLIEPReweight).

    The idea of KLIEPReweight is to find an importance estimate w(x) such that
    the Kullback-Leibler (KL) divergence between the target input density
    p_target(x) and its estimate w(x)p_source(x) is minimized.

    See [3]_ for details.

    Parameters
    ----------
    gamma : float or array like
        Parameters for the kernels.
        If array like, compute the likelihood cross validation to choose
        the best parameter for the RBF kernel.
        If float, solve the optimization for the given kernel parameter.
    cv : int, cross-validation generator or an iterable, default=5
        Determines the cross-validation splitting strategy.
        If it is an int it is the number of folds for the cross validation.
    n_centers : int, default=100
        Number of kernel centers.
    tol : float, default=1e-6
        Tolerance for the stopping criterion in the optimization.
    max_iter : int, default=1000
        Maximum number of iterations before stopping the optimization.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for the choice of kernel centers
        and the cross-validation shuffling. Pass an int for reproducible
        output across multiple function calls.

    Attributes
    ----------
    `best_gamma_` : float
        The best gamma parameter for the RBF kernel chosen with the
        likelihood cross validation if several parameters are given as input.
    `alpha_` : array-like, shape (n_centers,)
        Solution of the optimization problem.
    `centers_` : array-like, shape (n_centers, n_features)
        Target samples taken as centers for the kernels.

    References
    ----------
    .. [3] Masashi Sugiyama et. al. Direct Importance Estimation
           with Model Selection and Its Application to Covariate Shift
           Adaptation. In NeurIPS, 2007.
    """

    def __init__(
        self,
        gamma,
        cv=5,
        n_centers=100,
        tol=1e-6,
        max_iter=1000,
        random_state=None,
    ):
        super().__init__()
        self.gamma = gamma
        self.cv = cv
        self.n_centers = n_centers
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X, y=None, *, sample_domain=None, **kwargs):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(
            X, sample_domain, allow_multi_source=True, allow_multi_target=True
        )
        X_source, X_target = source_target_split(X, sample_domain=sample_domain)

        if isinstance(self.gamma, list):
            # scale a local copy rather than mutating the `gamma` parameter
            gammas = [self._auto_scale_gammas(gamma, X) for gamma in self.gamma]
            self.best_gamma_ = self._likelihood_cross_validation(
                gammas, X_source, X_target
            )
        else:
            self.best_gamma_ = self._auto_scale_gammas(self.gamma, X)
        self.alpha_, self.centers_ = self._weights_optimization(
            self.best_gamma_, X_source, X_target
        )
        return self

    def _weights_optimization(self, gamma, X_source, X_target):
        """Optimization loop."""
        rng = check_random_state(self.random_state)
        n_targets = len(X_target)
        n_centers = np.min((n_targets, self.n_centers))

        centers = X_target[rng.choice(np.arange(n_targets), n_centers)]
        A = pairwise_kernels(X_target, centers, metric="rbf", gamma=gamma)
        b = pairwise_kernels(X_source, centers, metric="rbf", gamma=gamma)
        b = np.mean(b, axis=0)

        alpha = np.ones(n_centers)
        obj = np.sum(np.log(A @ alpha))
        for _ in range(self.max_iter):
            old_obj = obj
            # gradient ascent step on the target log-likelihood
            alpha += EPS * A.T @ (1 / (A @ alpha))
            # project back onto the constraints b @ alpha = 1, alpha >= 0
            alpha += (1 - b @ alpha) * b / (b @ b)
            alpha = (alpha > 0) * alpha
            alpha /= b @ alpha
            obj = np.sum(np.log(A @ alpha + EPS))
            if np.abs(obj - old_obj) < self.tol:
                break
        else:
            # for/else: only reached when the loop exhausts max_iter
            warnings.warn("Maximum iteration reached before convergence.")
        return alpha, centers

    def _likelihood_cross_validation(self, gammas, X_source, X_target):
        """Compute the likelihood cross validation.

        Used to choose the best parameter for the kernel.
        """
        log_liks = []
        # xxx(okachaiev): should this be done when fitting?
        rng = check_random_state(self.random_state)
        index = np.arange(len(X_target))
        rng.shuffle(index)
        # apply the shuffle so the CV folds are randomized
        X_target = X_target[index]
        cv = check_cv(self.cv)
        for this_gamma in gammas:
            this_log_lik = []
            for train, test in cv.split(X_target):
                alpha, centers = self._weights_optimization(
                    this_gamma,
                    X_source,
                    X_target[train],
                )
                A = pairwise_kernels(
                    X_target[test], centers, metric="rbf", gamma=this_gamma
                )
                weights = A @ alpha
                this_log_lik.append(np.mean(np.log(weights + EPS)))
            log_liks.append(np.mean(this_log_lik))
        best_gamma_ = gammas[np.argmax(log_liks)]

        return best_gamma_

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, sample_domain = check_X_domain(X, sample_domain, allow_source=True)
        source_idx = extract_source_indices(sample_domain)
        (source_idx,) = np.where(source_idx)
        A = pairwise_kernels(
            X[source_idx], self.centers_, metric="rbf", gamma=self.best_gamma_
        )
        source_weights = A @ self.alpha_
        weights = np.zeros(X.shape[0], dtype=source_weights.dtype)
        weights[source_idx] = source_weights
        return weights

    def _auto_scale_gammas(self, gamma, X):
        if isinstance(gamma, str):
            # Code snippet from sklearn SVC
            if gamma == "scale":
                # var = E[X^2] - E[X]^2 if sparse
                sparse = sp.issparse(X)
                X_var = (X.multiply(X)).mean() - (X.mean()) ** 2 if sparse else X.var()
                gamma = 1.0 / (X.shape[1] * X_var) if X_var != 0 else 1.0
            elif gamma == "auto":
                gamma = 1 / X.shape[1]
        return gamma
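
# For reference (a paraphrase of [3], not part of the original module): the
# fixed-point iteration in `_weights_optimization` above solves
#
#     max_alpha   sum_j log( sum_l alpha_l * k_gamma(x_j^target, c_l) )
#     s.t.        (1/n_source) sum_i sum_l alpha_l * k_gamma(x_i^source, c_l) = 1
#                 alpha >= 0
#
# i.e. it maximizes the target log-likelihood of the modelled density ratio
# w(x) = sum_l alpha_l * k_gamma(x, c_l) while constraining the average weight
# over the source samples to one.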


def KLIEPReweight(
    base_estimator=None,
    gamma=1.0,
    cv=5,
    n_centers=100,
    tol=1e-6,
    max_iter=1000,
    random_state=None,
):
    """KLIEPReweight pipeline adapter and estimator.

    See [3]_ for details.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, a LogisticRegression is used.
    gamma : float or array like
        Parameters for the kernels.
        If array like, compute the likelihood cross validation to choose
        the best parameter for the RBF kernel.
        If float, solve the optimization for the given kernel parameter.
    cv : int, cross-validation generator or an iterable, default=5
        Determines the cross-validation splitting strategy.
        If it is an int it is the number of folds for the cross validation.
    n_centers : int, default=100
        Number of kernel centers.
    tol : float, default=1e-6
        Tolerance for the stopping criterion in the optimization.
    max_iter : int, default=1000
        Maximum number of iterations before stopping the optimization.
    random_state : int, RandomState instance or None, default=None
        Determines random number generation for the choice of kernel centers
        and the cross-validation shuffling. Pass an int for reproducible
        output across multiple function calls.

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the KLIEPReweight adapter and the base estimator.

    References
    ----------
    .. [3] Masashi Sugiyama et. al. Direct Importance Estimation
           with Model Selection and Its Application to Covariate Shift
           Adaptation. In NeurIPS, 2007.
    """
    if base_estimator is None:
        base_estimator = LogisticRegression().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        KLIEPReweightAdapter(
            gamma=gamma,
            cv=cv,
            n_centers=n_centers,
            tol=tol,
            max_iter=max_iter,
            random_state=random_state,
        ),
        base_estimator,
    )
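
# Example usage (sketch): passing a list for `gamma` triggers likelihood
# cross-validation over the candidate RBF parameters:
#
#     pipe = KLIEPReweight(gamma=[0.1, 1.0, 10.0], cv=3, random_state=0)
#     pipe.fit(X, y, sample_domain=sample_domain)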


class NearestNeighborReweightAdapter(BaseReweightAdapter):
    """Adapter based on re-weighting samples using a KNN.

    See [24]_ for details.

    Parameters
    ----------
    n_neighbors : int, default=1
        Number of neighbors to use for the KNN.
    laplace_smoothing : bool, default=False
        If True, apply Laplace smoothing by adding 1 to all weights
        (to prevent any of them from being 0).
    weights : {'uniform', 'distance'}, callable or None, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Refer to the example entitled
        :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
        showing the impact of the `weights` parameter on the decision
        boundary.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.
    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        This parameter is expected to be positive.
    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_
        and the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.
    metric_params : dict, default=None
        Additional keyword arguments for the metric function.
    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
        Doesn't affect :meth:`fit` method.

    References
    ----------
    .. [24] Marco Loog. Nearest neighbor-based importance weighting.
            In 2012 IEEE International Workshop on Machine Learning for
            Signal Processing, pages 1–6. IEEE.
    """

    def __init__(
        self,
        n_neighbors=1,
        weights="uniform",
        algorithm="auto",
        leaf_size=30,
        p=2,
        metric="minkowski",
        metric_params=None,
        n_jobs=None,
        laplace_smoothing=False,
    ):
        super().__init__()
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.metric = metric
        self.metric_params = metric_params
        self.n_jobs = n_jobs
        self.laplace_smoothing = laplace_smoothing
        self.base_estimator = KNeighborsClassifier(
            n_neighbors=self.n_neighbors,
            weights=self.weights,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            p=self.p,
            metric=self.metric,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )

    def fit(self, X, y=None, *, sample_domain=None):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(X, sample_domain)
        X_source, X_target = source_target_split(X, sample_domain=sample_domain)
        self.X_source_fit = X_source
        # fit a KNN where each source sample is its own class (its index)
        indices_source = np.arange(X_source.shape[0])
        self.estimator_ = clone(self.base_estimator)
        self.estimator_.fit(X_source, indices_source)
        return self

    def _get_weights(self, estimator, Xt, n_source):
        """Count, for each source sample, how often it is predicted as the
        nearest neighbor of a target sample."""
        predictions = estimator.predict(Xt)
        unique, counts = np.unique(predictions, return_counts=True)
        weights = np.ones(n_source) * float(self.laplace_smoothing)
        weights[unique] += counts
        return weights

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, sample_domain = check_X_domain(X, sample_domain, allow_source=True)
        source_mask = extract_source_indices(sample_domain)
        (source_idx,) = np.where(source_mask)
        # reuse the estimator fitted in `fit` when the source data matches,
        # otherwise refit on the new source samples
        if np.array_equal(self.X_source_fit, X[source_idx]):
            estimator = self.estimator_
        else:
            estimator = clone(self.base_estimator)
            estimator.fit(X[source_idx], np.arange(len(source_idx)))
        weights = np.ones(X.shape[0])
        # use the boolean mask for the complement: `~` on the integer index
        # array would be a bitwise complement, not a set complement
        weights[source_idx] = self._get_weights(
            estimator, X[~source_mask], n_source=len(source_idx)
        )
        return weights


def NearestNeighborReweight(
    base_estimator=None,
    n_neighbors=1,
    weights="uniform",
    algorithm="auto",
    leaf_size=30,
    p=2,
    metric="minkowski",
    metric_params=None,
    n_jobs=None,
    laplace_smoothing=False,
):
    """Nearest-neighbor re-weighting pipeline adapter and estimator.

    The KNN parameters (`n_neighbors`, `weights`, `algorithm`, `leaf_size`,
    `p`, `metric`, `metric_params`, `n_jobs`) are forwarded to the
    KNeighborsClassifier used to estimate the weights.

    See [24]_ for details.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, a LogisticRegression is used.
    n_neighbors : int, default=1
        Number of neighbors to use for the KNN.
    laplace_smoothing : bool, default=False
        If True, apply Laplace smoothing by adding 1 to all weights
        (to prevent any of them from being 0).
    weights : {'uniform', 'distance'}, callable or None, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Refer to the example entitled
        :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py`
        showing the impact of the `weights` parameter on the decision
        boundary.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.
    p : float, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
        This parameter is expected to be positive.
    metric : str or callable, default='minkowski'
        Metric to use for distance computation. Default is "minkowski", which
        results in the standard Euclidean distance when p = 2. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_
        and the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.
    metric_params : dict, default=None
        Additional keyword arguments for the metric function.
    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
        Doesn't affect :meth:`fit` method.

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the NearestNeighborReweight adapter and the base
        estimator.

    References
    ----------
    .. [24] Marco Loog. Nearest neighbor-based importance weighting.
            In 2012 IEEE International Workshop on Machine Learning for
            Signal Processing, pages 1–6. IEEE.
    """
    if base_estimator is None:
        base_estimator = LogisticRegression().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        NearestNeighborReweightAdapter(
            n_neighbors=n_neighbors,
            weights=weights,
            algorithm=algorithm,
            leaf_size=leaf_size,
            p=p,
            metric=metric,
            metric_params=metric_params,
            n_jobs=n_jobs,
            laplace_smoothing=laplace_smoothing,
        ),
        base_estimator,
    )
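
# Example usage (sketch): the KNN parameters are forwarded to the underlying
# KNeighborsClassifier; Laplace smoothing keeps source samples that are never
# a nearest neighbor of a target sample from getting zero weight:
#
#     pipe = NearestNeighborReweight(n_neighbors=3, laplace_smoothing=True)
#     pipe.fit(X, y, sample_domain=sample_domain)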


class KMMReweightAdapter(BaseReweightAdapter):
    """Kernel Mean Matching (KMMReweight).

    The idea of KMMReweight is to find an importance estimate w(x) such that
    the Maximum Mean Discrepancy (MMD) divergence between the target input
    density p_target(x) and the reweighted source input density
    w(x)p_source(x) is minimized.

    See [23]_ for details.

    Parameters
    ----------
    kernel : str, default="rbf"
        Kernel name (passed to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`).
    gamma : float, default=None
        Parameter for the kernels.
    degree : int, default=3
        Parameter for the kernels.
    coef0 : float, default=1
        Parameter for the kernels.
    B : float, default=1000.0
        Weight upper bound.
    eps : float, default=None
        KMMReweight tolerance parameter. If `None`, eps is set to
        (sqrt(n_samples_source) - 1) / sqrt(n_samples_source).
    tol : float, default=1e-6
        Tolerance for the stopping criterion in the optimization.
    max_iter : int, default=1000
        Maximum number of iterations before stopping the optimization.
    smooth_weights : bool, default=False
        If True, the weights are "smoothed" using the kernel function.
    solver : str, default='frank-wolfe'
        Available solvers : ['frank-wolfe', 'scipy'].

    Attributes
    ----------
    `source_weights_` : array-like, shape (n_samples,)
        The learned source weights.
    `X_source_` : array-like, shape (n_samples, n_features)
        The source data.

    References
    ----------
    .. [23] J. Huang, A. Gretton, K. Borgwardt, B. Schölkopf and A. J. Smola.
            'Correcting sample selection bias by unlabeled data.'
            In NIPS, 2007.
    """

    def __init__(
        self,
        kernel="rbf",
        gamma=None,
        degree=3,
        coef0=1,
        B=1000.0,
        eps=None,
        tol=1e-6,
        max_iter=1000,
        smooth_weights=False,
        solver="frank-wolfe",
    ):
        super().__init__()
        self.kernel = kernel
        self.gamma = gamma
        self.degree = degree
        self.coef0 = coef0
        self.B = B
        self.eps = eps
        self.tol = tol
        self.max_iter = max_iter
        self.smooth_weights = smooth_weights
        self.solver = solver

        if kernel not in KERNEL_PARAMS:
            kernel_list = str(list(KERNEL_PARAMS.keys()))
            raise ValueError(
                "`kernel` argument should be included in %s,"
                " got '%s'" % (kernel_list, str(kernel))
            )

    def fit(self, X, y=None, *, sample_domain=None):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (target labels may be masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(
            X, sample_domain, allow_multi_source=True, allow_multi_target=True
        )
        X_source, X_target = source_target_split(X, sample_domain=sample_domain)
        self.source_weights_ = self._weights_optimization(X_source, X_target)
        self.X_source_ = X_source
        return self

    def _weights_optimization(self, X_source, X_target):
        """Weight optimization"""
        Kss = pairwise_kernels(
            X_source,
            metric=self.kernel,
            filter_params=True,
            gamma=self.gamma,
            degree=self.degree,
            coef0=self.coef0,
        )
        Kst = pairwise_kernels(
            X_source,
            X_target,
            metric=self.kernel,
            filter_params=True,
            gamma=self.gamma,
            degree=self.degree,
            coef0=self.coef0,
        )
        Ns = Kss.shape[0]
        kappa = Ns * Kst.mean(axis=1)

        if self.eps is None:
            eps = (np.sqrt(Ns) - 1) / np.sqrt(Ns)
        else:
            eps = self.eps

        # constraints: Ns * (1 - eps) <= sum(weights) <= Ns * (1 + eps)
        A = np.stack([np.ones(Ns), -np.ones(Ns)], axis=0)
        b = np.array([Ns * (1 + eps), -Ns * (1 - eps)])

        weights, _ = qp_solve(
            Kss,
            -kappa,
            A,
            b,
            lb=np.zeros(Ns),
            ub=np.ones(Ns) * self.B,
            tol=self.tol,
            max_iter=self.max_iter,
            solver=self.solver,
        )
        weights = np.array(weights).ravel()
        return weights

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, sample_domain = check_X_domain(X, sample_domain, allow_source=True)
        source_idx = extract_source_indices(sample_domain)
        if np.array_equal(self.X_source_, X[source_idx]) and not self.smooth_weights:
            source_weights = self.source_weights_
        else:
            # out-of-sample (or smoothed) weights: kernel-interpolate the
            # learned weights from the fitted source samples
            K = pairwise_kernels(
                X[source_idx],
                self.X_source_,
                metric=self.kernel,
                filter_params=True,
                gamma=self.gamma,
                degree=self.degree,
                coef0=self.coef0,
            )
            source_weights = K.dot(self.source_weights_)
        (source_idx,) = np.where(source_idx)
        weights = np.zeros(X.shape[0], dtype=source_weights.dtype)
        weights[source_idx] = source_weights
        return weights


def KMMReweight(
    base_estimator=None,
    kernel="rbf",
    gamma=None,
    degree=3,
    coef0=1,
    B=1000.0,
    eps=None,
    tol=1e-6,
    max_iter=1000,
    smooth_weights=False,
    solver="frank-wolfe",
):
    """KMMReweight pipeline adapter and estimator.

    See [23]_ for details.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, a LogisticRegression is used.
    kernel : str, default="rbf"
        Kernel name (passed to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`).
    gamma : float, default=None
        Parameter for the kernels.
    degree : int, default=3
        Parameter for the kernels.
    coef0 : float, default=1
        Parameter for the kernels.
    B : float, default=1000.0
        Weight upper bound.
    eps : float, default=None
        KMMReweight tolerance parameter. If `None`, eps is set to
        (sqrt(n_samples_source) - 1) / sqrt(n_samples_source).
    tol : float, default=1e-6
        Tolerance for the stopping criterion in the optimization.
    max_iter : int, default=1000
        Maximum number of iterations before stopping the optimization.
    smooth_weights : bool, default=False
        If True, the weights are "smoothed" using the kernel function.
    solver : str, default='frank-wolfe'
        Available solvers : ['frank-wolfe', 'scipy'].

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the KMMReweight adapter and the base estimator.

    References
    ----------
    .. [23] J. Huang, A. Gretton, K. Borgwardt, B. Schölkopf and A. J. Smola.
            'Correcting sample selection bias by unlabeled data.'
            In NIPS, 2007.
    """
    if base_estimator is None:
        base_estimator = LogisticRegression().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        KMMReweightAdapter(
            kernel=kernel,
            gamma=gamma,
            degree=degree,
            coef0=coef0,
            B=B,
            eps=eps,
            tol=tol,
            max_iter=max_iter,
            smooth_weights=smooth_weights,
            solver=solver,
        ),
        base_estimator,
    )
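
# Example usage (sketch): `B` bounds the individual source weights, and
# `smooth_weights=True` expresses the weights through the kernel so they
# extend to source samples not seen during `fit`:
#
#     pipe = KMMReweight(kernel="rbf", gamma=0.5, B=100.0, smooth_weights=True)
#     pipe.fit(X, y, sample_domain=sample_domain)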


class MMDTarSReweightAdapter(BaseReweightAdapter):
    """Target shift reweighting using MMD.

    The idea of MMDTarSReweight is to find an importance estimate beta(y)
    such that the Maximum Mean Discrepancy (MMD) divergence between the
    target input density p_target(x) and the beta-reweighted source input
    density is minimized, under the assumption of equal conditional
    distributions p(x|y) for both source and target domains.

    See Section 3 of [21]_ for details.

    .. warning::
        This adapter uses a nearest neighbors approach to compute weights
        when adapting on source data different from the fitted source data.

    Parameters
    ----------
    gamma : float or array like
        Parameters for the kernels.
    reg : float, default=1e-10
        Regularization parameter for the labels kernel matrix.
    tol : float, default=1e-6
        Tolerance for the stopping criterion in the optimization.
    max_iter : int, default=1000
        Maximum number of iterations before stopping the optimization.

    Attributes
    ----------
    `source_weights_` : array-like, shape (n_samples,)
        The learned source weights.
    `alpha_` : array-like, shape (n_classes,) or (n_samples,)
        The learned kernel weights.
    `X_source_` : array-like, shape (n_samples, n_features)
        The source data.

    References
    ----------
    .. [21] Kun Zhang et. al. Domain Adaptation under Target and Conditional
            Shift. In ICML, 2013.
    """

    def __init__(self, gamma, reg=1e-10, tol=1e-6, max_iter=1000):
        super().__init__()
        self.gamma = gamma
        self.reg = reg
        self.tol = tol
        self.max_iter = max_iter

    def _weights_optimization(self, X_source, X_target, y_source):
        """Weight optimization"""
        m, n = X_source.shape[0], X_target.shape[0]

        # check whether y is discrete or continuous
        self.discrete_ = discrete = _find_y_type(y_source) == Y_Type.DISCRETE

        # compute A
        L = pairwise_kernels(y_source.reshape(-1, 1), metric="rbf", gamma=self.gamma)
        K = pairwise_kernels(X_source, metric="rbf", gamma=self.gamma)
        omega = L @ np.linalg.inv(L + self.reg * np.eye(m))
        A = omega @ K @ omega.T

        # compute R
        if discrete:
            self.classes_ = classes = np.unique(y_source)
            R = np.zeros((m, len(classes)))
            for i, c in enumerate(classes):
                R[:, i] = (y_source == c).astype(int)
        else:
            self.classes_ = None
            R = L @ np.linalg.inv(L + self.reg * np.eye(m))

        # compute M
        K_cross = pairwise_kernels(X_target, X_source, metric="rbf", gamma=self.gamma)
        M = np.ones((1, n)) @ K_cross @ omega.T

        # solve the optimization problem
        #   min_alpha 0.5 * alpha^T P alpha + q^T alpha
        #   s.t.      0 <= R alpha <= B_beta
        #             m (1 - eps) <= 1^T R alpha <= m (1 + eps)
        P = R.T @ A @ R
        P = P + 1e-12 * np.eye(P.shape[0])  # make P positive semi-definite
        q = -(m / n) * ((M @ R).T).flatten()

        B_beta = 10
        eps = B_beta / (4 * np.sqrt(m))
        # inequality constraints A_cons @ alpha <= b_cons (kept distinct from
        # the quadratic-form matrix A above)
        A_cons = np.vstack(
            [-R, -np.sum(R, axis=0, keepdims=True), R, np.sum(R, axis=0, keepdims=True)]
        )
        b_cons = np.concatenate(
            [
                np.zeros(R.shape[0]),
                -np.array([m * (1 - eps)]),
                B_beta * np.ones(R.shape[0]),
                np.array([m * (1 + eps)]),
            ]
        )

        outputs = qp_solve(
            Q=P, c=q, A=A_cons, b=b_cons, tol=self.tol, max_iter=self.max_iter
        )
        alpha = outputs[0]

        weights = (R @ alpha).flatten()
        return weights, alpha

    def fit(self, X, y=None, *, sample_domain=None):
        """Fit adaptation parameters.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input data (source and target samples).
        y : array-like, shape (n_samples,)
            The labels (source labels are required; target labels may be
            masked).
        sample_domain : array-like, shape (n_samples,)
            The domain labels.

        Returns
        -------
        self : object
            Returns self.
        """
        X, sample_domain = check_X_domain(X, sample_domain)
        X_source, X_target, y_source, _ = source_target_split(
            X, y, sample_domain=sample_domain
        )
        self.X_source_ = X_source
        self.source_weights_, self.alpha_ = self._weights_optimization(
            X_source,
            X_target,
            y_source,
        )
        return self

    def compute_weights(self, X, y=None, *, sample_domain=None, **params):
        check_is_fitted(self)
        X, y, sample_domain = check_X_y_domain(
            X, y, sample_domain, allow_label_masks=True
        )
        source_idx = extract_source_indices(sample_domain)
        if np.array_equal(self.X_source_, X[source_idx]):
            source_weights = self.source_weights_
        else:
            if self.discrete_:
                # assign the class weights to the source samples
                y_source = y[source_idx]
                classes = self.classes_
                R = np.zeros((source_idx.sum(), len(classes)))
                for i, c in enumerate(classes):
                    R[:, i] = (y_source == c).astype(np.int32)
                source_weights = R @ self.alpha_
            else:
                # assign the nearest neighbor's weight to each source sample
                C = pairwise_distances(X[source_idx], self.X_source_)
                idx = np.argmin(C, axis=1)
                source_weights = self.source_weights_[idx]
        (source_idx,) = np.where(source_idx)
        weights = np.zeros(X.shape[0], dtype=source_weights.dtype)
        weights[source_idx] = source_weights
        # lift weights slightly to avoid zero or negative values coming from
        # the solver's numerical precision
        weights += 1e-13
        return weights


def MMDTarSReweight(
    base_estimator=None,
    gamma=1.0,
    reg=1e-10,
    tol=1e-6,
    max_iter=1000,
):
    """Target shift reweighting using MMD.

    See Section 3 of [21]_ for details.

    Parameters
    ----------
    base_estimator : sklearn estimator, default=None
        Estimator used for fitting and prediction.
        If None, an SVC is used.
    gamma : float or array like
        Parameters for the kernels.
    reg : float, default=1e-10
        Regularization parameter for the labels kernel matrix.
    tol : float, default=1e-6
        Tolerance for the stopping criterion in the optimization.
    max_iter : int, default=1000
        Maximum number of iterations before stopping the optimization.

    Returns
    -------
    pipeline : sklearn pipeline
        Pipeline containing the MMDTarSReweight adapter and the base
        estimator.

    References
    ----------
    .. [21] Kun Zhang et. al. Domain Adaptation under Target and Conditional
            Shift. In ICML, 2013.
    """
    if base_estimator is None:
        base_estimator = SVC().set_fit_request(sample_weight=True)

    return make_da_pipeline(
        MMDTarSReweightAdapter(gamma=gamma, reg=reg, tol=tol, max_iter=max_iter),
        base_estimator,
    )
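
# Example usage (sketch): unlike the covariate-shift adapters above, the
# weights here depend on the source labels y (target shift), so y must carry
# the true source labels while target labels stay masked:
#
#     pipe = MMDTarSReweight(gamma=1.0)
#     pipe.fit(X, y, sample_domain=sample_domain)  # default base estimator: SVC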