Claude Sonnet 3.5 - Fill-in: imbalanced-learn
Failed to run pytest on the test suite
Pytest collection failure.
Patch diff
diff --git a/imblearn/_config.py b/imblearn/_config.py
index 88884fe..35f4445 100644
--- a/imblearn/_config.py
+++ b/imblearn/_config.py
@@ -21,7 +21,9 @@ if sklearn_version < parse_version('1.3'):
def _get_threadlocal_config():
"""Get a threadlocal **mutable** configuration. If the configuration
does not exist, copy the default global configuration."""
- pass
+ if not hasattr(_threadlocal, 'config'):
+ _threadlocal.config = _global_config.copy()
+ return _threadlocal.config
def get_config():
"""Retrieve current values for configuration set by :func:`set_config`.
@@ -36,7 +38,7 @@ if sklearn_version < parse_version('1.3'):
config_context : Context manager for global scikit-learn configuration.
set_config : Set global scikit-learn configuration.
"""
- pass
+ return _get_threadlocal_config().copy()
def set_config(assume_finite=None, working_memory=None,
print_changed_only=None, display=None, pairwise_dist_chunk_size=
@@ -142,7 +144,26 @@ if sklearn_version < parse_version('1.3'):
config_context : Context manager for global scikit-learn configuration.
get_config : Retrieve current values of the global configuration.
"""
- pass
+ local_config = _get_threadlocal_config()
+
+ if assume_finite is not None:
+ local_config['assume_finite'] = assume_finite
+ if working_memory is not None:
+ local_config['working_memory'] = working_memory
+ if print_changed_only is not None:
+ local_config['print_changed_only'] = print_changed_only
+ if display is not None:
+ local_config['display'] = display
+ if pairwise_dist_chunk_size is not None:
+ local_config['pairwise_dist_chunk_size'] = pairwise_dist_chunk_size
+ if enable_cython_pairwise_dist is not None:
+ local_config['enable_cython_pairwise_dist'] = enable_cython_pairwise_dist
+ if transform_output is not None:
+ local_config['transform_output'] = transform_output
+ if enable_metadata_routing is not None:
+ local_config['enable_metadata_routing'] = enable_metadata_routing
+ if skip_parameter_validation is not None:
+ local_config['skip_parameter_validation'] = skip_parameter_validation
@contextmanager
def config_context(*, assume_finite=None, working_memory=None,
@@ -270,6 +291,19 @@ if sklearn_version < parse_version('1.3'):
...
ValueError: Input contains NaN...
"""
- pass
+ old_config = get_config().copy()
+ set_config(assume_finite=assume_finite,
+ working_memory=working_memory,
+ print_changed_only=print_changed_only,
+ display=display,
+ pairwise_dist_chunk_size=pairwise_dist_chunk_size,
+ enable_cython_pairwise_dist=enable_cython_pairwise_dist,
+ transform_output=transform_output,
+ enable_metadata_routing=enable_metadata_routing,
+ skip_parameter_validation=skip_parameter_validation)
+ try:
+ yield
+ finally:
+ set_config(**old_config)
else:
from sklearn._config import _get_threadlocal_config, _global_config, config_context, get_config
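
For reference, a minimal usage sketch of the backported configuration helpers above (assuming the patched imblearn._config module; on scikit-learn >= 1.3 the same names are simply re-exported):

    from imblearn._config import config_context, get_config

    baseline = get_config()  # a copy of the current settings

    # config_context restores the previous values on exit, even on error
    with config_context(assume_finite=True):
        assert get_config()["assume_finite"] is True

    assert get_config()["assume_finite"] == baseline["assume_finite"]
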
diff --git a/imblearn/base.py b/imblearn/base.py
index b4c50a8..b2727a8 100644
--- a/imblearn/base.py
+++ b/imblearn/base.py
@@ -27,7 +27,12 @@ class _ParamsValidationMixin:
the docstring of `validate_parameter_constraints` for a description of the
accepted constraints.
"""
- pass
+ if hasattr(self, "_parameter_constraints"):
+ validate_parameter_constraints(
+ self._parameter_constraints,
+ self.get_params(deep=False),
+ caller_name=self.__class__.__name__,
+ )
class SamplerMixin(_ParamsValidationMixin, BaseEstimator, metaclass=ABCMeta):
@@ -56,7 +61,11 @@ class SamplerMixin(_ParamsValidationMixin, BaseEstimator, metaclass=ABCMeta):
self : object
Return the instance itself.
"""
- pass
+ X, y = self._validate_data(X, y, reset=True)
+ self.sampling_strategy_ = check_sampling_strategy(
+ self.sampling_strategy, y, self._sampling_type
+ )
+ return self
def fit_resample(self, X, y):
"""Resample the dataset.
@@ -77,7 +86,21 @@ class SamplerMixin(_ParamsValidationMixin, BaseEstimator, metaclass=ABCMeta):
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
- pass
+ check_classification_targets(y)
+ arrays_transformer = ArraysTransformer(X, y)
+ X, y, binarize_y = self._check_X_y(X, y)
+
+ self.sampling_strategy_ = check_sampling_strategy(
+ self.sampling_strategy, y, self._sampling_type
+ )
+
+ output = self._fit_resample(X, y)
+
+ y_ = (label_binarize(output[1], classes=np.unique(y))
+ if binarize_y else output[1])
+
+ X_, y_ = arrays_transformer.transform(output[0], y_)
+ return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
@abstractmethod
def _fit_resample(self, X, y):
@@ -132,7 +155,11 @@ class BaseSampler(SamplerMixin, OneToOneFeatureMixin):
self : object
Return the instance itself.
"""
- pass
+ X, y = self._validate_data(X, y, reset=True)
+ self.sampling_strategy_ = check_sampling_strategy(
+ self.sampling_strategy, y, self._sampling_type
+ )
+ return self
def fit_resample(self, X, y):
"""Resample the dataset.
@@ -153,7 +180,21 @@ class BaseSampler(SamplerMixin, OneToOneFeatureMixin):
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
- pass
+ check_classification_targets(y)
+ arrays_transformer = ArraysTransformer(X, y)
+ X, y, binarize_y = self._check_X_y(X, y)
+
+ self.sampling_strategy_ = check_sampling_strategy(
+ self.sampling_strategy, y, self._sampling_type
+ )
+
+ output = self._fit_resample(X, y)
+
+ y_ = (label_binarize(output[1], classes=np.unique(y))
+ if binarize_y else output[1])
+
+ X_, y_ = arrays_transformer.transform(output[0], y_)
+ return (X_, y_) if len(output) == 2 else (X_, y_, output[2])
def is_sampler(estimator):
@@ -169,7 +210,8 @@ def is_sampler(estimator):
is_sampler : bool
True if estimator is a sampler, otherwise False.
"""
- pass
+ return (hasattr(estimator, 'fit_resample') and
+ hasattr(estimator, '_sampling_type'))
class FunctionSampler(BaseSampler):
@@ -291,7 +333,13 @@ class FunctionSampler(BaseSampler):
self : object
Return the instance itself.
"""
- pass
+ self._validate_params()
+
+ if self.validate:
+ X, y = self._validate_data(X, y, reset=True)
+
+ self.sampling_strategy_ = "bypass"
+ return self
def fit_resample(self, X, y):
"""Resample the dataset.
@@ -312,4 +360,22 @@ class FunctionSampler(BaseSampler):
y_resampled : array-like of shape (n_samples_new,)
The corresponding label of `X_resampled`.
"""
- pass
+ self._validate_params()
+
+ if self.validate:
+ X, y = self._validate_data(X, y, reset=True)
+
+ if self.func is None:
+ return X, y
+
+ # Inspect only the function's declared parameters, not all local names.
+ func_params = self.func.__code__.co_varnames[:self.func.__code__.co_argcount]
+ if 'y' in func_params:
+ if self.kw_args is None:
+ return self.func(X, y)
+ else:
+ return self.func(X, y, **self.kw_args)
+ else:
+ if self.kw_args is None:
+ return self.func(X)
+ else:
+ return self.func(X, **self.kw_args)
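
A small sketch of the dispatch that fit_resample implements above, using a toy resampling function (the names here are illustrative, not part of the library):

    import numpy as np
    from imblearn import FunctionSampler

    def take_first_three(X, y):
        # Toy resampler: keep the first three samples
        return X[:3], y[:3]

    X = np.arange(10).reshape(5, 2)
    y = np.array([0, 0, 1, 1, 1])

    X_res, y_res = FunctionSampler(func=take_first_three).fit_resample(X, y)
    assert X_res.shape == (3, 2) and y_res.shape == (3,)
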
diff --git a/imblearn/combine/_smote_enn.py b/imblearn/combine/_smote_enn.py
index 451604e..9d7655f 100644
--- a/imblearn/combine/_smote_enn.py
+++ b/imblearn/combine/_smote_enn.py
@@ -114,4 +114,22 @@ class SMOTEENN(BaseSampler):
def _validate_estimator(self):
"""Private function to validate SMOTE and ENN objects"""
- pass
+ if self.smote is None:
+ self.smote_ = SMOTE(
+ sampling_strategy=self.sampling_strategy,
+ random_state=self.random_state,
+ n_jobs=self.n_jobs,
+ )
+ else:
+ self.smote_ = clone(self.smote)
+
+ if self.enn is None:
+ self.enn_ = EditedNearestNeighbours(
+ sampling_strategy="all",
+ n_jobs=self.n_jobs,
+ )
+ else:
+ self.enn_ = clone(self.enn)
+
+ if isinstance(self.sampling_strategy, dict):
+ self.smote_.sampling_strategy = self.sampling_strategy
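
Usage sketch for the combined sampler above (the exact resampled counts depend on the data; the printed values are illustrative):

    from collections import Counter
    from sklearn.datasets import make_classification
    from imblearn.combine import SMOTEENN

    X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
    print(Counter(y))       # imbalanced two-class problem
    X_res, y_res = SMOTEENN(random_state=0).fit_resample(X, y)
    print(Counter(y_res))   # roughly balanced after SMOTE oversampling + ENN cleaning
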
diff --git a/imblearn/combine/_smote_tomek.py b/imblearn/combine/_smote_tomek.py
index 2bbf9bf..94e7ec8 100644
--- a/imblearn/combine/_smote_tomek.py
+++ b/imblearn/combine/_smote_tomek.py
@@ -111,5 +111,20 @@ class SMOTETomek(BaseSampler):
self.n_jobs = n_jobs
def _validate_estimator(self):
- """Private function to validate SMOTE and ENN objects"""
- pass
+ """Private function to validate SMOTE and Tomek objects"""
+ if self.smote is None:
+ self.smote_ = SMOTE(
+ sampling_strategy=self.sampling_strategy,
+ random_state=self.random_state,
+ n_jobs=self.n_jobs,
+ )
+ else:
+ self.smote_ = clone(self.smote)
+
+ if self.tomek is None:
+ self.tomek_ = TomekLinks(
+ sampling_strategy="all",
+ n_jobs=self.n_jobs,
+ )
+ else:
+ self.tomek_ = clone(self.tomek)
diff --git a/imblearn/datasets/_imbalance.py b/imblearn/datasets/_imbalance.py
index 53e40de..f78e9db 100644
--- a/imblearn/datasets/_imbalance.py
+++ b/imblearn/datasets/_imbalance.py
@@ -82,4 +82,20 @@ def make_imbalance(X, y, *, sampling_strategy=None, random_state=None,
>>> print(f'Distribution after imbalancing: {Counter(y_res)}')
Distribution after imbalancing: Counter({2: 30, 1: 20, 0: 10})
"""
- pass
+ # Validate the sampling strategy; make_imbalance always under-samples
+ sampling_strategy_ = check_sampling_strategy(
+     sampling_strategy, y, 'under-sampling', **kwargs
+ )
+
+ # Create a RandomUnderSampler instance
+ undersampler = RandomUnderSampler(
+ sampling_strategy=sampling_strategy_,
+ random_state=random_state,
+ )
+
+ # Fit and resample the data
+ X_resampled, y_resampled = undersampler.fit_resample(X, y)
+
+ if verbose:
+ print(f"Original dataset shape: {Counter(y)}")
+ print(f"Resampled dataset shape: {Counter(y_resampled)}")
+
+ return X_resampled, y_resampled
diff --git a/imblearn/datasets/_zenodo.py b/imblearn/datasets/_zenodo.py
index f9062b9..115a77a 100644
--- a/imblearn/datasets/_zenodo.py
+++ b/imblearn/datasets/_zenodo.py
@@ -185,4 +185,49 @@ def fetch_datasets(*, data_home=None, filter_data=None, download_if_missing
Imbalanced Data Learning and their Application in Bioinformatics."
Dissertation, Georgia State University, (2011).
"""
- pass
+ data_home = get_data_home(data_home=data_home)
+ zenodo_dir = join(data_home, "zenodo")
+ datasets = OrderedDict()
+
+ if not isfile(join(zenodo_dir, "tar_data.npz")):
+ if download_if_missing:
+ if verbose:
+ print("Downloading dataset from %s (14.2 MB)" % URL)
+ _fetch_remote(URL, zenodo_dir)
+ else:
+ raise IOError("Data not found and `download_if_missing` is False")
+
+ # Load the data from the Zenodo archive
+ with tarfile.open(join(zenodo_dir, "tar_data.npz"), "r:gz") as tar:
+ for member in tar.getmembers():
+ if member.name.endswith(POST_FILENAME):
+ f = tar.extractfile(member)
+ if f is None:
+ continue
+ data = np.load(BytesIO(f.read()))
+ X = data["data"]
+ y = data["label"]
+ name = member.name.split("/")[1]
+ datasets[name] = Bunch(data=X, target=y, DESCR=data["DESCR"].item())
+
+ # Filter the data if requested
+ if filter_data is not None:
+ filtered_datasets = OrderedDict()
+ for key in filter_data:
+ if isinstance(key, str):
+ if key in MAP_NAME_ID:
+ filtered_datasets[key] = datasets[MAP_NAME_ID_KEYS[MAP_NAME_ID[key] - 1]]
+ elif isinstance(key, int):
+ if key in MAP_ID_NAME:
+ filtered_datasets[MAP_ID_NAME[key]] = datasets[MAP_NAME_ID_KEYS[key - 1]]
+ datasets = filtered_datasets
+
+ # Shuffle the data if requested
+ if shuffle:
+ random_state = check_random_state(random_state)
+ for name in datasets:
+ datasets[name].data, datasets[name].target = shuffle_method(
+ datasets[name].data, datasets[name].target, random_state=random_state
+ )
+
+ return datasets
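
A hedged usage sketch for fetch_datasets (the first call downloads the ~14 MB Zenodo archive; the dataset name and shape below are taken from the imbalanced-learn benchmark collection and may differ):

    from imblearn.datasets import fetch_datasets

    datasets = fetch_datasets(filter_data=("ecoli",), download_if_missing=True)
    bunch = datasets["ecoli"]
    print(bunch.data.shape, bunch.target.shape)  # e.g. (336, 7) (336,)
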
diff --git a/imblearn/ensemble/_bagging.py b/imblearn/ensemble/_bagging.py
index b1905ed..e3463e7 100644
--- a/imblearn/ensemble/_bagging.py
+++ b/imblearn/ensemble/_bagging.py
@@ -266,12 +266,29 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):
def _validate_estimator(self, default=DecisionTreeClassifier()):
"""Check the estimator and the n_estimator attribute, set the
`estimator_` attribute."""
- pass
+ if self.estimator is None:
+ self.estimator_ = default
+ else:
+ self.estimator_ = clone(self.estimator)
+
+ if isinstance(self.estimator_, (Pipeline, BaseUnderSampler)):
+ raise ValueError(
+ f"'{type(self.estimator_).__name__}' is not supported for the"
+ " 'estimator' parameter. Use a classifier instead."
+ )
@property
def n_features_(self):
"""Number of features when ``fit`` is performed."""
- pass
+ # TODO: remove this property once scikit-learn >= 1.2 is required
+ warnings.warn(
+     "`n_features_` was deprecated in scikit-learn 1.0 and will be "
+     "removed in 1.2. Use `n_features_in_` instead.",
+     FutureWarning,
+ )
+ return self.n_features_in_
@_fit_context(prefer_skip_nested_validation=False)
def fit(self, X, y):
@@ -292,7 +309,63 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):
self : object
Fitted estimator.
"""
- pass
+ # Check parameters
+ self._validate_params()
+
+ # Check data
+ X, y = self._validate_data(
+ X, y, accept_sparse=["csr", "csc"], dtype=None,
+ force_all_finite=False, multi_output=True
+ )
+
+ # Check target type
+ y = check_target_type(y)
+ self.classes_ = np.unique(y)
+ self.n_classes_ = len(self.classes_)
+
+ # Check sampling_strategy
+ self._sampling_strategy = check_sampling_strategy(
+ self.sampling_strategy, y, is_ensemble=True
+ )
+
+ # Validate the estimator
+ self._validate_estimator()
+
+ # Initialize and validate the sampler
+ if self.sampler is None:
+ self.sampler_ = RandomUnderSampler(
+ sampling_strategy=self._sampling_strategy,
+ replacement=self.replacement,
+ random_state=self.random_state,
+ )
+ else:
+ self.sampler_ = clone(self.sampler)
+
+ # Remap y for numpy indexing
+ self._y = np.searchsorted(self.classes_, y)
+
+ # Initialize attributes
+ self.estimators_ = []
+ self.estimators_samples_ = []
+ self.estimators_features_ = []
+
+ # Parallel loop
+ n_more_estimators = self.n_estimators - len(self.estimators_)
+ n_jobs = effective_n_jobs(self.n_jobs)
+
+ if n_jobs == 1:
+ for _ in range(n_more_estimators):
+ self._parallel_build_estimators(X, self._y)
+ else:
+     # TODO: parallelize with joblib; fall back to a sequential loop so
+     # the method stays runnable (an empty else block is a SyntaxError).
+     for _ in range(n_more_estimators):
+         self._parallel_build_estimators(X, self._y)
+
+ # Set attributes
+ self.n_features_in_ = X.shape[1]
+
+ if hasattr(X, "columns"):
+ self.feature_names_in_ = np.asarray(X.columns)
+
+ return self
@available_if(_estimator_has('decision_function'))
def decision_function(self, X):
@@ -312,9 +385,37 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):
``classes_``. Regression and binary classification are special
cases with ``k == 1``, otherwise ``k==n_classes``.
"""
- pass
+ check_is_fitted(self)
+
+ # Check data
+ X = self._validate_data(
+ X, accept_sparse=['csr', 'csc'], dtype=None,
+ force_all_finite=False, reset=False
+ )
+
+ # Parallel loop
+ n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
+
+ all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
+ delayed(_parallel_decision_function)(
+ self.estimators_[starts[i]:starts[i + 1]],
+ self.estimators_features_[starts[i]:starts[i + 1]],
+ X
+ )
+ for i in range(n_jobs)
+ )
+
+ # Reduce
+ decisions = sum(all_decisions) / self.n_estimators
+
+ return decisions
@property
def base_estimator_(self):
"""Attribute for older sklearn version compatibility."""
- pass
+ warnings.warn(
+ "The `base_estimator_` attribute is deprecated in scikit-learn 1.2 "
+ "and will be removed in 1.4. Use `estimator_` instead.",
+ FutureWarning
+ )
+ return self.estimator_
diff --git a/imblearn/ensemble/_common.py b/imblearn/ensemble/_common.py
index f7dcb6e..7a28f55 100644
--- a/imblearn/ensemble/_common.py
+++ b/imblearn/ensemble/_common.py
@@ -8,7 +8,13 @@ def _estimator_has(attr):
First, we check the first fitted estimator if available, otherwise we
check the estimator attribute.
"""
- pass
+ def check(estimator):
+ return hasattr(estimator, attr)
+
+ return lambda self: (
+ check(self.estimators_[0]) if hasattr(self, "estimators_") and self.estimators_
+ else check(self.estimator)
+ )
_bagging_parameter_constraints = {'estimator': [HasMethods(['fit',
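
The helper follows scikit-learn's lazy-delegation pattern for conditional methods; a self-contained sketch of the same idea (the Wrapper class is hypothetical, not part of the library):

    from sklearn.base import BaseEstimator
    from sklearn.utils.metaestimators import available_if

    def _estimator_has(attr):
        def check(self):
            # Prefer the first fitted estimator, fall back to the template
            if hasattr(self, "estimators_") and self.estimators_:
                return hasattr(self.estimators_[0], attr)
            return hasattr(self.estimator, attr)
        return check

    class Wrapper(BaseEstimator):
        def __init__(self, estimator):
            self.estimator = estimator

        @available_if(_estimator_has("decision_function"))
        def decision_function(self, X):
            return self.estimator.decision_function(X)
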
diff --git a/imblearn/ensemble/_easy_ensemble.py b/imblearn/ensemble/_easy_ensemble.py
index 6d79bd6..afbbd18 100644
--- a/imblearn/ensemble/_easy_ensemble.py
+++ b/imblearn/ensemble/_easy_ensemble.py
@@ -187,16 +187,26 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
self.sampling_strategy = sampling_strategy
self.replacement = replacement
- def _validate_estimator(self, default=AdaBoostClassifier(algorithm='SAMME')
- ):
+ def _validate_estimator(self, default=AdaBoostClassifier(algorithm='SAMME')):
"""Check the estimator and the n_estimator attribute, set the
`estimator_` attribute."""
- pass
+ if self.estimator is None:
+ self.estimator_ = default
+ else:
+ self.estimator_ = clone(self.estimator)
+
+ if isinstance(self.estimator_, (AdaBoostClassifier)):
+ if not hasattr(self.estimator_, "n_estimators"):
+ self.estimator_.n_estimators = self.n_estimators
+ else:
+ raise ValueError("estimator must be an AdaBoostClassifier")
@property
def n_features_(self):
"""Number of features when ``fit`` is performed."""
- pass
+ warnings.warn(
+     "The `n_features_` attribute is deprecated and will be removed; "
+     "use `n_features_in_` instead.",
+     FutureWarning,
+ )
+ return self.n_features_in_
@_fit_context(prefer_skip_nested_validation=False)
def fit(self, X, y):
@@ -217,7 +227,49 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
self : object
Fitted estimator.
"""
- pass
+ # Check that X and y have correct shape
+ X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
+
+ # Check target type
+ y = check_target_type(y)
+ self.classes_ = np.unique(y)
+ self.n_classes_ = len(self.classes_)
+
+ # Check parameters
+ self._validate_estimator()
+
+ # Validate sampling_strategy
+ self.sampling_strategy_ = check_sampling_strategy(
+ self.sampling_strategy, y, 'under-sampling'
+ )
+
+ # Initialize random state
+ random_state = check_random_state(self.random_state)
+
+ # Initialize estimators_
+ self.estimators_ = []
+ self.estimators_features_ = []
+ self.estimators_samples_ = []
+
+ for i in range(self.n_estimators):
+ # Apply random undersampling
+ rus = RandomUnderSampler(
+ sampling_strategy=self.sampling_strategy_,
+ replacement=self.replacement,
+ random_state=random_state.randint(MAX_INT)
+ )
+ X_resampled, y_resampled = rus.fit_resample(X, y)
+
+ # Train AdaBoost on the resampled data
+ estimator = clone(self.estimator_)
+ estimator.random_state = random_state.randint(MAX_INT)
+ estimator.fit(X_resampled, y_resampled)
+
+ self.estimators_.append(estimator)
+ self.estimators_features_.append(np.arange(X.shape[1]))
+ self.estimators_samples_.append(rus.sample_indices_)
+
+ return self
@available_if(_estimator_has('decision_function'))
def decision_function(self, X):
@@ -237,9 +289,29 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
``classes_``. Regression and binary classification are special
cases with ``k == 1``, otherwise ``k==n_classes``.
"""
- pass
+ check_is_fitted(self)
+
+ # Check data
+ X = self._validate_data(X, accept_sparse=['csr', 'csc'], reset=False)
+
+ # Parallel loop
+ n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
+
+ all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
+ delayed(_parallel_decision_function)(
+ self.estimators_[starts[i]:starts[i + 1]],
+ self.estimators_features_[starts[i]:starts[i + 1]],
+ X
+ )
+ for i in range(n_jobs)
+ )
+
+ # Reduce
+ decisions = sum(all_decisions) / self.n_estimators
+
+ return decisions
@property
def base_estimator_(self):
"""Attribute for older sklearn version compatibility."""
- pass
+ return self.estimator_
diff --git a/imblearn/ensemble/_forest.py b/imblearn/ensemble/_forest.py
index 6a8de5d..1d9b486 100644
--- a/imblearn/ensemble/_forest.py
+++ b/imblearn/ensemble/_forest.py
@@ -400,14 +400,22 @@ class BalancedRandomForestClassifier(_ParamsValidationMixin,
def _validate_estimator(self, default=DecisionTreeClassifier()):
"""Check the estimator and the n_estimator attribute, set the
`estimator_` attribute."""
- pass
+ if self.estimator is None:
+     self.estimator_ = clone(default)
+ elif isinstance(self.estimator, DecisionTreeClassifier):
+     self.estimator_ = clone(self.estimator)
+ else:
+     raise ValueError("estimator must be a DecisionTreeClassifier")
def _make_sampler_estimator(self, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
Warning: This method should be used to properly instantiate new
sub-estimators.
"""
- pass
+ estimator = clone(self.estimator_)
+ estimator.set_params(**{p: getattr(self, p)
+ for p in self.estimator_params
+ if p != 'random_state'})
+ if random_state is not None:
+ estimator.set_params(random_state=random_state)
+ return estimator
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y, sample_weight=None):
@@ -436,7 +444,114 @@ class BalancedRandomForestClassifier(_ParamsValidationMixin,
self : object
The fitted instance.
"""
- pass
+ # Check input
+ X, y = self._validate_data(X, y, multi_output=True,
+ accept_sparse="csc", dtype=DTYPE)
+ if sample_weight is not None:
+ sample_weight = _check_sample_weight(sample_weight, X)
+
+ if issparse(X):
+ # Pre-sort indices to avoid that each individual tree of the
+ # ensemble sorts the indices.
+ X.sort_indices()
+
+ # Remap output
+ self.n_features_in_ = X.shape[1]
+
+ y = np.atleast_1d(y)
+ if y.ndim == 2 and y.shape[1] == 1:
+ warn("A column-vector y was passed when a 1d array was"
+ " expected. Please change the shape of y to "
+ "(n_samples,), for example using ravel().",
+ DataConversionWarning, stacklevel=2)
+
+ if y.ndim == 1:
+ # reshape is necessary to preserve the data contiguity against vs
+ # [:, np.newaxis] that does not.
+ y = np.reshape(y, (-1, 1))
+
+ self.n_outputs_ = y.shape[1]
+
+ y, expanded_class_weight = self._validate_y_class_weight(y)
+
+ if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
+ y = np.ascontiguousarray(y, dtype=DOUBLE)
+
+ if expanded_class_weight is not None:
+ if sample_weight is not None:
+ sample_weight = sample_weight * expanded_class_weight
+ else:
+ sample_weight = expanded_class_weight
+
+ # Get bootstrap sample size
+ n_samples_bootstrap = _get_n_samples_bootstrap(
+ n_samples=X.shape[0],
+ max_samples=self.max_samples
+ )
+
+ # Check parameters
+ self._validate_estimator()
+
+ if not self.bootstrap and self.oob_score:
+ raise ValueError("Out of bag estimation only available"
+ " if bootstrap=True")
+
+ random_state = check_random_state(self.random_state)
+
+ if not self.warm_start or not hasattr(self, "estimators_"):
+ # Free allocated memory, if any
+ self.estimators_ = []
+ self.estimators_features_ = []
+
+ n_more_estimators = self.n_estimators - len(self.estimators_)
+
+ if n_more_estimators < 0:
+ raise ValueError('n_estimators=%d must be larger or equal to '
+ 'len(estimators_)=%d when warm_start==True'
+ % (self.n_estimators, len(self.estimators_)))
+
+ elif n_more_estimators == 0:
+ warn("Warm-start fitting without increasing n_estimators does not "
+ "fit new trees.")
+ return self
+
+ # Parallel loop
+ n_jobs, n_estimators, starts = _partition_estimators(n_more_estimators,
+ self.n_jobs)
+ total_n_estimators = sum(n_estimators)
+
+ # Advance random state to state after training
+ # the first n_estimators
+ if self.warm_start and len(self.estimators_) > 0:
+ random_state.randint(MAX_INT, size=len(self.estimators_))
+
+ trees = []
+ for i in range(total_n_estimators):
+ tree = self._make_sampler_estimator(random_state=random_state.randint(MAX_INT))
+ trees.append(tree)
+
+ # Parallel loop
+ all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose,
+ **self._parallel_args())(
+ delayed(_parallel_build_trees)(
+ t, self, X, y, sample_weight, i, len(trees),
+ verbose=self.verbose, class_weight=self.class_weight,
+ n_samples_bootstrap=n_samples_bootstrap)
+ for i, t in enumerate(trees))
+
+ # Collect newly grown trees
+ self.estimators_.extend(tree for tree, _, _ in all_results)
+ self.estimators_features_.extend(features for _, features, _ in all_results)
+
+ if self.oob_score:
+ self._set_oob_score_and_attributes(X, y)
+
+ # Decapsulate classes_ attributes
+ if hasattr(self, "classes_") and self.n_outputs_ == 1:
+ self.n_classes_ = self.n_classes_[0]
+ self.classes_ = self.classes_[0]
+
+ return self
def _set_oob_score_and_attributes(self, X, y):
"""Compute and set the OOB score and attributes.
@@ -448,7 +563,14 @@ class BalancedRandomForestClassifier(_ParamsValidationMixin,
y : ndarray of shape (n_samples, n_outputs)
The target matrix.
"""
- pass
+ self.oob_decision_function_ = self._compute_oob_predictions(X, y)
+
+ self.oob_score_ = accuracy_score(
+     y, np.argmax(self.oob_decision_function_, axis=1)
+ )
+
+ if self.oob_decision_function_.shape[-1] == 1:
+     # Drop the trailing singleton dimension; done after scoring so that
+     # the argmax above always receives a 2D array.
+     self.oob_decision_function_ = self.oob_decision_function_.ravel()
def _compute_oob_predictions(self, X, y):
"""Compute and set the OOB score.
@@ -462,12 +584,50 @@ class BalancedRandomForestClassifier(_ParamsValidationMixin,
Returns
-------
- oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or (n_samples, 1, n_outputs)
+ oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or
+ (n_samples, 1, n_outputs)
The OOB predictions.
"""
- pass
+ n_samples = y.shape[0]
+ n_classes = self.n_classes_
+ n_outputs = self.n_outputs_
+
+ oob_pred = np.zeros((n_samples, n_classes, n_outputs))
+ n_oob_pred = np.zeros((n_samples, n_outputs))
+
+ for estimator, features in zip(self.estimators_, self.estimators_features_):
+ unsampled_indices = _generate_unsampled_indices(
+ estimator.random_state, n_samples
+ )
+
+ y_pred = estimator.predict_proba(
+ X[unsampled_indices, :][:, features]
+ )
+ y_pred = np.array(y_pred, copy=False)
+
+ if n_outputs == 1:
+ y_pred = y_pred[..., np.newaxis]
+
+ oob_pred[unsampled_indices] += y_pred
+ n_oob_pred[unsampled_indices] += 1
+
+ for k in range(n_outputs):
+ if (n_oob_pred == 0).any():
+ warn("Some inputs do not have OOB scores. This probably means "
+ "too few trees were used to compute any reliable OOB "
+ "estimates.")
+ n_oob_pred[n_oob_pred == 0] = 1
+ oob_pred[..., k] /= n_oob_pred[..., k, np.newaxis]
+
+ if n_outputs == 1:
+ oob_pred = oob_pred[..., 0]
+
+ return oob_pred
@property
def n_features_(self):
"""Number of features when ``fit`` is performed."""
- pass
+ warn("The `n_features_` attribute is deprecated in 1.0 and will be "
+ "removed in 1.2. Use `n_features_in_` instead.",
+ FutureWarning)
+ return self.n_features_in_
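
Usage sketch for the forest above; passing sampling_strategy, replacement, and bootstrap explicitly sidesteps the change-of-behaviour warnings exercised by the tests below:

    from sklearn.datasets import make_classification
    from imblearn.ensemble import BalancedRandomForestClassifier

    X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
    clf = BalancedRandomForestClassifier(
        n_estimators=50, sampling_strategy="all", replacement=True,
        bootstrap=False, random_state=0,
    )
    clf.fit(X, y)
    print(clf.predict(X[:5]))
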
diff --git a/imblearn/ensemble/_weight_boosting.py b/imblearn/ensemble/_weight_boosting.py
index 26f43c4..30c6004 100644
--- a/imblearn/ensemble/_weight_boosting.py
+++ b/imblearn/ensemble/_weight_boosting.py
@@ -193,26 +193,224 @@ class RUSBoostClassifier(_ParamsValidationMixin, AdaBoostClassifier):
self : object
Returns self.
"""
- pass
+ # Check that algorithm is supported
+ if self.algorithm not in ('SAMME', 'SAMME.R'):
+ raise ValueError("algorithm %s is not supported" % self.algorithm)
+
+ # Check parameters
+ if self.learning_rate <= 0:
+ raise ValueError("learning_rate must be greater than zero")
+
+ # dtype used for sample weights and, for tree estimators, for X itself
+ DTYPE = np.float64
+ if self.estimator is None or isinstance(self.estimator, DecisionTreeClassifier):
+     dtype = DTYPE
+     accept_sparse = 'csc'
+ else:
+     dtype = None
+     accept_sparse = ['csr', 'csc']
+
+ X, y = self._validate_data(
+ X, y, accept_sparse=accept_sparse, dtype=dtype, y_numeric=False,
+ force_all_finite=False
+ )
+ check_target_type(y)
+ self.classes_ = np.unique(y)
+ n_classes = len(self.classes_)
+ n_samples = X.shape[0]
+
+ # Check sample_weight
+ if sample_weight is None:
+ # Initialize weights to 1 / n_samples
+ sample_weight = np.empty(n_samples, dtype=DTYPE)
+ sample_weight[:] = 1. / n_samples
+ else:
+ sample_weight = _check_sample_weight(sample_weight, X, DTYPE)
+
+ self._validate_estimator()
+
+ # Check random state
+ random_state = check_random_state(self.random_state)
+
+ # Clear any previous fit results
+ self.estimators_ = []
+ self.estimator_weights_ = np.zeros(self.n_estimators, dtype=DTYPE)
+ self.estimator_errors_ = np.ones(self.n_estimators, dtype=DTYPE)
+
+ # Create and fit base sampler
+ self.base_sampler_ = RandomUnderSampler(
+ sampling_strategy=self.sampling_strategy,
+ replacement=self.replacement,
+ random_state=random_state,
+ )
+
+ for iboost in range(self.n_estimators):
+ # Random undersampling
+ X_resampled, y_resampled = self.base_sampler_.fit_resample(X, y)
+ sample_weight_resampled = _safe_indexing(sample_weight, self.base_sampler_.sample_indices_)
+
+ # Boosting step
+ sample_weight_resampled, estimator_weight, estimator_error = self._boost(
+ iboost,
+ X_resampled, y_resampled,
+ sample_weight_resampled,
+ random_state
+ )
+
+ # Early termination
+ if sample_weight_resampled is None:
+ break
+
+ self.estimator_weights_[iboost] = estimator_weight
+ self.estimator_errors_[iboost] = estimator_error
+
+ # Stop if error is zero
+ if estimator_error == 0:
+ break
+
+ sample_weight_sum = np.sum(sample_weight_resampled)
+
+ # Stop if the sum of sample weights has become non-positive
+ if sample_weight_sum <= 0:
+ break
+
+ if iboost < self.n_estimators - 1:
+ # Normalize
+ sample_weight_resampled /= sample_weight_sum
+
+ return self
def _validate_estimator(self):
"""Check the estimator and the n_estimator attribute.
Sets the `estimator_` attributes.
"""
- pass
+ super()._validate_estimator()
+
+ if not has_fit_parameter(self.estimator_, "sample_weight"):
+ raise ValueError("%s doesn't support sample_weight."
+ % self.estimator_.__class__.__name__)
def _make_sampler_estimator(self, append=True, random_state=None):
"""Make and configure a copy of the `base_estimator_` attribute.
Warning: This method should be used to properly instantiate new
sub-estimators.
"""
- pass
+ estimator = clone(self.estimator_)
+ estimator.set_params(**{p: getattr(self, p)
+ for p in self.estimator_params})
+
+ if random_state is not None:
+ _set_random_states(estimator, random_state)
+
+ if append:
+ self.estimators_.append(estimator)
+
+ return estimator
def _boost_real(self, iboost, X, y, sample_weight, random_state):
"""Implement a single boost using the SAMME.R real algorithm."""
- pass
+ estimator = self._make_sampler_estimator(random_state=random_state)
+
+ estimator.fit(X, y, sample_weight=sample_weight)
+
+ y_predict_proba = estimator.predict_proba(X)
+
+ if iboost == 0:
+ self.classes_ = getattr(estimator, 'classes_', None)
+ self.n_classes_ = len(self.classes_)
+
+ y_predict = self.classes_.take(np.argmax(y_predict_proba, axis=1),
+ axis=0)
+
+ # Instances incorrectly classified
+ incorrect = y_predict != y
+
+ # Error fraction
+ estimator_error = np.mean(
+ np.average(incorrect, weights=sample_weight, axis=0))
+
+ # Stop if classification is perfect
+ if estimator_error <= 0:
+ return sample_weight, 1., 0.
+
+ # Construct y coding as described in Zhu et al [2]:
+ #
+ # y_k = 1 if c == k else -1 / (K - 1)
+ #
+ # where K == n_classes_ and c, k in [0, K) are indices along the second
+ # axis of the y coding with c being the index corresponding to the true
+ # class label.
+ n_classes = self.n_classes_
+ classes = self.classes_
+ y_codes = np.array([-1. / (n_classes - 1), 1.])
+ y_coding = y_codes.take(classes == y[:, np.newaxis])
+
+ # Displace zero probabilities so the log is defined.
+ # Also fix negative elements which may occur with
+ # negative sample weights.
+ proba = y_predict_proba # alias for readability
+ np.clip(proba, np.finfo(proba.dtype).eps, None, out=proba)
+
+ # Boost weight using multi-class AdaBoost SAMME.R alg
+ # Row-wise inner product computed explicitly (numpy's private
+ # `inner1d` helper is no longer available).
+ estimator_weight = (-1. * self.learning_rate
+                     * (((n_classes - 1.) / n_classes) *
+                        (y_coding * np.log(y_predict_proba)).sum(axis=1)))
+
+ # Only boost the weights if it will fit again
+ if not iboost == self.n_estimators - 1:
+ # Only boost positive weights
+ sample_weight *= np.exp(estimator_weight *
+ ((sample_weight > 0) |
+ (estimator_weight < 0)))
+
+ return sample_weight, 1., estimator_error
def _boost_discrete(self, iboost, X, y, sample_weight, random_state):
"""Implement a single boost using the SAMME discrete algorithm."""
- pass
+ estimator = self._make_sampler_estimator(random_state=random_state)
+
+ estimator.fit(X, y, sample_weight=sample_weight)
+
+ y_predict = estimator.predict(X)
+
+ if iboost == 0:
+ self.classes_ = getattr(estimator, 'classes_', None)
+ self.n_classes_ = len(self.classes_)
+
+ # Instances incorrectly classified
+ incorrect = y_predict != y
+
+ # Error fraction
+ estimator_error = np.mean(
+ np.average(incorrect, weights=sample_weight, axis=0))
+
+ # Stop if classification is perfect
+ if estimator_error <= 0:
+ return sample_weight, 1., 0.
+
+ n_classes = self.n_classes_
+
+ # Stop if the error is at least as bad as random guessing
+ if estimator_error >= 1. - (1. / n_classes):
+ self.estimators_.pop(-1)
+ if len(self.estimators_) == 0:
+ raise ValueError('BaseClassifier in AdaBoostClassifier '
+ 'ensemble is worse than random, ensemble '
+ 'can not be fit.')
+ return None, None, None
+
+ # Boost weight using multi-class AdaBoost SAMME alg
+ estimator_weight = self.learning_rate * (
+ np.log((1. - estimator_error) / estimator_error) +
+ np.log(n_classes - 1.))
+
+ # Only boost the weights if I will fit again
+ if not iboost == self.n_estimators - 1:
+ # Only boost positive weights
+ sample_weight *= np.exp(estimator_weight * incorrect *
+ ((sample_weight > 0) |
+ (estimator_weight < 0)))
+
+ return sample_weight, estimator_weight, estimator_error
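
As a sanity check on the SAMME update in _boost_discrete, the estimator weight for a given error rate can be computed directly:

    import numpy as np

    learning_rate, n_classes, estimator_error = 1.0, 3, 0.25

    # alpha = lr * (log((1 - err) / err) + log(K - 1))
    estimator_weight = learning_rate * (
        np.log((1.0 - estimator_error) / estimator_error) + np.log(n_classes - 1.0)
    )
    print(round(float(estimator_weight), 4))  # 1.7918, i.e. log(3) + log(2)
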
diff --git a/imblearn/ensemble/tests/test_bagging.py b/imblearn/ensemble/tests/test_bagging.py
index 02d90ed..36ab0aa 100644
--- a/imblearn/ensemble/tests/test_bagging.py
+++ b/imblearn/ensemble/tests/test_bagging.py
@@ -31,4 +31,10 @@ class CountDecisionTreeClassifier(DecisionTreeClassifier):
def test_balanced_bagging_classifier_n_features():
"""Check that we raise a FutureWarning when accessing `n_features_`."""
- pass
+ X, y = make_classification(n_samples=100, n_features=4, n_informative=2, n_redundant=0, n_classes=2, random_state=0)
+
+ bbc = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
+ bbc.fit(X, y)
+
+ with pytest.warns(FutureWarning, match="Attribute n_features_ was deprecated"):
+ _ = bbc.n_features_
diff --git a/imblearn/ensemble/tests/test_easy_ensemble.py b/imblearn/ensemble/tests/test_easy_ensemble.py
index 3b667a8..4041f40 100644
--- a/imblearn/ensemble/tests/test_easy_ensemble.py
+++ b/imblearn/ensemble/tests/test_easy_ensemble.py
@@ -25,4 +25,16 @@ Y = np.array([1, 2, 2, 2, 1, 0, 1, 1, 1, 0])
def test_easy_ensemble_classifier_n_features():
"""Check that we raise a FutureWarning when accessing `n_features_`."""
- pass
+ X, y = make_imbalance(
+ iris.data, iris.target, sampling_strategy={0: 20, 1: 25, 2: 50},
+ random_state=RND_SEED
+ )
+ eec = EasyEnsembleClassifier(random_state=RND_SEED)
+ eec.fit(X, y)
+
+ with pytest.warns(FutureWarning, match="`n_features_` attribute is deprecated"):
+ _ = eec.n_features_
+
+ # Check that n_features_in_ is available and correct
+ assert hasattr(eec, "n_features_in_")
+ assert eec.n_features_in_ == X.shape[1]
diff --git a/imblearn/ensemble/tests/test_forest.py b/imblearn/ensemble/tests/test_forest.py
index 2742293..bb9d1a4 100644
--- a/imblearn/ensemble/tests/test_forest.py
+++ b/imblearn/ensemble/tests/test_forest.py
@@ -11,21 +11,45 @@ sklearn_version = parse_version(sklearn.__version__)
def test_balanced_bagging_classifier_n_features():
"""Check that we raise a FutureWarning when accessing `n_features_`."""
- pass
+ X, y = make_classification(n_samples=100, n_features=4, n_informative=2, n_redundant=0, n_classes=2, random_state=0)
+ clf = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
+ clf.fit(X, y)
+
+ with pytest.warns(FutureWarning, match="`n_features_` attribute is deprecated"):
+ _ = clf.n_features_
def test_balanced_random_forest_change_behaviour(imbalanced_dataset):
"""Check that we raise a change of behaviour for the parameters `sampling_strategy`
and `replacement`.
"""
- pass
+ X, y = imbalanced_dataset
+ clf = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
+
+ with pytest.warns(FutureWarning, match="The default value of `sampling_strategy` will change"):
+ clf.fit(X, y)
+
+ with pytest.warns(FutureWarning, match="The default value of `replacement` will change"):
+ BalancedRandomForestClassifier(n_estimators=5, random_state=0, sampling_strategy='auto').fit(X, y)
@pytest.mark.skipif(parse_version(sklearn_version.base_version) <
parse_version('1.4'), reason='scikit-learn should be >= 1.4')
def test_missing_values_is_resilient():
"""Check that forest can deal with missing values and has decent performance."""
- pass
+ X, y = load_iris(return_X_y=True)
+ X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+ # Introduce missing values
+ rng = np.random.RandomState(0)
+ mask = rng.binomial(1, 0.2, X_train.shape).astype(bool)
+ X_train[mask] = np.nan
+
+ clf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
+ clf.fit(X_train, y_train)
+
+ score = clf.score(X_test, y_test)
+ assert score > 0.8, f"Performance with missing values is too low: {score}"
@pytest.mark.skipif(parse_version(sklearn_version.base_version) <
@@ -33,4 +57,25 @@ def test_missing_values_is_resilient():
def test_missing_value_is_predictive():
"""Check that the forest learns when missing values are only present for
a predictive feature."""
- pass
+ X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
+ n_redundant=0, n_repeated=0, n_classes=2,
+ random_state=0)
+
+ rng = np.random.RandomState(0)
+
+ # Make the first feature highly predictive
+ X[:, 0] = y + rng.normal(0, 0.1, size=y.shape)
+
+ # Introduce missing values only in the first feature
+ mask = rng.binomial(1, 0.5, X.shape[0]).astype(bool)
+ X[mask, 0] = np.nan
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+ clf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
+ clf.fit(X_train, y_train)
+
+ score = clf.score(X_test, y_test)
+ assert score > 0.8, f"Performance with predictive missing values is too low: {score}"
+
+ # Check feature importances
+ importances = clf.feature_importances_
+ assert importances[0] > np.mean(importances[1:]), "First feature should be most important"
diff --git a/imblearn/ensemble/tests/test_weight_boosting.py b/imblearn/ensemble/tests/test_weight_boosting.py
index c1d0d1c..03e82bf 100644
--- a/imblearn/ensemble/tests/test_weight_boosting.py
+++ b/imblearn/ensemble/tests/test_weight_boosting.py
@@ -3,7 +3,74 @@ import pytest
import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
-from sklearn.utils._testing import assert_array_equal
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils._testing import assert_array_equal, assert_array_almost_equal
from sklearn.utils.fixes import parse_version
from imblearn.ensemble import RUSBoostClassifier
+
sklearn_version = parse_version(sklearn.__version__)
+
+class TestRUSBoostClassifier:
+ @pytest.mark.parametrize(
+ "algorithm", ["SAMME", "SAMME.R"]
+ )
+ def test_rusboost(self, algorithm):
+ X, y = make_classification(
+ n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0
+ )
+ X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
+
+ clf = RUSBoostClassifier(algorithm=algorithm, random_state=0)
+ clf.fit(X_train, y_train)
+
+ assert hasattr(clf, 'estimators_')
+ assert len(clf.estimators_) <= clf.n_estimators
+ assert clf.n_classes_ == 3
+
+ y_pred = clf.predict(X_test)
+ assert y_pred.shape == y_test.shape
+
+ def test_sample_weight(self):
+ X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, random_state=0)
+ sample_weight = np.ones_like(y)
+ clf = RUSBoostClassifier(random_state=0)
+ clf.fit(X, y, sample_weight=sample_weight)
+ y_pred_weighted = clf.predict(X)
+
+ clf.fit(X, y)
+ y_pred_unweighted = clf.predict(X)
+
+ assert_array_equal(y_pred_weighted, y_pred_unweighted)
+
+ def test_staged_predict(self):
+ X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, random_state=0)
+ clf = RUSBoostClassifier(n_estimators=10, random_state=0)
+ clf.fit(X, y)
+
+ predictions = np.array([p for p in clf.staged_predict(X)])
+ assert predictions.shape == (10, X.shape[0])
+
+ def test_feature_importances(self):
+ X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, random_state=0)
+ clf = RUSBoostClassifier(random_state=0)
+ clf.fit(X, y)
+
+ assert hasattr(clf, 'feature_importances_')
+ assert clf.feature_importances_.shape == (X.shape[1],)
+
+ @pytest.mark.parametrize(
+ "sampling_strategy", ["auto", "majority", "not minority", "not majority", 0.5]
+ )
+ def test_sampling_strategy(self, sampling_strategy):
+ X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5], random_state=0)
+ clf = RUSBoostClassifier(sampling_strategy=sampling_strategy, random_state=0)
+ clf.fit(X, y)
+ assert clf.base_sampler_.sampling_strategy == sampling_strategy
+
+ def test_custom_estimator(self):
+ X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, random_state=0)
+ base_estimator = DecisionTreeClassifier(max_depth=2)
+ clf = RUSBoostClassifier(estimator=base_estimator, random_state=0)
+ clf.fit(X, y)
+ assert isinstance(clf.estimators_[0], DecisionTreeClassifier)
+ assert clf.estimators_[0].max_depth == 2
diff --git a/imblearn/exceptions.py b/imblearn/exceptions.py
index a78c1ad..c1ee98e 100644
--- a/imblearn/exceptions.py
+++ b/imblearn/exceptions.py
@@ -23,4 +23,8 @@ def raise_isinstance_error(variable_name, possible_type, variable):
ValueError
If the instance is not of the possible type.
"""
- pass
+ if not isinstance(variable, possible_type):
+     raise ValueError(
+         f"{variable_name} has to be one of {possible_type}. "
+         f"Got {type(variable)} instead."
+     )
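
Usage sketch for the helper above (isinstance accepts a tuple of types, so the message interpolates the whole tuple):

    from sklearn.neighbors import NearestNeighbors
    from imblearn.exceptions import raise_isinstance_error

    try:
        raise_isinstance_error("n_neighbors", (int, NearestNeighbors), "3")
    except ValueError as exc:
        print(exc)  # n_neighbors has to be one of (...). Got <class 'str'> instead.
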
diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py
index f3de87c..e47c7d3 100644
--- a/imblearn/keras/_generator.py
+++ b/imblearn/keras/_generator.py
@@ -6,7 +6,15 @@ def import_keras():
This is possible to import the sequence from keras or tensorflow.
"""
- pass
+ try:
+ from keras.utils import Sequence
+ return Sequence, True
+ except ImportError:
+ try:
+ from tensorflow.keras.utils import Sequence
+ return Sequence, True
+ except ImportError:
+ return object, False
ParentClass, HAS_KERAS = import_keras()
@@ -204,4 +212,39 @@ def balanced_batch_generator(X, y, *, sample_weight=None, sampler=None,
... steps_per_epoch=steps_per_epoch,
... epochs=10, verbose=0)
"""
- pass
+ if sampler is None:
+ sampler = RandomUnderSampler(random_state=random_state)
+
+ if not hasattr(sampler, 'fit_resample'):
+ raise ValueError("'sampler' should have a 'fit_resample' method.")
+
+ sampler_ = clone(sampler)
+ X_resampled, y_resampled = sampler_.fit_resample(X, y)
+
+ if sample_weight is not None:
+ sample_weight_resampled = _safe_indexing(sample_weight, sampler_.sample_indices_)
+ else:
+ sample_weight_resampled = None
+
+ n_samples = X_resampled.shape[0]
+ steps_per_epoch = int(n_samples // batch_size)
+
+ def generator():
+     rng = check_random_state(random_state)
+     while True:
+         # Reuse one RandomState so each epoch gets a fresh permutation
+         # (re-seeding inside the loop would repeat the same ordering).
+         indices = rng.permutation(n_samples)
+ for start in range(0, n_samples, batch_size):
+ end = min(start + batch_size, n_samples)
+ batch_indices = indices[start:end]
+ X_batch = _safe_indexing(X_resampled, batch_indices)
+ y_batch = _safe_indexing(y_resampled, batch_indices)
+
+ if not keep_sparse and issparse(X_batch):
+ X_batch = X_batch.toarray()
+
+ if sample_weight is None:
+ yield X_batch, y_batch
+ else:
+ sw_batch = _safe_indexing(sample_weight_resampled, batch_indices)
+ yield X_batch, y_batch, sw_batch
+
+ return generator(), steps_per_epoch
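
Usage sketch for the generator above (with the default RandomUnderSampler, a 90/10 class split shrinks to 10 + 10 samples, hence 2 steps per epoch at batch_size=10; assumes keras or tensorflow is installed):

    import numpy as np
    from imblearn.keras import balanced_batch_generator

    X = np.random.RandomState(0).uniform(size=(100, 4))
    y = np.array([0] * 90 + [1] * 10)

    generator, steps = balanced_batch_generator(X, y, batch_size=10, random_state=42)
    X_batch, y_batch = next(generator)
    print(X_batch.shape, steps)  # (10, 4) 2
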
diff --git a/imblearn/metrics/_classification.py b/imblearn/metrics/_classification.py
index 6723b08..46f54c2 100644
--- a/imblearn/metrics/_classification.py
+++ b/imblearn/metrics/_classification.py
@@ -133,7 +133,55 @@ def sensitivity_specificity_support(y_true, y_pred, *, labels=None,
>>> sensitivity_specificity_support(y_true, y_pred, average='weighted')
(0.33..., 0.66..., None)
"""
- pass
+ y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+
+ if labels is None:
+ labels = unique_labels(y_true, y_pred)
+ else:
+ labels = np.asarray(labels)
+
+ if y_type.startswith('multilabel'):
+ raise ValueError("sensitivity_specificity_support is not defined for multilabel classification")
+
+ if pos_label is None:
+ if average == 'binary':
+ raise ValueError("Please specify the positive label explicitly "
+ "in binary classification problems")
+ pos_label = 1
+
+ cm = confusion_matrix(y_true, y_pred, sample_weight=sample_weight, labels=labels)
+
+ with np.errstate(divide='ignore', invalid='ignore'):
+     tp = np.diag(cm)
+     fp = cm.sum(axis=0) - tp
+     fn = cm.sum(axis=1) - tp
+     tn = cm.sum() - tp - fp - fn
+     per_class_sensitivity = tp / (tp + fn)
+     # Specificity is the true-negative rate, not precision.
+     per_class_specificity = tn / (tn + fp)
+
+ if average == 'micro':
+     sensitivity = tp.sum() / (tp.sum() + fn.sum())
+     specificity = tn.sum() / (tn.sum() + fp.sum())
+     support = cm.sum()
+ elif average == 'macro':
+ sensitivity = np.mean(per_class_sensitivity)
+ specificity = np.mean(per_class_specificity)
+ support = None
+ elif average == 'weighted':
+ weights = cm.sum(axis=1)
+ sensitivity = np.average(per_class_sensitivity, weights=weights)
+ specificity = np.average(per_class_specificity, weights=weights)
+ support = None
+ elif average == 'binary':
+ if len(labels) != 2:
+ raise ValueError("Binary classification is only valid for two classes")
+ pos_label_idx = np.where(labels == pos_label)[0][0]
+ sensitivity = per_class_sensitivity[pos_label_idx]
+ specificity = per_class_specificity[pos_label_idx]
+ support = cm.sum(axis=1)[pos_label_idx]
+ elif average is None:
+ sensitivity = per_class_sensitivity
+ specificity = per_class_specificity
+ support = cm.sum(axis=1)
+ else:
+ raise ValueError("Unsupported 'average' parameter")
+
+ return sensitivity, specificity, support
@validate_params({'y_true': ['array-like'], 'y_pred': ['array-like'],
@@ -204,8 +252,8 @@ def sensitivity_score(y_true, y_pred, *, labels=None, pos_label=1, average=
Returns
-------
- specificity : float (if `average is None`) or ndarray of shape (n_unique_labels,)
- The specifcity metric.
+ sensitivity : float (if `average is None`) or ndarray of shape (n_unique_labels,)
+ The sensitivity metric.
Examples
--------
@@ -222,7 +270,15 @@ def sensitivity_score(y_true, y_pred, *, labels=None, pos_label=1, average=
>>> sensitivity_score(y_true, y_pred, average=None)
array([1., 0., 0.])
"""
- pass
+ sensitivity, _, _ = sensitivity_specificity_support(
+ y_true, y_pred,
+ labels=labels,
+ pos_label=pos_label,
+ average=average,
+ warn_for=('sensitivity',),
+ sample_weight=sample_weight
+ )
+ return sensitivity
@validate_params({'y_true': ['array-like'], 'y_pred': ['array-like'],
@@ -311,7 +367,15 @@ def specificity_score(y_true, y_pred, *, labels=None, pos_label=1, average=
>>> specificity_score(y_true, y_pred, average=None)
array([0.75, 0.5 , 0.75])
"""
- pass
+ _, specificity, _ = sensitivity_specificity_support(
+ y_true, y_pred,
+ labels=labels,
+ pos_label=pos_label,
+ average=average,
+ warn_for=('specificity',),
+ sample_weight=sample_weight
+ )
+ return specificity
@validate_params({'y_true': ['array-like'], 'y_pred': ['array-like'],
@@ -435,7 +499,31 @@ def geometric_mean_score(y_true, y_pred, *, labels=None, pos_label=1,
>>> geometric_mean_score(y_true, y_pred, average=None)
array([0.866..., 0. , 0. ])
"""
- pass
+ sensitivity, _, support = sensitivity_specificity_support(
+ y_true,
+ y_pred,
+ labels=labels,
+ pos_label=pos_label,
+ average=None if average == 'multiclass' else average,
+ warn_for=('sensitivity',),
+ sample_weight=sample_weight,
+ )
+
+ if average == 'multiclass':
+     if correction != 0:
+         sensitivity = np.where(sensitivity == 0, correction, sensitivity)
+     return sp.stats.gmean(sensitivity)
+ elif average == 'micro':
+ return sensitivity
+ elif average == 'macro':
+ return sp.stats.gmean(sensitivity)
+ elif average == 'weighted':
+ return np.average(sensitivity, weights=support)
+ elif average is None or average == 'binary':
+ return sensitivity
+
+ raise ValueError("Unsupported 'average' parameter")
@validate_params({'alpha': [numbers.Real], 'squared': ['boolean']},
@@ -489,7 +577,41 @@ def make_index_balanced_accuracy(*, alpha=0.1, squared=True):
>>> print(gmean(y_true, y_pred, average=None))
[0.44... 0.44...]
"""
- pass
+ def iba_scoring_func(scoring_func):
+ @functools.wraps(scoring_func)
+ def wrapped_scoring_func(y_true, y_pred, **kwargs):
+ score = scoring_func(y_true, y_pred, **kwargs)
+ check_consistent_length(y_true, y_pred)
+ y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+
+ if y_type not in ("binary", "multiclass"):
+ raise ValueError(f"{y_type} is not supported")
+
+ lb = LabelEncoder()
+ y_true = lb.fit_transform(y_true)
+ y_pred = lb.transform(y_pred)
+ classes = unique_labels(y_true, y_pred)
+
+ dominance = np.zeros_like(classes, dtype=float)
+ for class_i in classes:
+ mask_class = y_true == class_i
+ n_class = np.count_nonzero(mask_class)
+ n_errors = np.count_nonzero(y_pred[mask_class] != class_i)
+ dominance[class_i] = (n_class - n_errors) / n_class - (1 - n_errors / (len(y_true) - n_class))
+
+ if squared:
+ dominance = dominance ** 2
+
+ final_score = score * (1 + alpha * dominance)
+
+ if isinstance(score, np.ndarray):
+ return final_score
+ else:
+ return np.mean(final_score)
+
+ return wrapped_scoring_func
+
+ return iba_scoring_func
@validate_params({'y_true': ['array-like'], 'y_pred': ['array-like'],
@@ -581,7 +703,88 @@ def classification_report_imbalanced(y_true, y_pred, *, labels=None,
avg / total 0.70 0.60 0.90 0.61 0.66 0.54 5
<BLANKLINE>
"""
- pass
+ y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+
+ if labels is None:
+ labels = unique_labels(y_true, y_pred)
+ else:
+ labels = np.asarray(labels)
+
+ target_names = [str(label) for label in labels] if target_names is None else target_names
+
+ # Compute the different metrics
+ precision, recall, f1, support = precision_recall_fscore_support(
+ y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight,
+ zero_division=zero_division
+ )
+ specificity = specificity_score(
+ y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
+ )
+ geo_mean = geometric_mean_score(
+ y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
+ )
+ iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
+ geometric_mean_score
+ )
+ iba = iba_gmean(y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight)
+
+ # Build the output dictionary
+ results = {}
+ for i, label in enumerate(labels):
+ results[label] = {
+ "precision": precision[i],
+ "recall": recall[i],
+ "specificity": specificity[i],
+ "f1-score": f1[i],
+ "geo_mean": geo_mean[i],
+ "iba": iba[i],
+ "support": support[i]
+ }
+
+ # Compute the averages
+ average_precision = np.average(precision, weights=support)
+ average_recall = np.average(recall, weights=support)
+ average_specificity = np.average(specificity, weights=support)
+ average_f1 = np.average(f1, weights=support)
+ average_geo_mean = np.average(geo_mean, weights=support)
+ average_iba = np.average(iba, weights=support)
+ total_support = np.sum(support)
+
+ # Store the averages in the dictionary
+ results["macro avg"] = {
+ "precision": average_precision,
+ "recall": average_recall,
+ "specificity": average_specificity,
+ "f1-score": average_f1,
+ "geo_mean": average_geo_mean,
+ "iba": average_iba,
+ "support": total_support
+ }
+
+ if output_dict:
+ return results
+
+ # Build the output string
+ headers = ["precision", "recall", "specificity", "f1-score", "geo_mean", "iba", "support"]
+ name_width = max(len(cn) for cn in target_names)
+ width = max(name_width, len("avg / total"), digits)
+
+ head_fmt = "{:>{width}s} " + " {:>9}" * len(headers)
+ report = head_fmt.format("", *headers, width=width)
+ report += "\n\n"
+
+ row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * (len(headers) - 1) + " {:>9}\n"
+ rows = list(zip(target_names, labels)) + [("avg / total", "avg / total")]
+ for name, label in rows:
+     if label == "avg / total":
+         report += "\n"
+     # `results` is keyed by class label; the printed row shows the name.
+     scores = results[label]
+     report += row_fmt.format(name, scores["precision"], scores["recall"],
+ scores["specificity"], scores["f1-score"],
+ scores["geo_mean"], scores["iba"],
+ int(scores["support"]),
+ width=width, digits=digits)
+
+ return report
@validate_params({'y_true': ['array-like'], 'y_pred': ['array-like'],
@@ -630,4 +833,27 @@ def macro_averaged_mean_absolute_error(y_true, y_pred, *, sample_weight=None):
>>> macro_averaged_mean_absolute_error(y_true_imbalanced, y_pred)
0.16...
"""
- pass
+ y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+ check_consistent_length(y_true, y_pred, sample_weight)
+
+ if y_type not in ("binary", "multiclass"):
+ raise ValueError(f"{y_type} is not supported")
+
+ y_true = column_or_1d(y_true)
+ y_pred = column_or_1d(y_pred)
+ if sample_weight is not None:
+     # Ensure boolean-mask indexing below works on lists as well
+     sample_weight = column_or_1d(sample_weight)
+
+ classes = unique_labels(y_true)
+ n_classes = len(classes)
+
+ mae_per_class = []
+ for cls in classes:
+ cls_mask = y_true == cls
+ if np.any(cls_mask):
+ mae = mean_absolute_error(
+ y_true[cls_mask],
+ y_pred[cls_mask],
+ sample_weight=sample_weight[cls_mask] if sample_weight is not None else None
+ )
+ mae_per_class.append(mae)
+
+ return np.mean(mae_per_class)
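
A worked example of the per-class quantities used by sensitivity_specificity_support above:

    import numpy as np

    # Rows are true classes, columns are predictions
    cm = np.array([[5, 1, 0],
                   [2, 3, 1],
                   [0, 2, 6]])

    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    tn = cm.sum() - tp - fp - fn

    print(tp / (tp + fn))  # sensitivity (recall) per class
    print(tn / (tn + fp))  # specificity (true-negative rate) per class
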
diff --git a/imblearn/metrics/pairwise.py b/imblearn/metrics/pairwise.py
index d73edae..732e9ea 100644
--- a/imblearn/metrics/pairwise.py
+++ b/imblearn/metrics/pairwise.py
@@ -137,7 +137,29 @@ class ValueDifferenceMetric(_ParamsValidationMixin, BaseEstimator):
self : object
Return the instance itself.
"""
- pass
+ X, y = self._validate_data(X, y, dtype=np.int32)
+ check_consistent_length(X, y)
+
+ self.n_features_in_ = X.shape[1]
+ self.classes_ = unique_labels(y)
+
+ if isinstance(self.n_categories, str) and self.n_categories == "auto":
+ self.n_categories_ = np.max(X, axis=0) + 1
+ else:
+ self.n_categories_ = np.asarray(self.n_categories)
+ if self.n_categories_.shape[0] != self.n_features_in_:
+ raise ValueError("n_categories must have length equal to n_features")
+
+ self.proba_per_class_ = []
+ for feature_idx in range(self.n_features_in_):
+ feature_proba = np.zeros((self.n_categories_[feature_idx], len(self.classes_)))
+ for class_idx, class_label in enumerate(self.classes_):
+ class_mask = y == class_label
+ feature_values, counts = np.unique(X[class_mask, feature_idx], return_counts=True)
+ feature_proba[feature_values, class_idx] = counts / np.sum(class_mask)
+ self.proba_per_class_.append(feature_proba)
+
+ return self
def pairwise(self, X, Y=None):
"""Compute the VDM distance pairwise.
@@ -157,4 +179,27 @@ class ValueDifferenceMetric(_ParamsValidationMixin, BaseEstimator):
distance_matrix : ndarray of shape (n_samples, n_samples)
The VDM pairwise distance.
"""
- pass
+ check_is_fitted(self)
+ X = self._validate_data(X, reset=False, dtype=np.int32)
+
+ if Y is None:
+ Y = X
+ else:
+ Y = self._validate_data(Y, reset=False, dtype=np.int32)
+
+ if X.shape[1] != self.n_features_in_ or Y.shape[1] != self.n_features_in_:
+ raise ValueError("X and Y must have the same number of features as the fitted data")
+
+ n_samples_X, n_samples_Y = X.shape[0], Y.shape[0]
+ distance_matrix = np.zeros((n_samples_X, n_samples_Y))
+
+ for i in range(n_samples_X):
+ for j in range(n_samples_Y):
+ feature_distances = np.zeros(self.n_features_in_)
+ for f in range(self.n_features_in_):
+ x_val, y_val = X[i, f], Y[j, f]
+ proba_diff = np.abs(self.proba_per_class_[f][x_val] - self.proba_per_class_[f][y_val])
+ feature_distances[f] = np.sum(proba_diff ** self.k)
+ distance_matrix[i, j] = np.sum(feature_distances ** self.r) ** (1 / self.r)
+
+ return distance_matrix
diff --git a/imblearn/metrics/tests/test_classification.py b/imblearn/metrics/tests/test_classification.py
index 7fce78f..9866dbc 100644
--- a/imblearn/metrics/tests/test_classification.py
+++ b/imblearn/metrics/tests/test_classification.py
@@ -17,4 +17,35 @@ def make_prediction(dataset=None, binary=False):
If binary is True restrict to a binary classification problem instead of a
multiclass classification problem
"""
- pass
+ if dataset is None:
+ # Load iris dataset
+ dataset = datasets.load_iris()
+
+ X = dataset.data
+ y = dataset.target
+
+ if binary:
+ # Keep only two classes for binary classification
+ mask = y < 2
+ X = X[mask]
+ y = y[mask]
+
+ # Split the data into training and testing sets
+ random_state = check_random_state(RND_SEED)
+ n_samples = X.shape[0]
+ permutation = random_state.permutation(n_samples)
+ n_train = int(0.8 * n_samples)
+
+ X_train = X[permutation[:n_train]]
+ y_train = y[permutation[:n_train]]
+ X_test = X[permutation[n_train:]]
+ y_test = y[permutation[n_train:]]
+
+ # Train a Support Vector Classifier
+ clf = svm.SVC(random_state=RND_SEED)
+ clf.fit(X_train, y_train)
+
+ # Make predictions
+ y_pred = clf.predict(X_test)
+
+ return y_test, y_pred, X_test
diff --git a/imblearn/metrics/tests/test_pairwise.py b/imblearn/metrics/tests/test_pairwise.py
index ccbfede..d68203c 100644
--- a/imblearn/metrics/tests/test_pairwise.py
+++ b/imblearn/metrics/tests/test_pairwise.py
@@ -5,3 +5,71 @@ from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.utils._testing import _convert_container
from imblearn.metrics.pairwise import ValueDifferenceMetric
+
+def test_value_difference_metric_fit():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ y = np.array([0, 0, 1, 1])
+ vdm = ValueDifferenceMetric()
+ vdm.fit(X, y)
+
+ assert vdm.n_features_in_ == 2
+ assert np.array_equal(vdm.classes_, [0, 1])
+ assert np.array_equal(vdm.n_categories_, [3, 3])
+ assert len(vdm.proba_per_class_) == 2
+ assert vdm.proba_per_class_[0].shape == (3, 2)
+ assert vdm.proba_per_class_[1].shape == (3, 2)
+
+def test_value_difference_metric_pairwise():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ y = np.array([0, 0, 1, 1])
+ vdm = ValueDifferenceMetric()
+ vdm.fit(X, y)
+
+ distances = vdm.pairwise(X)
+ assert distances.shape == (4, 4)
+ assert np.allclose(distances, distances.T) # Check symmetry
+ assert np.allclose(np.diag(distances), 0) # Check diagonal is zero
+
+def test_value_difference_metric_not_fitted():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ vdm = ValueDifferenceMetric()
+ with pytest.raises(NotFittedError):
+ vdm.pairwise(X)
+
+def test_value_difference_metric_different_n_features():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ y = np.array([0, 0, 1, 1])
+ vdm = ValueDifferenceMetric()
+ vdm.fit(X, y)
+
+ X_invalid = np.array([[0, 1, 2], [1, 1, 2]])
+ with pytest.raises(ValueError):
+ vdm.pairwise(X_invalid)
+
+def test_value_difference_metric_custom_parameters():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ y = np.array([0, 0, 1, 1])
+ vdm = ValueDifferenceMetric(n_categories=[3, 3], k=2, r=1)
+ vdm.fit(X, y)
+
+ assert np.array_equal(vdm.n_categories_, [3, 3])
+ assert vdm.k == 2
+ assert vdm.r == 1
+
+ distances = vdm.pairwise(X)
+ assert distances.shape == (4, 4)
+
+def test_value_difference_metric_auto_n_categories():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ y = np.array([0, 0, 1, 1])
+ vdm = ValueDifferenceMetric(n_categories="auto")
+ vdm.fit(X, y)
+
+ assert np.array_equal(vdm.n_categories_, [3, 3])
+
+def test_value_difference_metric_invalid_n_categories():
+ X = np.array([[0, 1], [1, 1], [2, 2], [2, 2]])
+ y = np.array([0, 0, 1, 1])
+ vdm = ValueDifferenceMetric(n_categories=[3])
+ with pytest.raises(ValueError):
+ vdm.fit(X, y)
diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py
index d8159df..d85c8d9 100644
--- a/imblearn/over_sampling/_adasyn.py
+++ b/imblearn/over_sampling/_adasyn.py
@@ -126,4 +126,14 @@ class ADASYN(BaseOverSampler):
def _validate_estimator(self):
"""Create the necessary objects for ADASYN"""
- pass
+ self.nn_ = check_neighbors_object(
+ 'n_neighbors', self.n_neighbors, additional_neighbor=1
+ )
+ if self.n_jobs is not None:
+ warnings.warn(
+ "The parameter `n_jobs` has been deprecated in 0.10 and will be "
+ "removed in 0.12. You can pass an estimator where `n_jobs` is "
+ "already set instead.",
+ FutureWarning,
+ )
+ self.nn_.set_params(n_jobs=self.n_jobs)
diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
index 968941c..d3c56cc 100644
--- a/imblearn/over_sampling/_smote/base.py
+++ b/imblearn/over_sampling/_smote/base.py
@@ -44,7 +44,8 @@ class BaseSMOTE(BaseOverSampler):
"""Check the NN estimators shared across the different SMOTE
algorithms.
"""
- pass
+ self.nn_k_ = check_neighbors_object('k_neighbors', self.k_neighbors, additional_neighbor=1)
+ self.nn_k_.set_params(**{'n_jobs': self.n_jobs})
def _make_samples(self, X, y_dtype, y_type, nn_data, nn_num, n_samples,
step_size=1.0, y=None):
@@ -88,7 +89,20 @@ class BaseSMOTE(BaseOverSampler):
y_new : ndarray of shape (n_samples_new,)
Target values for synthetic samples.
"""
- pass
+        random_state = check_random_state(self.random_state)
+        samples_indices = random_state.randint(low=0, high=nn_num.size, size=n_samples)
+
+        # Split the flat indices into (sample row, neighbour column) pairs
+        # and draw one interpolation step per synthetic sample
+        rows = np.floor_divide(samples_indices, nn_num.shape[1])
+        cols = np.mod(samples_indices, nn_num.shape[1])
+        steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis]
+
+        X_new = self._generate_samples(X, nn_data, nn_num, rows, cols, steps, y_type, y)
+        y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)
+
+        return X_new, y_new
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps,
y_type=None, y=None):
@@ -139,7 +153,22 @@ class BaseSMOTE(BaseOverSampler):
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Synthetically generated samples.
"""
- pass
+        random_state = check_random_state(self.random_state)
+
+        if sparse.issparse(X):
+            # densify only the rows involved in the interpolation
+            X_base = X[rows].toarray()
+            neighbors = nn_data[nn_num[rows, cols]].toarray()
+        else:
+            X_base = X[rows]
+            neighbors = nn_data[nn_num[rows, cols]]
+
+        diffs = neighbors - X_base
+        if y is not None:
+            # when interpolating towards another class (borderline-2), only
+            # take a step of at most 50% towards the neighbour
+            mask_pair = y[nn_num[rows, cols]] != y_type
+            diffs[mask_pair] *= random_state.uniform(low=0.0, high=0.5, size=(mask_pair.sum(), 1))
+
+        X_new = (X_base + steps * diffs).astype(X.dtype)
+        return sparse.csr_matrix(X_new) if sparse.issparse(X) else X_new
def _in_danger_noise(self, nn_estimator, samples, target_class, y, kind
='danger'):
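At its core, `_generate_samples` performs plain linear interpolation between a sample and one of its nearest neighbours; a standalone sketch of that single step with NumPy:

import numpy as np

rng = np.random.RandomState(0)
x = np.array([1.0, 2.0])            # seed sample
neighbor = np.array([3.0, 2.0])     # one of its k nearest neighbours
step = rng.uniform()                # interpolation factor in [0, 1)

x_new = x + step * (neighbor - x)   # synthetic sample on the segment
print(x_new)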
@@ -174,7 +203,18 @@ class BaseSMOTE(BaseOverSampler):
output : ndarray of shape (n_samples,)
A boolean array where True refer to samples in danger or noise.
"""
- pass
+        x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:]
+        nn_label = (y[x] != target_class).astype(int).sum(axis=1)
+
+        if kind == 'danger':
+            # Samples are in danger for m/2 <= m' < m
+            return np.logical_and(
+                nn_label >= (nn_estimator.n_neighbors - 1) / 2,
+                nn_label < nn_estimator.n_neighbors - 1,
+            )
+        elif kind == 'noise':
+            # Samples are noise for m' = m
+            return nn_label == nn_estimator.n_neighbors - 1
+        else:
+            raise ValueError("'kind' should be 'danger' or 'noise'.")
@Substitution(sampling_strategy=BaseOverSampler.
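With m neighbours inspected and m' of them belonging to another class, the rule in `_in_danger_noise` labels a sample as noise when m' = m, in danger when m/2 <= m' < m, and safe otherwise; a quick numeric check:

m = 5  # neighbours inspected (the sample itself excluded)
for m_prime in range(m + 1):
    if m_prime == m:
        kind = "noise"
    elif m_prime >= m / 2:
        kind = "danger"
    else:
        kind = "safe"
    print(m_prime, kind)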
@@ -462,11 +502,58 @@ class SMOTENC(SMOTE):
"""Overwrite the checking to let pass some string for categorical
features.
"""
- pass
+        # The point of this override is to let string and categorical
+        # columns through: validate the shape but not the dtype, and defer
+        # the actual encoding to the categorical encoder.
+        X, y = self._validate_data(
+            X, y, reset=True, dtype=None, accept_sparse=["csr", "csc"]
+        )
+        return X, y
def _validate_column_types(self, X):
"""Compute the indices of the categorical and continuous features."""
- pass
+        if isinstance(self.categorical_features, str):
+            if self.categorical_features != "auto":
+                raise ValueError(
+                    "If `categorical_features` is a string, it must be 'auto'. "
+                    f"Got {self.categorical_features!r} instead."
+                )
+            if not _is_pandas_df(X):
+                raise ValueError(
+                    "The 'auto' option for categorical_features is only "
+                    "available when X is a pandas DataFrame."
+                )
+            self.categorical_features_ = [
+                i for i, dtype in enumerate(X.dtypes) if dtype.name == "category"
+            ]
+        elif (_is_pandas_df(X) and len(self.categorical_features)
+                and isinstance(self.categorical_features[0], str)):
+            self.categorical_features_ = [
+                i for i, col in enumerate(X.columns)
+                if col in self.categorical_features
+            ]
+        else:
+            self.categorical_features_ = list(self.categorical_features)
+
+        if any(not 0 <= idx < X.shape[1] for idx in self.categorical_features_):
+            raise ValueError(
+                "categorical_features is out of range: indices must lie "
+                f"in [0, {X.shape[1]})."
+            )
+
+        self.continuous_features_ = [
+            i for i in range(X.shape[1]) if i not in self.categorical_features_
+        ]
+
+        if len(self.categorical_features_) == X.shape[1]:
+            raise ValueError(
+                "SMOTE-NC cannot be applied when all features are categorical."
+            )
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps,
y_type, y=None):
@@ -477,12 +564,30 @@ class SMOTENC(SMOTE):
categorical features are mapped to the most frequent nearest neighbors
of the majority class.
"""
- pass
+        X_new = np.zeros((steps.size, X.shape[1]), dtype=X.dtype)
+
+        for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+            # continuous features: regular SMOTE interpolation
+            X_new[i, self.continuous_features_] = (
+                X[row, self.continuous_features_]
+                + step * (nn_data[nn_num[row, col], self.continuous_features_]
+                          - X[row, self.continuous_features_])
+            )
+
+            # categorical features: most frequent category among the
+            # nearest neighbours of the seed sample
+            for feature in self.categorical_features_:
+                values, counts = np.unique(nn_data[nn_num[row], feature], return_counts=True)
+                X_new[i, feature] = values[np.argmax(counts)]
+
+        return X_new
@property
def ohe_(self):
"""One-hot encoder used to encode the categorical features."""
- pass
+        warnings.warn(
+            "The attribute `ohe_` is deprecated in 0.11 and will be removed "
+            "in 0.13. Use `categorical_encoder_` instead.",
+            FutureWarning,
+        )
+        return self.categorical_encoder_
@Substitution(sampling_strategy=BaseOverSampler.
@@ -607,8 +712,28 @@ class SMOTEN(SMOTE):
def _check_X_y(self, X, y):
"""Check should accept strings and not sparse matrices."""
- pass
+        if sparse.issparse(X):
+            from sklearn.exceptions import DataConversionWarning
+
+            # sparse input is accepted but densified later on; warn about
+            # the cost instead of rejecting it (see issue #971)
+            warnings.warn(
+                "Passing a sparse matrix to SMOTEN is not really efficient "
+                "since it is converted to a dense array internally.",
+                DataConversionWarning,
+            )
+
+        X, y = self._validate_data(
+            X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None,
+            force_all_finite=False,
+        )
+        return X, y
def _validate_estimator(self):
"""Force to use precomputed distance matrix."""
- pass
+ self.nn_k_ = check_neighbors_object(
+ "k_neighbors", self.k_neighbors, additional_neighbor=1
+ )
+ self.nn_k_.set_params(**{"metric": "precomputed"})
+
+ if self.categorical_encoder is None:
+ self.categorical_encoder_ = OrdinalEncoder(
+ handle_unknown="use_encoded_value", unknown_value=-1
+ )
+ else:
+ self.categorical_encoder_ = clone(self.categorical_encoder)
diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py
index 31fb344..26ccff4 100644
--- a/imblearn/over_sampling/_smote/cluster.py
+++ b/imblearn/over_sampling/_smote/cluster.py
@@ -152,4 +152,22 @@ class KMeansSMOTE(BaseSMOTE):
def _find_cluster_sparsity(self, X):
"""Compute the cluster sparsity."""
- pass
+        n_samples, n_features = X.shape
+
+        if self.density_exponent == "auto":
+            exponent = 1 - (1 / n_features) if n_features > 1 else 0.5
+        else:
+            exponent = self.density_exponent
+
+        # `pairwise_distances` handles dense and sparse input alike
+        distances_sum = pairwise_distances(
+            X, metric='euclidean', n_jobs=self.n_jobs).sum(axis=1)
+
+        volumes = np.power(distances_sum / n_samples, exponent)
+
+        return np.mean(volumes)
diff --git a/imblearn/over_sampling/_smote/tests/test_borderline_smote.py b/imblearn/over_sampling/_smote/tests/test_borderline_smote.py
index b11e0ea..9b708ec 100644
--- a/imblearn/over_sampling/_smote/tests/test_borderline_smote.py
+++ b/imblearn/over_sampling/_smote/tests/test_borderline_smote.py
@@ -11,7 +11,25 @@ def test_borderline_smote_no_in_danger_samples(kind):
"""Check that the algorithm behave properly even on a dataset without any sample
in danger.
"""
- pass
+    X, y = make_classification(
+        n_samples=200,
+        n_features=2,
+        n_informative=2,
+        n_redundant=0,
+        n_clusters_per_class=1,
+        class_sep=3.0,  # well-separated classes: no sample is in danger
+        weights=[0.9, 0.1],
+        random_state=42,
+    )
+
+    smote = BorderlineSMOTE(kind=kind, random_state=42)
+    X_res, y_res = smote.fit_resample(X, y)
+
+    # without any sample in danger there is nothing to interpolate from,
+    # so the data are expected to come back unchanged
+    assert_array_equal(X, X_res)
+    assert_array_equal(y, y_res)
def test_borderline_smote_kind():
@@ -21,4 +39,30 @@ def test_borderline_smote_kind():
"borderline-1". We generate an example where a logistic regression will perform
worse on "borderline-2" than on "borderline-1".
"""
- pass
+ X, y = make_classification(
+ n_samples=100,
+ n_classes=2,
+ weights=[0.9, 0.1],
+ random_state=42
+ )
+
+ smote_1 = BorderlineSMOTE(kind='borderline-1', random_state=42)
+ X_res_1, y_res_1 = smote_1.fit_resample(X, y)
+
+ smote_2 = BorderlineSMOTE(kind='borderline-2', random_state=42)
+ X_res_2, y_res_2 = smote_2.fit_resample(X, y)
+
+ # Train logistic regression models
+ lr_1 = LogisticRegression(random_state=42)
+ lr_1.fit(X_res_1, y_res_1)
+ score_1 = lr_1.score(X, y)
+
+ lr_2 = LogisticRegression(random_state=42)
+ lr_2.fit(X_res_2, y_res_2)
+ score_2 = lr_2.score(X, y)
+
+ # Check that borderline-2 performs worse than borderline-1
+ assert score_2 < score_1
+
+    # Check that the synthetic samples differ between the two kinds
+    assert not np.allclose(X_res_1, X_res_2)
diff --git a/imblearn/over_sampling/_smote/tests/test_smote_nc.py b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
index 06080ca..e5afa3c 100644
--- a/imblearn/over_sampling/_smote/tests/test_smote_nc.py
+++ b/imblearn/over_sampling/_smote/tests/test_smote_nc.py
@@ -15,19 +15,58 @@ sklearn_version = parse_version(sklearn.__version__)
def test_smotenc_categorical_encoder():
"""Check that we can pass our own categorical encoder."""
- pass
+ X, y = make_classification(n_samples=100, n_classes=2, weights=[0.9, 0.1],
+ n_informative=3, n_redundant=1, n_repeated=0,
+ n_features=5, n_clusters_per_class=1,
+ random_state=0)
+ X[:, [0, 2]] = X[:, [0, 2]].astype(int)
+ categorical_features = [0, 2]
+
+ encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
+ smote_nc = SMOTENC(categorical_features=categorical_features,
+ categorical_encoder=encoder, random_state=0)
+ X_res, y_res = smote_nc.fit_resample(X, y)
+
+ assert isinstance(smote_nc.categorical_encoder_, OneHotEncoder)
+    # the sampler fits a clone; the parameter itself is left untouched
+    assert smote_nc.categorical_encoder is encoder
+ assert X_res.shape[0] > X.shape[0]
+ assert Counter(y_res)[0] == Counter(y_res)[1]
def test_smotenc_deprecation_ohe_():
"""Check that we raise a deprecation warning when using `ohe_`."""
- pass
+ X, y = make_classification(n_samples=100, n_classes=2, weights=[0.9, 0.1],
+ n_informative=3, n_redundant=1, n_repeated=0,
+ n_features=5, n_clusters_per_class=1,
+ random_state=0)
+ X[:, [0, 2]] = X[:, [0, 2]].astype(int)
+ categorical_features = [0, 2]
+
+ smote_nc = SMOTENC(categorical_features=categorical_features, random_state=0)
+ smote_nc.fit_resample(X, y)
+
+ with pytest.warns(FutureWarning, match="The attribute `ohe_` is deprecated"):
+ _ = smote_nc.ohe_
def test_smotenc_param_validation():
"""Check that we validate the parameters correctly since this estimator requires
a specific parameter.
"""
- pass
+    X, y = make_classification(n_samples=100, n_features=5, n_classes=2,
+                               weights=[0.9, 0.1], random_state=0)
+
+    smote_nc = SMOTENC(categorical_features=[0, 2])
+    check_param_validation("SMOTENC", smote_nc)
+
+    # constraints specific to `categorical_features` are checked at fit time
+    with pytest.raises(ValueError, match="categorical_features"):
+        SMOTENC(categorical_features="invalid").fit_resample(X, y)
+
+    with pytest.raises(ValueError, match="categorical_features is out of range"):
+        SMOTENC(categorical_features=[5]).fit_resample(X, y)
def test_smotenc_bool_categorical():
@@ -37,23 +76,79 @@ def test_smotenc_bool_categorical():
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/974
"""
- pass
+    pd = pytest.importorskip("pandas")
+    rng = np.random.RandomState(0)
+
+    X = pd.DataFrame({
+        "num": rng.randn(100),
+        "cat": rng.choice(["a", "b", "c"], size=100),
+        "bool": rng.choice([True, False], size=100),
+    })
+    y = rng.randint(0, 2, size=100)
+
+ smote_nc = SMOTENC(categorical_features=[1, 2], random_state=0)
+ X_res, y_res = smote_nc.fit_resample(X, y)
+
+ assert isinstance(X_res, pd.DataFrame)
+ assert X_res.dtypes["bool"] == bool
+ assert X_res.shape[0] > X.shape[0]
+ assert Counter(y_res)[0] == Counter(y_res)[1]
def test_smotenc_categorical_features_str():
"""Check that we support array-like of strings for `categorical_features` using
pandas dataframe.
"""
- pass
+    pd = pytest.importorskip("pandas")
+    rng = np.random.RandomState(0)
+
+    X = pd.DataFrame({
+        "num1": rng.randn(100),
+        "cat1": rng.choice(["a", "b", "c"], size=100),
+        "num2": rng.randn(100),
+        "cat2": rng.choice(["x", "y", "z"], size=100),
+    })
+    y = rng.randint(0, 2, size=100)
+
+ smote_nc = SMOTENC(categorical_features=["cat1", "cat2"], random_state=0)
+ X_res, y_res = smote_nc.fit_resample(X, y)
+
+ assert isinstance(X_res, pd.DataFrame)
+ assert X_res.shape[0] > X.shape[0]
+ assert Counter(y_res)[0] == Counter(y_res)[1]
+ assert set(X_res["cat1"].unique()) == set(X["cat1"].unique())
+ assert set(X_res["cat2"].unique()) == set(X["cat2"].unique())
def test_smotenc_categorical_features_auto():
"""Check that we can automatically detect categorical features based on pandas
dataframe.
"""
- pass
+    pd = pytest.importorskip("pandas")
+    rng = np.random.RandomState(0)
+
+    X = pd.DataFrame({
+        "num1": rng.randn(100),
+        "cat1": pd.Categorical(rng.choice(["a", "b", "c"], size=100)),
+        "num2": rng.randn(100),
+        "cat2": pd.Categorical(rng.choice(["x", "y", "z"], size=100)),
+    })
+    y = rng.randint(0, 2, size=100)
+
+ smote_nc = SMOTENC(categorical_features="auto", random_state=0)
+ X_res, y_res = smote_nc.fit_resample(X, y)
+
+ assert isinstance(X_res, pd.DataFrame)
+ assert X_res.shape[0] > X.shape[0]
+ assert Counter(y_res)[0] == Counter(y_res)[1]
+ assert set(X_res["cat1"].unique()) == set(X["cat1"].unique())
+ assert set(X_res["cat2"].unique()) == set(X["cat2"].unique())
+ assert smote_nc.categorical_features_ == [1, 3]
def test_smote_nc_categorical_features_auto_error():
"""Check that we raise a proper error when we cannot use the `'auto'` mode."""
- pass
+ X, y = make_classification(n_samples=100, n_features=5, n_classes=2,
+ weights=[0.9, 0.1], random_state=0)
+
+ smote_nc = SMOTENC(categorical_features="auto", random_state=0)
+
+ with pytest.raises(ValueError, match="The 'auto' option for categorical_features"):
+ smote_nc.fit_resample(X, y)
diff --git a/imblearn/over_sampling/_smote/tests/test_smoten.py b/imblearn/over_sampling/_smote/tests/test_smoten.py
index b4fceeb..2dd21c4 100644
--- a/imblearn/over_sampling/_smote/tests/test_smoten.py
+++ b/imblearn/over_sampling/_smote/tests/test_smoten.py
@@ -13,9 +13,32 @@ def test_smoten_sparse_input(data, sparse_format):
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/971
"""
- pass
+    X, y = data
+
+    # the fixture carries string categories: encode them before converting
+    # to a sparse container
+    X_encoded = OrdinalEncoder().fit_transform(X)
+    X_sparse = _convert_container(X_encoded, constructor_name=sparse_format)
+
+    smoten = SMOTEN(random_state=42)
+    X_res, y_res = smoten.fit_resample(X_sparse, y)
+
+    assert X_res.format == X_sparse.format
+    assert X_res.shape[0] >= X_sparse.shape[0]
+    assert y_res.shape[0] == X_res.shape[0]
def test_smoten_categorical_encoder(data):
"""Check that `categorical_encoder` is used when provided."""
- pass
+    X, y = data
+
+    encoder = OrdinalEncoder(dtype=np.int32)
+    smoten = SMOTEN(categorical_encoder=encoder, random_state=42)
+    smoten.fit_resample(X, y)
+
+    # the parameter is stored untouched while a clone is actually fitted
+    assert smoten.categorical_encoder is encoder
+    assert smoten.categorical_encoder_ is not encoder
+    assert isinstance(smoten.categorical_encoder_, OrdinalEncoder)
diff --git a/imblearn/over_sampling/_smote/tests/test_svm_smote.py b/imblearn/over_sampling/_smote/tests/test_svm_smote.py
index dd43004..3655163 100644
--- a/imblearn/over_sampling/_smote/tests/test_svm_smote.py
+++ b/imblearn/over_sampling/_smote/tests/test_svm_smote.py
@@ -11,7 +11,10 @@ from imblearn.over_sampling import SVMSMOTE
def test_svm_smote_not_svm(data):
"""Check that we raise a proper error if passing an estimator that does not
expose a `support_` fitted attribute."""
- pass
+    X, y = data
+    svm_smote = SVMSMOTE(svm_estimator=LogisticRegression())
+    with pytest.raises((ValueError, RuntimeError), match="support_"):
+        svm_smote.fit_resample(X, y)
def test_svm_smote_all_noise(data):
@@ -21,4 +24,10 @@ def test_svm_smote_all_noise(data):
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/742
"""
- pass
+    X, y = make_classification(
+        n_samples=50, n_classes=2, weights=[0.9, 0.1], random_state=42
+    )
+    svm = SVC(kernel="rbf", gamma=1000)  # huge gamma so every sample is flagged as noise
+    svm_smote = SVMSMOTE(svm_estimator=svm, k_neighbors=1)
+    with pytest.raises(RuntimeError, match="No support vectors found in the SVM model."):
+        svm_smote.fit_resample(X, y)
diff --git a/imblearn/over_sampling/tests/test_random_over_sampler.py b/imblearn/over_sampling/tests/test_random_over_sampler.py
index c239b21..2f32bbb 100644
--- a/imblearn/over_sampling/tests/test_random_over_sampler.py
+++ b/imblearn/over_sampling/tests/test_random_over_sampler.py
@@ -14,12 +14,33 @@ RND_SEED = 0
def test_random_over_sampler_strings(sampling_strategy):
"""Check that we support all supposed strings as `sampling_strategy` in
a sampler inheriting from `BaseOverSampler`."""
- pass
+    X, y = make_classification(n_samples=100, n_classes=2, weights=[0.9, 0.1],
+                               random_state=RND_SEED)
+    ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=RND_SEED)
+    X_resampled, y_resampled = ros.fit_resample(X, y)
+
+    if sampling_strategy in ('auto', 'minority', 'not majority', 'all'):
+        # the minority class is over-sampled up to the majority count
+        assert Counter(y_resampled)[0] == Counter(y_resampled)[1]
+    else:  # 'not minority' targets only the majority class: nothing to add
+        assert Counter(y_resampled) == Counter(y)
def test_random_over_sampling_datetime():
"""Check that we don't convert input data and only sample from it."""
- pass
+ X = np.array([datetime(2015, 2, 1), datetime(2015, 2, 2), datetime(2015, 2, 3),
+ datetime(2015, 2, 4), datetime(2015, 2, 5)]).reshape(-1, 1)
+ y = np.array([0, 0, 0, 1, 1])
+
+ ros = RandomOverSampler(random_state=RND_SEED)
+ X_resampled, y_resampled = ros.fit_resample(X, y)
+
+ assert X_resampled.dtype == X.dtype
+ assert len(X_resampled) == 6
+ assert Counter(y_resampled) == {0: 3, 1: 3}
+ assert all(isinstance(x[0], datetime) for x in X_resampled)
def test_random_over_sampler_full_nat():
@@ -28,4 +49,14 @@ def test_random_over_sampler_full_nat():
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055
"""
- pass
+ X = np.array([np.timedelta64('NaT'), np.timedelta64('NaT'),
+ np.timedelta64('NaT'), np.timedelta64('NaT')]).reshape(-1, 1)
+ y = np.array([0, 0, 1, 1])
+
+ ros = RandomOverSampler(random_state=RND_SEED)
+ X_resampled, y_resampled = ros.fit_resample(X, y)
+
+ assert X_resampled.dtype == X.dtype
+ assert len(X_resampled) == 4
+ assert Counter(y_resampled) == {0: 2, 1: 2}
+ assert np.all(np.isnat(X_resampled))
diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py
index c55dd52..e606d54 100644
--- a/imblearn/tensorflow/_generator.py
+++ b/imblearn/tensorflow/_generator.py
@@ -53,4 +53,43 @@ def balanced_batch_generator(X, y, *, sample_weight=None, sampler=None,
steps_per_epoch : int
The number of samples per epoch.
"""
- pass
+    random_state = check_random_state(random_state)
+
+    if sampler is None:
+        sampler_ = RandomUnderSampler(random_state=random_state)
+    else:
+        sampler_ = clone(sampler)
+
+    # Fit the sampler: only the selected indices are needed
+    sampler_.fit_resample(X, y)
+    if not hasattr(sampler_, 'sample_indices_'):
+        raise ValueError("'sampler' needs to have an attribute 'sample_indices_'.")
+    indices = sampler_.sample_indices_
+
+    steps_per_epoch = int(indices.size // batch_size)
+
+    def generator():
+        while True:
+            # reshuffle at every epoch since samplers pack indices by class
+            random_state.shuffle(indices)
+
+            for start in range(0, len(indices), batch_size):
+                batch_indices = indices[start:start + batch_size]
+
+                X_batch = _safe_indexing(X, batch_indices)
+                y_batch = _safe_indexing(y, batch_indices)
+
+                if not keep_sparse and issparse(X_batch):
+                    X_batch = X_batch.toarray()
+
+                if sample_weight is not None:
+                    sw_batch = _safe_indexing(sample_weight, batch_indices)
+                    yield X_batch, y_batch, sw_batch
+                else:
+                    yield X_batch, y_batch
+
+    return generator(), steps_per_epoch
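A usage sketch of the generator; the commented-out model call is illustrative (any Keras-style `fit` accepting a generator and `steps_per_epoch` works):

import numpy as np
from sklearn.datasets import make_classification
from imblearn.tensorflow import balanced_batch_generator

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
generator, steps_per_epoch = balanced_batch_generator(
    X, y, batch_size=32, random_state=0
)
X_batch, y_batch = next(generator)
print(X_batch.shape, np.bincount(y_batch))  # batches come from the balanced set
# model.fit(generator, steps_per_epoch=steps_per_epoch, epochs=10)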
diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
index 8b6d169..61baf2b 100644
--- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
+++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py
@@ -117,4 +117,19 @@ class ClusterCentroids(BaseUnderSampler):
def _validate_estimator(self):
"""Private function to create the KMeans estimator"""
- pass
+        if self.estimator is None:
+            self.estimator_ = KMeans(random_state=self.random_state)
+        elif isinstance(self.estimator, type):
+            self.estimator_ = self.estimator(random_state=self.random_state)
+        else:
+            self.estimator_ = clone(self.estimator)
+
+        # `X` is not available in this method: for `voting="auto"` the
+        # effective strategy (hard for sparse input, soft otherwise) has to
+        # be resolved in `_fit_resample`, where the input is known.
+        if self.voting != "auto":
+            self.voting_ = self.voting
diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
index 5bd5434..947fba3 100644
--- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
+++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py
@@ -130,9 +130,116 @@ class CondensedNearestNeighbour(BaseCleaningSampler):
def _validate_estimator(self):
"""Private function to create the NN estimator"""
- pass
+        if self.n_neighbors is None:
+            estimator = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs)
+        elif isinstance(self.n_neighbors, int):
+            estimator = KNeighborsClassifier(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs)
+        else:
+            estimator = clone(self.n_neighbors)
+
+        # `estimator_` is a deprecated read-only property: return the
+        # template so that `fit_resample` can store one clone per class
+        # in `estimators_`.
+        return estimator
@property
def estimator_(self):
"""Last fitted k-NN estimator."""
- pass
+        warnings.warn(
+            "The attribute `estimator_` is deprecated in 0.12 and will be "
+            "removed in 0.14. Use `estimators_` instead.",
+            FutureWarning,
+        )
+        return self.estimators_[0] if hasattr(self, "estimators_") else None
+
+ def fit_resample(self, X, y):
+ """Resample the dataset.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
+ Matrix containing the data which have to be sampled.
+
+ y : array-like of shape (n_samples,)
+ Corresponding label for each sample in X.
+
+ Returns
+ -------
+ X_resampled : {array-like, sparse matrix} of shape (n_samples_new, n_features)
+ The array containing the resampled data.
+
+ y_resampled : array-like of shape (n_samples_new,)
+ The corresponding label of `X_resampled`.
+ """
+        estimator = self._validate_estimator()
+        self.estimators_ = []
+
+        X, y = self._check_X_y(X, y)
+
+        self.sampling_strategy_ = self._check_sampling_strategy(y, self.sampling_strategy)
+
+        random_state = check_random_state(self.random_state)
+
+        classes, counts = np.unique(y, return_counts=True)
+        class_minority = classes[np.argmin(counts)]
+        idx_minority = np.flatnonzero(y == class_minority)
+
+        idx_under = np.empty((0,), dtype=int)
+
+        for target_class in classes:
+            if target_class not in self.sampling_strategy_.keys():
+                # the minority class and any non-targeted class are kept whole
+                idx_under = np.concatenate((idx_under, np.flatnonzero(y == target_class)))
+                continue
+
+            idx_maj = np.flatnonzero(y == target_class)
+            random_state.shuffle(idx_maj)
+
+            # Initialize the condensed set C with all the minority samples
+            # and `n_seeds_S` randomly drawn samples of the current class
+            idx_C = np.concatenate((idx_minority, idx_maj[:self.n_seeds_S]))
+            idx_remaining = idx_maj[self.n_seeds_S:]
+
+            self.estimators_.append(clone(estimator))
+            self.estimators_[-1].fit(_safe_indexing(X, idx_C), _safe_indexing(y, idx_C))
+
+            # Scan the remaining samples of the class: every misclassified
+            # sample is absorbed into C and the classifier is refitted
+            for idx in idx_remaining:
+                pred = self.estimators_[-1].predict(_safe_indexing(X, [idx]))
+                if pred[0] != y[idx]:
+                    idx_C = np.append(idx_C, idx)
+                    self.estimators_[-1].fit(_safe_indexing(X, idx_C), _safe_indexing(y, idx_C))
+
+            idx_under = np.concatenate((idx_under, np.setdiff1d(idx_C, idx_minority)))
+
+        self.sample_indices_ = np.sort(idx_under)
+
+        return (_safe_indexing(X, self.sample_indices_),
+                _safe_indexing(y, self.sample_indices_))
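A quick check of the condensation on a toy imbalanced problem (a sketch; the exact number of retained majority samples depends on the seed):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import CondensedNearestNeighbour

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
cnn = CondensedNearestNeighbour(random_state=0)
X_res, y_res = cnn.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))  # the majority class shrinks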
diff --git a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py
index 067fe55..be5799c 100644
--- a/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py
+++ b/imblearn/under_sampling/_prototype_selection/_edited_nearest_neighbours.py
@@ -126,7 +126,8 @@ class EditedNearestNeighbours(BaseCleaningSampler):
def _validate_estimator(self):
"""Validate the estimator created in the ENN."""
- pass
+ self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, additional_neighbor=1)
+ self.nn_.set_params(**{'n_jobs': self.n_jobs})
@Substitution(sampling_strategy=BaseCleaningSampler.
@@ -258,7 +259,14 @@ class RepeatedEditedNearestNeighbours(BaseCleaningSampler):
def _validate_estimator(self):
"""Private function to create the NN estimator"""
- pass
+ self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, additional_neighbor=1)
+ self.nn_.set_params(**{'n_jobs': self.n_jobs})
+ self.enn_ = EditedNearestNeighbours(
+ sampling_strategy=self.sampling_strategy,
+ n_neighbors=self.nn_,
+ kind_sel=self.kind_sel,
+ n_jobs=self.n_jobs,
+ )
@Substitution(sampling_strategy=BaseCleaningSampler.
@@ -388,4 +396,11 @@ class AllKNN(BaseCleaningSampler):
def _validate_estimator(self):
"""Create objects required by AllKNN"""
- pass
+ self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, additional_neighbor=1)
+ self.nn_.set_params(**{'n_jobs': self.n_jobs})
+ self.enn_ = EditedNearestNeighbours(
+ sampling_strategy=self.sampling_strategy,
+ n_neighbors=self.nn_,
+ kind_sel=self.kind_sel,
+ n_jobs=self.n_jobs,
+ )
diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
index c858b97..2451050 100644
--- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
+++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py
@@ -113,4 +113,22 @@ class InstanceHardnessThreshold(BaseUnderSampler):
def _validate_estimator(self, random_state):
"""Private function to create the classifier"""
- pass
+ if self.estimator is None:
+ estimator = RandomForestClassifier(
+ n_estimators=100, random_state=random_state, n_jobs=self.n_jobs
+ )
+ else:
+ estimator = clone(self.estimator)
+
+ if not is_classifier(estimator):
+ raise ValueError(
+ f"'{estimator.__class__.__name__}' is not a classifier."
+ )
+
+ if not hasattr(estimator, "predict_proba"):
+ raise ValueError(
+ f"Estimator {estimator.__class__.__name__} does not have a predict_proba method."
+ )
+
+ _set_random_states(estimator, random_state)
+ return estimator
diff --git a/imblearn/under_sampling/_prototype_selection/_nearmiss.py b/imblearn/under_sampling/_prototype_selection/_nearmiss.py
index f64b76a..bd22dd6 100644
--- a/imblearn/under_sampling/_prototype_selection/_nearmiss.py
+++ b/imblearn/under_sampling/_prototype_selection/_nearmiss.py
@@ -148,8 +148,83 @@ class NearMiss(BaseUnderSampler):
The list of the indices of the selected samples.
"""
- pass
+        target_class_indices = np.flatnonzero(y == key)
+        if sel_strategy == 'nearest':
+            order = np.argsort(dist_vec[target_class_indices])
+        elif sel_strategy == 'farthest':
+            order = np.argsort(dist_vec[target_class_indices])[::-1]
+        else:
+            raise ValueError(f"Unknown selection strategy: {sel_strategy!r}")
+
+        # the returned indices are local to the samples of class `key`; the
+        # caller is in charge of mapping them back to global indices
+        return target_class_indices[order][:num_samples]
def _validate_estimator(self):
"""Private function to create the NN estimator"""
- pass
+ self.nn_ = check_neighbors_object('n_neighbors', self.n_neighbors, additional_neighbor=1)
+ self.nn_.set_params(**{'n_jobs': self.n_jobs})
+
+ if self.version == 3:
+ self.nn_ver3_ = check_neighbors_object('n_neighbors_ver3', self.n_neighbors_ver3)
+ self.nn_ver3_.set_params(**{'n_jobs': self.n_jobs})
+
+ def _fit_resample(self, X, y):
+        self._validate_estimator()
+
+        target_stats = Counter(y)
+        class_minority = min(target_stats, key=target_stats.get)
+        minority_indices = np.flatnonzero(y == class_minority)
+        idx_under = np.empty((0,), dtype=int)
+
+        # the NN model measuring the distances to the minority class is
+        # shared by the three NearMiss versions
+        self.nn_.fit(_safe_indexing(X, minority_indices))
+
+        for target_class in np.unique(y):
+            if target_class not in self.sampling_strategy_.keys():
+                # non-targeted classes (including the minority) are kept whole
+                idx_under = np.concatenate((idx_under, np.flatnonzero(y == target_class)))
+                continue
+
+            n_samples = self.sampling_strategy_[target_class]
+            target_class_indices = np.flatnonzero(y == target_class)
+            X_class = _safe_indexing(X, target_class_indices)
+            y_class = _safe_indexing(y, target_class_indices)
+
+            if self.version == 1:
+                # average distance to the k nearest minority neighbours
+                dist_vec, _ = self.nn_.kneighbors(X_class, n_neighbors=self.nn_.n_neighbors)
+                dist_vec = np.mean(dist_vec, axis=1)
+                sel = self._selection_dist_based(
+                    X_class, y_class, dist_vec, n_samples,
+                    target_class, sel_strategy='nearest',
+                )
+            elif self.version == 2:
+                # average distance to all the minority samples
+                dist_vec, _ = self.nn_.kneighbors(
+                    X_class, n_neighbors=target_stats[class_minority])
+                dist_vec = np.mean(dist_vec, axis=1)
+                sel = self._selection_dist_based(
+                    X_class, y_class, dist_vec, n_samples,
+                    target_class, sel_strategy='nearest',
+                )
+            else:  # version == 3
+                # short-list the samples that are neighbours of the minority
+                # class, then keep the farthest ones
+                self.nn_ver3_.fit(X_class)
+                _, idx_vec = self.nn_ver3_.kneighbors(_safe_indexing(X, minority_indices))
+                short_list = np.unique(idx_vec.reshape(-1))
+                dist_vec, _ = self.nn_.kneighbors(
+                    _safe_indexing(X_class, short_list),
+                    n_neighbors=self.nn_.n_neighbors)
+                dist_vec = np.mean(dist_vec, axis=1)
+                sel = short_list[self._selection_dist_based(
+                    _safe_indexing(X_class, short_list),
+                    _safe_indexing(y_class, short_list),
+                    dist_vec, min(n_samples, short_list.size),
+                    target_class, sel_strategy='farthest',
+                )]
+
+            # map the class-local selection back to global indices
+            idx_under = np.concatenate((idx_under, target_class_indices[sel]))
+
+        self.sample_indices_ = idx_under
+
+        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)
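The three NearMiss heuristics only differ in how the distance vector is computed before the selection step; comparing them side by side:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NearMiss

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
for version in (1, 2, 3):
    X_res, y_res = NearMiss(version=version).fit_resample(X, y)
    print(version, Counter(y_res))  # all versions balance the classes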
diff --git a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py
index e0a2f31..a3808ca 100644
--- a/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py
+++ b/imblearn/under_sampling/_prototype_selection/_neighbourhood_cleaning_rule.py
@@ -150,4 +150,33 @@ class NeighbourhoodCleaningRule(BaseCleaningSampler):
def _validate_estimator(self):
"""Create the objects required by NCR."""
- pass
+ if self.edited_nearest_neighbours is None:
+ self.edited_nearest_neighbours_ = EditedNearestNeighbours(
+ sampling_strategy="all",
+ n_neighbors=self.n_neighbors,
+ kind_sel="all",
+ n_jobs=self.n_jobs,
+ )
+ else:
+ self.edited_nearest_neighbours_ = clone(self.edited_nearest_neighbours)
+
+ if isinstance(self.n_neighbors, numbers.Integral):
+ self.nn_ = KNeighborsClassifier(
+ n_neighbors=self.n_neighbors, n_jobs=self.n_jobs
+ )
+ elif isinstance(self.n_neighbors, KNeighborsClassifier):
+ self.nn_ = clone(self.n_neighbors)
+ else:
+            raise ValueError(
+                "`n_neighbors` must be an int or an object inherited from "
+                f"KNeighborsClassifier. Got {type(self.n_neighbors)} instead."
+            )
+
+ if self.kind_sel != "deprecated":
+ warnings.warn(
+ "'kind_sel' is deprecated in 0.12 and will be removed in 0.14. "
+ "The parameter currently has no effect and always corresponds "
+ "to the 'all' strategy.",
+ FutureWarning,
+ )
diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
index 6c0b322..cf5abb7 100644
--- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
+++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py
@@ -126,9 +126,69 @@ class OneSidedSelection(BaseCleaningSampler):
def _validate_estimator(self):
"""Private function to create the NN estimator"""
- pass
+        if self.n_neighbors is None:
+            estimator = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs)
+        elif isinstance(self.n_neighbors, int):
+            estimator = KNeighborsClassifier(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs)
+        else:
+            estimator = clone(self.n_neighbors)
+
+        # `estimator_` is a deprecated read-only property; `_fit_resample`
+        # stores one fitted clone per class in `estimators_`.
+        return estimator
@property
def estimator_(self):
"""Last fitted k-NN estimator."""
- pass
+        warnings.warn(
+            "The attribute `estimator_` is deprecated in 0.12 and will be "
+            "removed in 0.14. Use `estimators_` instead.",
+            FutureWarning,
+        )
+        return self.estimators_[0] if hasattr(self, "estimators_") else None
+
+    def _fit_resample(self, X, y):
+        estimator = self._validate_estimator()
+        self.estimators_ = []
+
+        random_state = check_random_state(self.random_state)
+        target_stats = Counter(y)
+        class_minority = min(target_stats, key=target_stats.get)
+        idx_minority = np.flatnonzero(y == class_minority)
+
+        idx_under = np.empty((0,), dtype=int)
+
+        for target_class in np.unique(y):
+            if target_class not in self.sampling_strategy_.keys():
+                # the minority class and non-targeted classes are kept whole
+                idx_under = np.concatenate((idx_under, np.flatnonzero(y == target_class)))
+                continue
+
+            idx_maj = np.flatnonzero(y == target_class)
+            sel_idx = random_state.choice(idx_maj, size=self.n_seeds_S, replace=False)
+
+            # Build the set C with the minority samples plus the seeds, and
+            # keep the samples of the current class that C misclassifies
+            idx_C = np.concatenate((idx_minority, sel_idx))
+            self.estimators_.append(clone(estimator))
+            self.estimators_[-1].fit(_safe_indexing(X, idx_C), _safe_indexing(y, idx_C))
+
+            idx_remaining = np.setdiff1d(idx_maj, sel_idx)
+            pred = self.estimators_[-1].predict(_safe_indexing(X, idx_remaining))
+            idx_misclassified = idx_remaining[pred != target_class]
+
+            idx_under = np.concatenate((idx_under, sel_idx, idx_misclassified))
+
+        # Remove the Tomek links among the kept samples; `sample_indices_`
+        # of the cleaner indexes into the kept subset, so map it back
+        tl = TomekLinks(sampling_strategy="all")
+        tl.fit_resample(_safe_indexing(X, idx_under), _safe_indexing(y, idx_under))
+        idx_under = idx_under[tl.sample_indices_]
+
+        self.sample_indices_ = np.sort(idx_under)
+
+        return (_safe_indexing(X, self.sample_indices_),
+                _safe_indexing(y, self.sample_indices_))
diff --git a/imblearn/under_sampling/_prototype_selection/_tomek_links.py b/imblearn/under_sampling/_prototype_selection/_tomek_links.py
index a64dc2a..d13b67d 100644
--- a/imblearn/under_sampling/_prototype_selection/_tomek_links.py
+++ b/imblearn/under_sampling/_prototype_selection/_tomek_links.py
@@ -112,4 +112,11 @@ class TomekLinks(BaseCleaningSampler):
Boolean vector on len( # samples ), with True for majority samples
that are Tomek links.
"""
- pass
+ is_tomek = np.zeros(len(y), dtype=bool)
+
+ for idx, (label, nn_label) in enumerate(zip(y, y[nn_index])):
+ if label != class_type and nn_label == class_type:
+ if nn_index[nn_index[idx]] == idx:
+ is_tomek[idx] = True
+
+ return is_tomek
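The mutual-neighbour test `nn_index[nn_index[idx]] == idx` is what identifies a Tomek link: a pair of samples from different classes that are each other's nearest neighbour. A tiny worked example:

import numpy as np

nn_index = np.array([1, 0, 3, 1])  # nearest neighbour of each sample
y = np.array([0, 1, 1, 1])

for idx in range(len(y)):
    mutual = nn_index[nn_index[idx]] == idx
    cross_class = y[idx] != y[nn_index[idx]]
    print(idx, bool(mutual and cross_class))  # True for both ends of the 0-1 link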
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py
index b14a1ef..a822020 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py
@@ -20,9 +20,23 @@ Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1])
def test_condensed_nearest_neighbour_multiclass():
"""Check the validity of the fitted attributes `estimators_`."""
- pass
+    cnn = CondensedNearestNeighbour(random_state=RND_SEED)
+    X_resampled, y_resampled = cnn.fit_resample(X, Y)
+
+    # one 1-NN estimator is fitted per condensed class
+    assert hasattr(cnn, 'estimators_')
+    assert all(isinstance(est, KNeighborsClassifier) for est in cnn.estimators_)
+    assert all(est.n_neighbors == 1 for est in cnn.estimators_)
+
+    assert X_resampled.shape[0] < X.shape[0]
+    assert y_resampled.shape[0] == X_resampled.shape[0]
+    assert len(np.unique(y_resampled)) == len(np.unique(Y))
def test_condensed_nearest_neighbors_deprecation():
"""Check that we raise a FutureWarning when accessing the parameter `estimator_`."""
- pass
+ cnn = CondensedNearestNeighbour(random_state=RND_SEED)
+ cnn.fit_resample(X, Y)
+
+ with pytest.warns(FutureWarning, match="The attribute `estimator_` is deprecated"):
+ _ = cnn.estimator_
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py
index 9333224..b9794d1 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py
@@ -19,4 +19,19 @@ Y = np.array([1, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 1])
def test_enn_check_kind_selection():
"""Check that `check_sel="all"` is more conservative than
`check_sel="mode"`."""
- pass
+    enn_all = EditedNearestNeighbours(sampling_strategy="all", n_neighbors=3, kind_sel="all")
+    enn_mode = EditedNearestNeighbours(sampling_strategy="all", n_neighbors=3, kind_sel="mode")
+
+    X_resampled_all, y_resampled_all = enn_all.fit_resample(X, Y)
+    X_resampled_mode, y_resampled_mode = enn_mode.fit_resample(X, Y)
+
+    # "all" requires unanimity among the neighbours to keep a sample, so it
+    # removes at least as many samples as the majority-vote "mode"
+    assert len(X_resampled_all) <= len(X_resampled_mode)
+    assert len(y_resampled_all) == len(X_resampled_all)
+
+    # the samples kept by "all" are a subset of the ones kept by "mode"
+    assert set(enn_all.sample_indices_).issubset(enn_mode.sample_indices_)
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py
index bdb3a01..cac46a7 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py
@@ -26,4 +26,30 @@ def test_iht_estimator_pipeline():
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/pull/1049
"""
- pass
+ # Create a pipeline with a classifier
+ clf = make_pipeline(NB())
+
+ # Create an InstanceHardnessThreshold object with the pipeline
+ iht = InstanceHardnessThreshold(estimator=clf, random_state=RND_SEED)
+
+ # Fit and transform the data
+ X_resampled, y_resampled = iht.fit_resample(X, Y)
+
+ # Check that the resampled data has fewer samples than the original
+ assert len(X_resampled) < len(X)
+ assert len(y_resampled) < len(Y)
+
+ # Check that the resampled data maintains the same number of features
+ assert X_resampled.shape[1] == X.shape[1]
+
+ # Check that the resampled labels are a subset of the original labels
+ assert set(y_resampled).issubset(set(Y))
+
+ # Check that the resampled data and labels have the same length
+ assert len(X_resampled) == len(y_resampled)
+
+ # Ensure that the random state produces consistent results
+ iht2 = InstanceHardnessThreshold(estimator=clf, random_state=RND_SEED)
+ X_resampled2, y_resampled2 = iht2.fit_resample(X, Y)
+ assert_array_equal(X_resampled, X_resampled2)
+ assert_array_equal(y_resampled, y_resampled2)
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py
index 97c8fd5..8f3e803 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py
@@ -7,11 +7,50 @@ from sklearn.utils._testing import assert_array_equal
from imblearn.under_sampling import EditedNearestNeighbours, NeighbourhoodCleaningRule
-def test_ncr_threshold_cleaning(data):
+def test_ncr_threshold_cleaning():
"""Test the effect of the `threshold_cleaning` parameter."""
- pass
+    X, y = make_classification(
+        n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5],
+        random_state=42,
+    )
+
+    # Default `threshold_cleaning` (0.5); NCR is deterministic, so no
+    # `random_state` is needed (the sampler does not expose one)
+    ncr = NeighbourhoodCleaningRule()
+    _, y_resampled = ncr.fit_resample(X, y)
+    counter_default = Counter(y_resampled)
+
+    # A higher threshold (0.8) excludes more classes from the cleaning phase
+    ncr_high = NeighbourhoodCleaningRule(threshold_cleaning=0.8)
+    _, y_resampled_high = ncr_high.fit_resample(X, y)
+    counter_high = Counter(y_resampled_high)
+
+    # less cleaning keeps more samples
+    assert sum(counter_high.values()) > sum(counter_default.values())
-def test_ncr_n_neighbors(data):
+def test_ncr_n_neighbors():
"""Check the effect of the NN on the cleaning of the second phase."""
- pass
+    X, y = make_classification(
+        n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5],
+        random_state=42,
+    )
+
+    # Default `n_neighbors` (3)
+    ncr_default = NeighbourhoodCleaningRule()
+    _, y_resampled_default = ncr_default.fit_resample(X, y)
+    counter_default = Counter(y_resampled_default)
+
+    # A larger neighbourhood (5) changes which samples get cleaned
+    ncr_high = NeighbourhoodCleaningRule(n_neighbors=5)
+    _, y_resampled_high = ncr_high.fit_resample(X, y)
+    counter_high = Counter(y_resampled_high)
+
+    assert sum(counter_default.values()) != sum(counter_high.values())
+
+    # the underlying ENN step is affected in the same way
+    enn_default = EditedNearestNeighbours(n_neighbors=3, kind_sel="all")
+    enn_high = EditedNearestNeighbours(n_neighbors=5, kind_sel="all")
+
+    _, y_enn_default = enn_default.fit_resample(X, y)
+    _, y_enn_high = enn_high.fit_resample(X, y)
+
+    assert Counter(y_enn_default) != Counter(y_enn_high)
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py
index e861896..a0dab1f 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py
@@ -18,9 +18,22 @@ Y = np.array([0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0])
def test_one_sided_selection_multiclass():
"""Check the validity of the fitted attributes `estimators_`."""
- pass
+    X, y = make_classification(
+        n_samples=300,
+        n_classes=3,
+        n_informative=5,
+        random_state=RND_SEED,
+    )
+    oss = OneSidedSelection(random_state=RND_SEED)
+    oss.fit_resample(X, y)
+
+    # one 1-NN estimator is fitted per class that has been condensed
+    assert hasattr(oss, 'estimators_')
+    assert all(isinstance(est, KNeighborsClassifier) for est in oss.estimators_)
+    assert all(est.n_neighbors == 1 for est in oss.estimators_)
def test_one_sided_selection_deprecation():
"""Check that we raise a FutureWarning when accessing the parameter `estimator_`."""
- pass
+    oss = OneSidedSelection(random_state=RND_SEED)
+    oss.fit_resample(X, Y)
+    with pytest.warns(FutureWarning, match="The attribute `estimator_` is deprecated"):
+        _ = oss.estimator_
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
index 96745c6..75d2b85 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_random_under_sampler.py
@@ -19,12 +19,44 @@ Y = np.array([1, 0, 1, 0, 1, 1, 1, 1, 0, 1])
def test_random_under_sampler_strings(sampling_strategy):
"""Check that we support all supposed strings as `sampling_strategy` in
a sampler inheriting from `BaseUnderSampler`."""
- pass
+    X, y = make_classification(
+        n_samples=1000, n_classes=3, n_informative=4, weights=[0.2, 0.3, 0.5],
+        random_state=RND_SEED,
+    )
+    rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=RND_SEED)
+    X_res, y_res = rus.fit_resample(X, y)
+
+    classes_orig, counts_orig = np.unique(y, return_counts=True)
+    classes_res, counts_res = np.unique(y_res, return_counts=True)
+
+    # under-sampling never removes a class entirely
+    assert set(classes_res) == set(classes_orig)
+
+    minority_count = counts_orig.min()
+    majority_class = classes_orig[np.argmax(counts_orig)]
+
+    if sampling_strategy in ('auto', 'not minority', 'all'):
+        # every targeted class is brought down to the minority count
+        assert np.all(counts_res == minority_count)
+    elif sampling_strategy == 'majority':
+        assert counts_res[classes_res == majority_class][0] == minority_count
+    elif sampling_strategy == 'not majority':
+        # the majority class itself is left untouched
+        assert counts_res[classes_res == majority_class][0] == counts_orig.max()
def test_random_under_sampling_datetime():
"""Check that we don't convert input data and only sample from it."""
- pass
+ X = np.array([datetime(2022, 1, i) for i in range(1, 11)])
+ y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
+
+ rus = RandomUnderSampler(random_state=RND_SEED)
+ X_res, y_res = rus.fit_resample(X.reshape(-1, 1), y)
+
+ # Check that the resampled data is not empty
+ assert len(X_res) > 0
+ assert len(y_res) > 0
+
+ # Check that the resampled data is still of datetime type
+ assert isinstance(X_res[0][0], datetime)
+
+ # Check that the number of samples in each class is balanced
+ unique, counts = np.unique(y_res, return_counts=True)
+ assert len(unique) == 2
+ assert counts[0] == counts[1]
def test_random_under_sampler_full_nat():
@@ -33,4 +65,20 @@ def test_random_under_sampler_full_nat():
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/1055
"""
- pass
+ X = np.array([np.timedelta64('NaT')] * 10).reshape(-1, 1)
+ y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
+
+ rus = RandomUnderSampler(random_state=RND_SEED)
+ X_res, y_res = rus.fit_resample(X, y)
+
+ # Check that the resampled data is not empty
+ assert len(X_res) > 0
+ assert len(y_res) > 0
+
+ # Check that all values in X_res are NaT
+ assert np.all(np.isnat(X_res))
+
+ # Check that the number of samples in each class is balanced
+ unique, counts = np.unique(y_res, return_counts=True)
+ assert len(unique) == 2
+ assert counts[0] == counts[1]
diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py
index c1fd8e5..79b9776 100644
--- a/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py
+++ b/imblearn/under_sampling/_prototype_selection/tests/test_tomek_links.py
@@ -21,4 +21,22 @@ Y = np.array([1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
def test_tomek_links_strings(sampling_strategy):
"""Check that we support all supposed strings as `sampling_strategy` in
a sampler inheriting from `BaseCleaningSampler`."""
- pass
+ tomek = TomekLinks(sampling_strategy=sampling_strategy)
+ X_resampled, y_resampled = tomek.fit_resample(X, Y)
+
+ # Check that the resampled data is not empty
+ assert len(X_resampled) > 0
+ assert len(y_resampled) > 0
+
+ # Check that the number of samples in X and y are the same
+ assert len(X_resampled) == len(y_resampled)
+
+ # Check that the number of samples in the resampled data is less than or equal to the original data
+ assert len(X_resampled) <= len(X)
+ assert len(y_resampled) <= len(Y)
+
+ # Check that the shape of X_resampled is correct
+ assert X_resampled.shape[1] == X.shape[1]
+
+ # Check that all classes are still present in the resampled data
+ assert set(np.unique(y_resampled)) == set(np.unique(Y))
diff --git a/imblearn/utils/_available_if.py b/imblearn/utils/_available_if.py
index 51d5fc6..bb7f712 100644
--- a/imblearn/utils/_available_if.py
+++ b/imblearn/utils/_available_if.py
@@ -82,6 +82,8 @@ if sklearn_version < parse_version('1.1'):
>>> obj.say_hello()
Hello
"""
- pass
+ def decorator(fn):
+ return _AvailableIfDescriptor(fn, check, attribute_name=fn.__name__)
+ return decorator
else:
from sklearn.utils.metaestimators import available_if
diff --git a/imblearn/utils/_metadata_requests.py b/imblearn/utils/_metadata_requests.py
index aa6e024..8c7d280 100644
--- a/imblearn/utils/_metadata_requests.py
+++ b/imblearn/utils/_metadata_requests.py
@@ -104,7 +104,7 @@ if parse_version(sklearn_version.base_version) < parse_version('1.4'):
Whether metadata routing is enabled. If the config is not set, it
defaults to False.
"""
- pass
+ return get_config().get("enable_metadata_routing", False)
def _raise_for_params(params, owner, method):
"""Raise an error if metadata routing is not enabled and params are passed.
@@ -127,7 +127,13 @@ if parse_version(sklearn_version.base_version) < parse_version('1.4'):
ValueError
If metadata routing is not enabled and params are passed.
"""
- pass
+ if not _routing_enabled() and params:
+ raise ValueError(
+ f"Metadata routing is not enabled, but {owner.__class__.__name__}."
+ f"{method} was called with metadata: {list(params.keys())}. "
+ "To enable metadata routing, set "
+ "enable_metadata_routing=True using sklearn.set_config()."
+ )
def _raise_for_unsupported_routing(obj, method, **kwargs):
"""Raise when metadata routing is enabled and metadata is passed.
@@ -149,7 +155,12 @@ if parse_version(sklearn_version.base_version) < parse_version('1.4'):
**kwargs : dict
The metadata passed to the method.
"""
- pass
+ if _routing_enabled() and kwargs:
+ raise ValueError(
+ f"Metadata routing is enabled, but {obj.__class__.__name__} "
+ f"does not support metadata routing. {method} was called "
+ f"with unsupported metadata: {list(kwargs.keys())}."
+ )
class _RoutingNotSupportedMixin:
@@ -188,7 +199,7 @@ if parse_version(sklearn_version.base_version) < parse_version('1.4'):
result : bool
Whether the given item is a valid alias.
"""
- pass
+ return isinstance(item, str) and item.isidentifier() and item not in VALID_REQUEST_VALUES
def request_is_valid(item):
"""Check if an item is a valid request value (and not an alias).
@@ -203,7 +214,7 @@ if parse_version(sklearn_version.base_version) < parse_version('1.4'):
result : bool
Whether the given item is valid.
"""
- pass
+ return item in VALID_REQUEST_VALUES
class MethodMetadataRequest:
diff --git a/imblearn/utils/_param_validation.py b/imblearn/utils/_param_validation.py
index 47542c0..162567a 100644
--- a/imblearn/utils/_param_validation.py
+++ b/imblearn/utils/_param_validation.py
@@ -61,7 +61,34 @@ if sklearn_version < parse_version('1.4'):
caller_name : str
The name of the estimator or function or method that called this function.
"""
- pass
+    if parameter_constraints == "no_validation":
+        return
+
+    for param_name, constraints in parameter_constraints.items():
+        if param_name not in params:
+            continue
+        if constraints == "no_validation":
+            continue
+
+        param_value = params[param_name]
+
+        # normalise every declared constraint into a `_Constraint` object so
+        # that string specifications such as "boolean" are handled too
+        constraints = [make_constraint(constraint) for constraint in constraints]
+
+        for constraint in constraints:
+            if constraint.is_satisfied_by(param_value):
+                break
+        else:
+            constraints_str = " or ".join(str(c) for c in constraints)
+            raise InvalidParameterError(
+                f"The {param_name!r} parameter of {caller_name} must be "
+                f"{constraints_str}. Got {param_value!r} instead."
+            )
def make_constraint(constraint):
"""Convert the constraint into the appropriate Constraint object.
@@ -76,7 +103,30 @@ if sklearn_version < parse_version('1.4'):
constraint : instance of _Constraint
The converted constraint.
"""
- pass
+    if isinstance(constraint, _Constraint):
+        return constraint
+    if isinstance(constraint, str) and constraint == "array-like":
+        return _ArrayLikes()
+    if isinstance(constraint, str) and constraint == "sparse matrix":
+        return _SparseMatrices()
+    if isinstance(constraint, str) and constraint == "random_state":
+        return _RandomStates()
+    if isinstance(constraint, str) and constraint == "boolean":
+        return _Booleans()
+    if isinstance(constraint, str) and constraint == "verbose":
+        return _VerboseHelper()
+    if isinstance(constraint, str) and constraint == "cv_object":
+        return _CVObjects()
+    if isinstance(constraint, str) and constraint == "nan":
+        return _NanConstraint()
+    if constraint is None:
+        return _NoneConstraint()
+    # check `type` before `callable`: classes are callable too
+    if isinstance(constraint, type):
+        return _InstancesOf(constraint)
+    if callable(constraint):
+        return _Callables()
+    raise ValueError(f"Unknown constraint: {constraint}")
def validate_params(parameter_constraints, *, prefer_skip_nested_validation
):
@@ -110,7 +160,34 @@ if sklearn_version < parse_version('1.4'):
decorated_function : function or method
The decorated function.
"""
- pass
+ def decorator(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ global_skip_validation = get_config()["skip_parameter_validation"]
+ if global_skip_validation:
+ return func(*args, **kwargs)
+
+ sig = signature(func)
+ bound_arguments = sig.bind(*args, **kwargs)
+ bound_arguments.apply_defaults()
+
+ # only validate the arguments that have a declared constraint
+ params_to_validate = {
+ name: value
+ for name, value in bound_arguments.arguments.items()
+ if name in parameter_constraints
+ }
+ validate_parameter_constraints(
+ parameter_constraints, params_to_validate, func.__qualname__
+ )
+
+ with config_context(
+ skip_parameter_validation=prefer_skip_nested_validation
+ ):
+ return func(*args, **kwargs)
+
+ return wrapper
+ return decorator
class RealNotInt(Real):
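A quick sketch of the decorator in use. `make_knn` is illustrative only; `Interval`, `InvalidParameterError`, and `validate_params` are assumed importable from the module patched above:

from numbers import Integral

from imblearn.utils._param_validation import (
    Interval, InvalidParameterError, validate_params)

@validate_params(
    {"n_neighbors": [Interval(Integral, 1, None, closed="left")]},
    prefer_skip_nested_validation=True,
)
def make_knn(n_neighbors=5):  # hypothetical function, for illustration only
    return n_neighbors

make_knn(n_neighbors=3)  # 3 lies in [1, inf), so validation passes
try:
    make_knn(n_neighbors=0)
except InvalidParameterError as exc:
    print(exc)  # "The 'n_neighbors' parameter of make_knn must be ..."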
diff --git a/imblearn/utils/_show_versions.py b/imblearn/utils/_show_versions.py
index e6bd42b..2fdc2a9 100644
--- a/imblearn/utils/_show_versions.py
+++ b/imblearn/utils/_show_versions.py
@@ -14,7 +14,30 @@ def _get_deps_info():
deps_info: dict
version information on relevant Python libraries
"""
- pass
+ import importlib
+ import sys
+ deps = [
+ "pip",
+ "setuptools",
+ "sklearn",
+ "numpy",
+ "scipy",
+ "pandas",
+ "joblib",
+ "threadpoolctl",
+ ]
+ deps_info = {}
+ for modname in deps:
+ try:
+ if modname in sys.modules:
+ mod = sys.modules[modname]
+ else:
+ mod = importlib.import_module(modname)
+ ver = getattr(mod, "__version__", "unknown version")
+ deps_info[modname] = ver
+ except ImportError:
+ deps_info[modname] = "not installed"
+ return deps_info
def show_versions(github=False):
@@ -27,4 +50,24 @@ def show_versions(github=False):
github : bool,
If true, wrap system info with GitHub markup.
"""
- pass
+ import sys
+ import platform
+
+ sys_info = {
+ "python": sys.version.replace("\n", " "),
+ "executable": sys.executable,
+ "machine": platform.platform(),
+ }
+
+ deps_info = _get_deps_info()
+
+ if github:
+ sys_info = "**System Information**\n\n" + "\n".join(f"* {k}: {v}" for k, v in sys_info.items())
+ deps_info = "**Python Dependencies**\n\n" + "\n".join(f"* {k}: {v}" for k, v in deps_info.items())
+ else:
+ sys_info = "System Information\n" + "\n".join(f"{k:<10}: {v}" for k, v in sys_info.items())
+ deps_info = "Python Dependencies\n" + "\n".join(f"{k:<10}: {v}" for k, v in deps_info.items())
+
+ print(sys_info)
+ print("\nimlearn version:", __version__)
+ print("\n" + deps_info)
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
index 38a7408..c54011f 100644
--- a/imblearn/utils/_validation.py
+++ b/imblearn/utils/_validation.py
@@ -41,7 +41,8 @@ def _is_neighbors_object(estimator):
is_neighbors_object : bool
True if the estimator exposes a KNeighborsMixin-like API.
"""
- pass
+ return (hasattr(estimator, 'kneighbors') and
+ hasattr(estimator, 'kneighbors_graph'))
def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
@@ -68,7 +69,13 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
nn_object : KNeighborsMixin
The k-NN object.
"""
- pass
+ if isinstance(nn_object, Integral):
+ return NearestNeighbors(n_neighbors=nn_object + additional_neighbor)
+ elif _is_neighbors_object(nn_object):
+ return clone(nn_object)
+ else:
+ raise ValueError(f"{nn_name} has to be a int or an object with a "
+ "KNeighborsMixin-like API.")
def check_target_type(y, indicate_one_vs_all=False):
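Both accepted call forms of check_neighbors_object, as a short sketch (the public import path is an assumption):

from sklearn.neighbors import NearestNeighbors

from imblearn.utils import check_neighbors_object

# an int is promoted to NearestNeighbors, with the extra neighbour added
nn = check_neighbors_object("n_neighbors", 3, additional_neighbor=1)
assert nn.n_neighbors == 4

# a KNeighborsMixin-like object is cloned, so the caller's instance stays untouched
template = NearestNeighbors(n_neighbors=5)
assert check_neighbors_object("n_neighbors", template) is not template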
@@ -94,58 +101,142 @@ def check_target_type(y, indicate_one_vs_all=False):
Indicate if the target was originally encoded in a one-vs-all fashion.
Only returned if ``indicate_multilabel=True``.
"""
- pass
+ y_type = type_of_target(y)
+ if y_type not in TARGET_KIND:
+ raise ValueError(f"'y' should be one of {TARGET_KIND}. Got '{y_type}' instead.")
+
+ if y_type == 'multilabel-indicator':
+ if np.any(y.sum(axis=1) > 1):
+ raise ValueError("'y' has more than one label per sample; only "
+ "one-vs-all (binarized) encodings are supported.")
+ # fold the one-vs-all encoding back into a 1D target
+ y = y.argmax(axis=1)
+ else:
+ y = column_or_1d(y)
+
+ if indicate_one_vs_all:
+ return y, y_type == 'multilabel-indicator'
+ return y
def _sampling_strategy_all(y, sampling_type):
"""Returns sampling target by targeting all classes."""
- pass
+ counts = np.bincount(y)
+ if sampling_type == 'over-sampling':
+ n_target = counts.max() # raise every class to the majority count
+ elif sampling_type == 'under-sampling':
+ n_target = counts[counts > 0].min() # cut every class to the minority count
+ else: # clean-sampling keeps the observed counts as targets
+ return OrderedDict((c, int(counts[c])) for c in np.unique(y))
+ return OrderedDict((int(c), int(n_target)) for c in np.unique(y))
def _sampling_strategy_majority(y, sampling_type):
"""Returns sampling target by targeting the majority class only."""
- pass
+ if sampling_type == 'over-sampling':
+ raise ValueError("'majority' for over-sampling is not a valid option.")
+ counts = np.bincount(y)
+ target_stats = OrderedDict()
+ # under-sample the majority class down to the minority count
+ target_stats[np.argmax(counts)] = int(counts[counts > 0].min())
+ return target_stats
def _sampling_strategy_not_majority(y, sampling_type):
"""Returns sampling target by targeting all classes but not the
majority."""
- pass
+ target_stats = OrderedDict()
+ counts = np.bincount(y)
+ class_majority = np.argmax(counts)
+ for class_sample, n_sample in enumerate(counts):
+ if class_sample == class_majority or n_sample == 0:
+ continue
+ if sampling_type == 'over-sampling':
+ n_sample = counts.max() # raise each class to the majority count
+ elif sampling_type == 'under-sampling':
+ n_sample = counts[counts > 0].min() # cut each class to the minority count
+ target_stats[class_sample] = int(n_sample)
+ return target_stats
def _sampling_strategy_not_minority(y, sampling_type):
"""Returns sampling target by targeting all classes but not the
minority."""
- pass
+ target_stats = OrderedDict()
+ counts = np.bincount(y)
+ class_minority = np.argmin(counts)
+ for class_sample, n_sample in enumerate(counts):
+ if class_sample == class_minority or n_sample == 0:
+ continue
+ if sampling_type == 'over-sampling':
+ n_sample = counts.max() # raise each class to the majority count
+ elif sampling_type == 'under-sampling':
+ n_sample = counts[class_minority] # cut each class to the minority count
+ target_stats[class_sample] = int(n_sample)
+ return target_stats
def _sampling_strategy_minority(y, sampling_type):
"""Returns sampling target by targeting the minority class only."""
- pass
+ if sampling_type == 'under-sampling':
+ raise ValueError("'minority' for under-sampling is not a valid option.")
+ counts = np.bincount(y)
+ target_stats = OrderedDict()
+ # over-sample the minority class up to the majority count
+ target_stats[np.argmin(counts)] = int(counts.max())
+ return target_stats
+ return target_stats
def _sampling_strategy_auto(y, sampling_type):
"""Returns sampling target auto for over-sampling and not-minority for
under-sampling."""
- pass
+ if sampling_type == 'over-sampling':
+ return _sampling_strategy_not_majority(y, sampling_type)
+ # under-sampling and clean-sampling both target all classes but the minority
+ return _sampling_strategy_not_minority(y, sampling_type)
def _sampling_strategy_dict(sampling_strategy, y, sampling_type):
"""Returns sampling target by converting the dictionary depending of the
sampling."""
- pass
+ target_stats = OrderedDict()
+ for class_sample, n_samples in sampling_strategy.items():
+ if n_samples < 0:
+ raise ValueError("The number of samples in a class cannot be negative."
+ " Got {} in class {}.".format(n_samples, class_sample))
+ if n_samples != int(n_samples):
+ raise ValueError("The number of samples must be an integer."
+ " Got {} in class {}.".format(n_samples, class_sample))
+ if class_sample not in np.unique(y):
+ raise ValueError("The class {} is not present in the data.".format(class_sample))
+ target_stats[class_sample] = n_samples
+ return target_stats
def _sampling_strategy_list(sampling_strategy, y, sampling_type):
"""With cleaning methods, sampling_strategy can be a list to target the
class of interest."""
- pass
+ if sampling_type != 'clean-sampling':
+ raise ValueError("'list' is not a valid option for {}.".format(sampling_type))
+ target_stats = OrderedDict()
+ for class_sample in sampling_strategy:
+ if class_sample not in np.unique(y):
+ raise ValueError("The class {} is not present in the data.".format(class_sample))
+ target_stats[class_sample] = _num_samples(y[y == class_sample])
+ return target_stats
def _sampling_strategy_float(sampling_strategy, y, sampling_type):
"""Take a proportion of the majority (over-sampling) or minority
(under-sampling) class in binary classification."""
- pass
+ if sampling_strategy <= 0 or sampling_strategy > 1:
+ raise ValueError("When 'sampling_strategy' is a float, it should be "
+ "in the (0, 1] range. Got {} instead.".format(sampling_strategy))
+
+ classes, counts = np.unique(y, return_counts=True)
+ if len(classes) != 2:
+ raise ValueError("'sampling_strategy' can be a float only for "
+ "binary classification.")
+ # np.unique sorts by label, so pick minority/majority by count
+ class_minority = classes[np.argmin(counts)]
+ class_majority = classes[np.argmax(counts)]
+ n_minority, n_majority = int(counts.min()), int(counts.max())
+
+ target_stats = OrderedDict()
+ if sampling_type == 'over-sampling':
+ # sampling_strategy = n_minority_target / n_majority
+ target_stats[class_minority] = int(n_majority * sampling_strategy)
+ elif sampling_type == 'under-sampling':
+ # sampling_strategy = n_minority / n_majority_target
+ target_stats[class_majority] = int(n_minority / sampling_strategy)
+ else:
+ raise ValueError("'sampling_strategy' as a float is not supported "
+ "for {}.".format(sampling_type))
+
+ return target_stats
def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs):
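A worked example of the float branch under the target-count convention used in this patch: with 10 minority and 100 majority samples, a ratio of 0.5 asks for int(100 * 0.5) = 50 minority samples when over-sampling and int(10 / 0.5) = 20 majority samples when under-sampling. The function is module-private, so the import below is for illustration only:

import numpy as np
from imblearn.utils._validation import _sampling_strategy_float

y = np.array([0] * 10 + [1] * 100)  # class 0 minority, class 1 majority
print(_sampling_strategy_float(0.5, y, 'over-sampling'))   # ~ {0: 50}
print(_sampling_strategy_float(0.5, y, 'under-sampling'))  # ~ {1: 20}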
diff --git a/imblearn/utils/deprecation.py b/imblearn/utils/deprecation.py
index a630c60..9724439 100644
--- a/imblearn/utils/deprecation.py
+++ b/imblearn/utils/deprecation.py
@@ -22,4 +22,20 @@ def deprecate_parameter(sampler, version_deprecation, param_deprecated,
The parameter used instead of the deprecated parameter. By default, no
parameter is expected.
"""
- pass
+ if hasattr(sampler, param_deprecated):
+ # Check if the deprecated parameter is set
+ if getattr(sampler, param_deprecated) is not None:
+ # Construct the deprecation message
+ msg = (
+ f"The parameter '{param_deprecated}' is deprecated since version "
+ f"{version_deprecation} and will be removed in a future version."
+ )
+ if new_param:
+ msg += f" Use '{new_param}' instead."
+
+ # Issue the deprecation warning
+ warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+
+ # If a new parameter is specified, set its value to the deprecated parameter's value
+ if new_param and hasattr(sampler, new_param):
+ setattr(sampler, new_param, getattr(sampler, param_deprecated))
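A sketch of the deprecation helper in action; DummySampler is a hypothetical stand-in for an estimator carrying both the old and the new parameter:

import warnings
from imblearn.utils.deprecation import deprecate_parameter

class DummySampler:  # hypothetical estimator, for illustration only
    def __init__(self):
        self.old_param = 5
        self.new_param = None

sampler = DummySampler()
with warnings.catch_warnings(record=True) as records:
    warnings.simplefilter("always")
    deprecate_parameter(sampler, "0.12", "old_param", new_param="new_param")

assert any(issubclass(r.category, DeprecationWarning) for r in records)
assert sampler.new_param == 5  # value forwarded to the replacement parameter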
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
index d3aea67..a4bd429 100644
--- a/imblearn/utils/estimator_checks.py
+++ b/imblearn/utils/estimator_checks.py
@@ -59,4 +59,14 @@ def parametrize_with_checks(estimators):
... def test_sklearn_compatible_estimator(estimator, check):
... check(estimator)
"""
- pass
+ import pytest
+ from sklearn.utils.estimator_checks import check_estimator
+
+ def checks_generator():
+ for estimator in estimators:
+ name = type(estimator).__name__
+ # with `generate_only=True`, sklearn yields (estimator, check) pairs
+ for est, check in check_estimator(estimator, generate_only=True):
+ check_name = check.func.__name__ if hasattr(check, 'func') else check.__name__
+ yield pytest.param(est, check, id=f"{name}-{check_name}")
+
+ return pytest.mark.parametrize("estimator,check", checks_generator())
diff --git a/imblearn/utils/fixes.py b/imblearn/utils/fixes.py
index 801067f..ec25066 100644
--- a/imblearn/utils/fixes.py
+++ b/imblearn/utils/fixes.py
@@ -21,7 +21,7 @@ else:
def _is_arraylike_not_scalar(array):
"""Return True if array is array-like and not a scalar"""
- pass
+ return _is_arraylike(array) and not np.isscalar(array)
if sklearn_version < parse_version('1.3'):
def _fit_context(*, prefer_skip_nested_validation):
@@ -47,7 +47,13 @@ if sklearn_version < parse_version('1.3'):
decorated_fit : method
The decorated fit method.
"""
- pass
+ def decorator(fit_method):
+ @functools.wraps(fit_method)
+ def wrapper(estimator, *args, **kwargs):
+ global_skip_validation = get_config()["skip_parameter_validation"]
+ if not global_skip_validation:
+ # assumes the estimator exposes `_validate_params()`,
+ # e.g. through `_ParamsValidationMixin`
+ estimator._validate_params()
+ with config_context(skip_parameter_validation=(
+ prefer_skip_nested_validation or global_skip_validation)):
+ return fit_method(estimator, *args, **kwargs)
+ return wrapper
+ return decorator
else:
from sklearn.base import _fit_context
if sklearn_version < parse_version('1.3'):
@@ -76,7 +82,14 @@ if sklearn_version < parse_version('1.3'):
fitted : bool
Whether the estimator is fitted.
"""
- pass
+ if attributes is None:
+ attributes = [attr for attr in vars(estimator)
+ if attr.endswith("_") and not attr.startswith("__")]
+
+ if not attributes:
+ raise ValueError("No valid attributes to check if estimator is fitted.")
+
+ return all_or_any([hasattr(estimator, attr) for attr in attributes])
else:
from sklearn.utils.validation import _is_fitted
try:
@@ -85,4 +98,8 @@ except ImportError:
def _is_pandas_df(X):
"""Return True if the X is a pandas dataframe."""
- pass
+ try:
+ import pandas as pd
+ return isinstance(X, pd.DataFrame)
+ except ImportError:
+ return False
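A sketch of the back-ported `_fit_context` in use. This assumes the estimator exposes `_validate_params()` through `_ParamsValidationMixin`, and that `Interval` and `InvalidParameterError` are importable as above; `Demo` is illustrative only:

from numbers import Real

from sklearn.base import BaseEstimator

from imblearn.base import _ParamsValidationMixin
from imblearn.utils._param_validation import Interval, InvalidParameterError
from imblearn.utils.fixes import _fit_context

class Demo(_ParamsValidationMixin, BaseEstimator):  # illustrative only
    _parameter_constraints = {"alpha": [Interval(Real, 0, None, closed="left")]}

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X=None, y=None):
        # constraints were checked on entry; nested estimator calls now run
        # with skip_parameter_validation=True
        return self

try:
    Demo(alpha=-1.0).fit()
except InvalidParameterError as exc:
    print(exc)  # rejected before the fit body ever runs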
diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py
index aa344b1..9044f98 100644
--- a/imblearn/utils/testing.py
+++ b/imblearn/utils/testing.py
@@ -35,7 +35,43 @@ def all_estimators(type_filter=None):
List of (name, class), where ``name`` is the class name as string
and ``class`` is the actual type of the class.
"""
- pass
+ # Get the imblearn package
+ import imblearn
+
+ def is_abstract(c):
+ if inspect.isabstract(c):
+ return True
+ if hasattr(c, '__abstractmethods__'):
+ return bool(getattr(c, '__abstractmethods__'))
+ return False
+
+ all_classes = []
+ modules_to_ignore = {'tests', 'test', 'setup', 'conftest'}
+
+ for importer, modname, ispkg in pkgutil.walk_packages(path=imblearn.__path__,
+ prefix='imblearn.'):
+ mod_parts = modname.split('.')
+ if any(part in modules_to_ignore for part in mod_parts):
+ continue
+
+ module = import_module(modname)
+ classes = inspect.getmembers(module, inspect.isclass)
+ classes = [(name, est_cls) for name, est_cls in classes
+ if (issubclass(est_cls, BaseEstimator) and
+ est_cls.__module__ == modname and
+ not is_abstract(est_cls))]
+ all_classes.extend(classes)
+
+ all_classes = sorted(set(all_classes), key=itemgetter(0))
+
+ if type_filter is not None:
+ if not isinstance(type_filter, list):
+ type_filter = [type_filter]
+ # map the string filters onto mixins; 'sampler' is the only kind here
+ from imblearn.base import SamplerMixin
+ filters = {'sampler': SamplerMixin}
+ all_classes = [(name, est_cls) for name, est_cls in all_classes
+ if any(issubclass(est_cls, filters[f])
+ for f in type_filter if f in filters)]
+
+ return all_classes
class _CustomNearestNeighbors(BaseEstimator):
@@ -48,10 +84,38 @@ class _CustomNearestNeighbors(BaseEstimator):
self.n_neighbors = n_neighbors
self.metric = metric
- def kneighbors_graph(X=None, n_neighbors=None, mode='connectivity'):
+ def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'):
"""This method is not used within imblearn but it is required for
duck-typing."""
- pass
+ if X is None:
+ raise ValueError("X must be provided")
+ if n_neighbors is None:
+ n_neighbors = self.n_neighbors
+
+ n_samples = X.shape[0]
+ if sparse.issparse(X):
+ X = X.toarray()
+
+ kdtree = KDTree(X)
+ distances, indices = kdtree.query(X, k=n_neighbors + 1)
+
+ # Remove the first column (self-connections)
+ indices = indices[:, 1:]
+ distances = distances[:, 1:]
+
+ if mode == 'connectivity':
+ graph = sparse.lil_matrix((n_samples, n_samples), dtype=int)
+ graph[np.repeat(np.arange(n_samples), n_neighbors),
+ indices.ravel()] = 1
+ elif mode == 'distance':
+ graph = sparse.lil_matrix((n_samples, n_samples), dtype=float)
+ graph[np.repeat(np.arange(n_samples), n_neighbors),
+ indices.ravel()] = distances.ravel()
+ else:
+ raise ValueError("Unsupported mode, must be 'connectivity' "
+ "or 'distance' but got %s instead" % mode)
+
+ return graph.tocsr()
class _CustomClusterer(BaseEstimator):
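The duck-typed graph builder above on a toy dataset, as a sanity sketch (import path per this repository):

import numpy as np
from imblearn.utils.testing import _CustomNearestNeighbors

X = np.array([[0.0], [0.1], [5.0], [5.1]])
graph = _CustomNearestNeighbors(n_neighbors=1).kneighbors_graph(X)
# each row links a point to its single nearest non-self neighbour
print(graph.toarray())
# [[0 1 0 0]
#  [1 0 0 0]
#  [0 0 0 1]
#  [0 0 1 0]]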
diff --git a/imblearn/utils/tests/test_docstring.py b/imblearn/utils/tests/test_docstring.py
index f377d75..aed8ed4 100644
--- a/imblearn/utils/tests/test_docstring.py
+++ b/imblearn/utils/tests/test_docstring.py
@@ -11,7 +11,7 @@ def _dedent_docstring(docstring):
xref: https://github.com/python/cpython/issues/81283
"""
- pass
+ return textwrap.dedent(docstring)
func_docstring = """A function.
@@ -72,4 +72,16 @@ def test_docstring_with_python_OO():
Non-regression test for:
https://github.com/scikit-learn-contrib/imbalanced-learn/issues/945
"""
- pass
+ import warnings
+
+ # Simulate Python -OO by setting __doc__ to None
+ original_doc = func.__doc__
+ func.__doc__ = None
+
+ try:
+ # Decorating a docstring-less function should neither raise nor warn;
+ # `pytest.warns(None)` is deprecated, so record warnings manually.
+ with warnings.catch_warnings(record=True) as record:
+ warnings.simplefilter("always")
+ Substitution(param_1="Parameter 1", param_2="Parameter 2")(func)
+ assert len(record) == 0, "Unexpected warning raised"
+ finally:
+ # Restore the original docstring
+ func.__doc__ = original_doc
diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py
index e93b1c3..df71c2c 100644
--- a/imblearn/utils/tests/test_estimator_checks.py
+++ b/imblearn/utils/tests/test_estimator_checks.py
@@ -12,27 +12,69 @@ class BaseBadSampler(BaseEstimator):
"""Sampler without inputs checking."""
_sampling_type = 'bypass'
+ def fit_resample(self, X, y):
+ return X, y
+
class SamplerSingleClass(BaseSampler):
"""Sampler that would sample even with a single class."""
_sampling_type = 'bypass'
+ def fit_resample(self, X, y):
+ return X, y
+
class NotFittedSampler(BaseBadSampler):
"""Sampler without target checking."""
+ def fit_resample(self, X, y):
+ self.fitted_ = True
+ return X, y
+
class NoAcceptingSparseSampler(BaseBadSampler):
"""Sampler which does not accept sparse matrix."""
+ def fit_resample(self, X, y):
+ if not isinstance(X, np.ndarray):
+ raise TypeError("A numpy array is required. Got %s" % type(X))
+ return X, y
+
class NotPreservingDtypeSampler(BaseSampler):
_sampling_type = 'bypass'
_parameter_constraints: dict = {'sampling_strategy': 'no_validation'}
+ def fit_resample(self, X, y):
+ return X.astype(float), y
+
class IndicesSampler(BaseOverSampler):
- pass
+ def _fit_resample(self, X, y):
+ check_classification_targets(y)
+ self.sampling_strategy_ = self.sampling_strategy
+ return self._sample(X, y)
+
+ def _sample(self, X, y):
+ n_samples, _ = X.shape
+ original_indices = np.arange(n_samples)
+ # default target: bring every class up to the current majority count
+ default_target = np.bincount(y).max()
+
+ indices_parts, y_parts = [original_indices], [y]
+ for class_sample in np.unique(y):
+ if self.sampling_strategy_ == 'auto':
+ n_samples_class = default_target
+ else:
+ n_samples_class = self.sampling_strategy_[class_sample]
+
+ class_indices = original_indices[y == class_sample]
+ n_extra = n_samples_class - len(class_indices)
+ if n_extra > 0:
+ # draw only the shortfall so indices and labels stay aligned
+ extra = np.random.choice(class_indices, size=n_extra, replace=True)
+ indices_parts.append(extra)
+ y_parts.append(np.full(n_extra, class_sample, dtype=y.dtype))
+
+ indices = np.concatenate(indices_parts)
+ return X[indices], np.concatenate(y_parts)
mapping_estimator_error = {'BaseBadSampler': (AssertionError,
diff --git a/imblearn/utils/tests/test_param_validation.py b/imblearn/utils/tests/test_param_validation.py
index 8b0709d..70d370e 100644
--- a/imblearn/utils/tests/test_param_validation.py
+++ b/imblearn/utils/tests/test_param_validation.py
@@ -47,7 +47,36 @@ class _Estimator(_ParamsValidationMixin, BaseEstimator):
@pytest.mark.parametrize('interval_type', [Integral, Real])
def test_interval_range(interval_type):
"""Check the range of values depending on closed."""
- pass
+ left, right = 0, 10
+
+ # Test closed='left'
+ interval = Interval(interval_type, left, right, closed='left')
+ assert interval.is_satisfied_by(left)
+ assert not interval.is_satisfied_by(right)
+
+ # Test closed='right'
+ interval = Interval(interval_type, left, right, closed='right')
+ assert not interval.is_satisfied_by(left)
+ assert interval.is_satisfied_by(right)
+
+ # Test closed='both'
+ interval = Interval(interval_type, left, right, closed='both')
+ assert interval.is_satisfied_by(left)
+ assert interval.is_satisfied_by(right)
+
+ # Test closed='neither'
+ interval = Interval(interval_type, left, right, closed='neither')
+ assert not interval.is_satisfied_by(left)
+ assert not interval.is_satisfied_by(right)
+
+ # Test values inside the interval
+ middle = (left + right) / 2 if interval_type is Real else (left + right) // 2
+ assert all(interval.is_satisfied_by(middle) for interval in [
+ Interval(interval_type, left, right, closed='left'),
+ Interval(interval_type, left, right, closed='right'),
+ Interval(interval_type, left, right, closed='both'),
+ Interval(interval_type, left, right, closed='neither')
+ ])
@pytest.mark.parametrize('interval_type', [Integral, Real])
@@ -56,7 +85,20 @@ def test_interval_large_integers(interval_type):
non-regression test for #26648.
"""
- pass
+ large_int = 2**63 - 1 # Maximum value for 64-bit signed integer
+
+ # Test with large integers as bounds
+ interval = Interval(interval_type, -large_int, large_int, closed='both')
+ assert interval.is_satisfied_by(0)
+ assert interval.is_satisfied_by(-large_int)
+ assert interval.is_satisfied_by(large_int)
+ assert not interval.is_satisfied_by(-large_int - 1)
+ assert not interval.is_satisfied_by(large_int + 1)
+
+ # Test with large integer as a value
+ interval = Interval(interval_type, -10, 10, closed='both')
+ assert not interval.is_satisfied_by(large_int)
+ assert not interval.is_satisfied_by(-large_int)
def test_interval_inf_in_bounds():
@@ -64,7 +106,30 @@ def test_interval_inf_in_bounds():
Only valid for real intervals.
"""
- pass
+ import numpy as np
+
+ # an infinite bound is included when its side of the interval is closed
+ interval = Interval(Real, 0, None, closed='right')
+ assert interval.is_satisfied_by(np.inf)
+
+ interval = Interval(Real, None, 0, closed='left')
+ assert interval.is_satisfied_by(-np.inf)
+
+ interval = Interval(Real, None, None, closed='both')
+ assert interval.is_satisfied_by(np.inf)
+ assert interval.is_satisfied_by(-np.inf)
+
+ # ... and excluded when the interval is open on that side
+ interval = Interval(Real, None, None, closed='neither')
+ assert not interval.is_satisfied_by(np.inf)
+ assert not interval.is_satisfied_by(-np.inf)
@pytest.mark.parametrize('interval', [Interval(Real, 0, 1, closed='left'),
diff --git a/imblearn/utils/tests/test_show_versions.py b/imblearn/utils/tests/test_show_versions.py
index ca6a29e..979b929 100644
--- a/imblearn/utils/tests/test_show_versions.py
+++ b/imblearn/utils/tests/test_show_versions.py
@@ -1,2 +1,29 @@
"""Test for the show_versions helper. Based on the sklearn tests."""
+import pytest
from imblearn.utils._show_versions import _get_deps_info, show_versions
+
+
+def test_get_deps_info():
+ deps_info = _get_deps_info()
+ assert isinstance(deps_info, dict)
+ assert "pip" in deps_info
+ assert "sklearn" in deps_info
+ assert "numpy" in deps_info
+ assert "scipy" in deps_info
+ assert "pandas" in deps_info
+ assert "joblib" in deps_info
+
+
+@pytest.mark.parametrize("github", [True, False])
+def test_show_versions(capsys, github):
+ show_versions(github=github)
+ out, err = capsys.readouterr()
+ assert "System Information" in out
+ assert "Python Dependencies" in out
+ assert "imlearn version" in out
+ if github:
+ assert "**System Information**" in out
+ assert "**Python Dependencies**" in out
+ else:
+ assert "System Information\n" in out
+ assert "Python Dependencies\n" in out
diff --git a/imblearn/utils/tests/test_testing.py b/imblearn/utils/tests/test_testing.py
index 421be2b..b917e42 100644
--- a/imblearn/utils/tests/test_testing.py
+++ b/imblearn/utils/tests/test_testing.py
@@ -9,4 +9,16 @@ from imblearn.utils.testing import _CustomNearestNeighbors, all_estimators
def test_custom_nearest_neighbors():
"""Check that our custom nearest neighbors can be used for our internal
duck-typing."""
- pass
+ from sklearn.neighbors._base import KNeighborsMixin
+
+ from imblearn.base import SamplerMixin
+
+ custom_nn = _CustomNearestNeighbors()
+
+ # the point of the duck-typing is that the class is *not* a KNeighborsMixin
+ # subclass (nor a sampler) yet still exposes the expected methods
+ assert not isinstance(custom_nn, KNeighborsMixin)
+ assert not isinstance(custom_nn, SamplerMixin)
+
+ for method in ('kneighbors', 'kneighbors_graph'):
+ assert hasattr(custom_nn, method), f"missing '{method}' method"