diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py
index 8101f52..291c058 100644
--- a/imblearn/over_sampling/_smote/base.py
+++ b/imblearn/over_sampling/_smote/base.py
@@ -39,8 +39,8 @@ class BaseSMOTE(BaseOverSampler):
"""Check the NN estimators shared across the different SMOTE
algorithms.
"""
- pass
-
+        self.nn_k_ = check_neighbors_object(
+            'k_neighbors', self.k_neighbors, additional_neighbor=1
+        )
+        self.nn_k_.set_params(**{'n_jobs': self.n_jobs})
+
def _make_samples(self, X, y_dtype, y_type, nn_data, nn_num, n_samples, step_size=1.0, y=None):
"""A support function that returns artificial samples constructed along
the line connecting nearest neighbours.
@@ -82,7 +82,15 @@ class BaseSMOTE(BaseOverSampler):
y_new : ndarray of shape (n_samples_new,)
Target values for synthetic samples.
"""
- pass
+        random_state = check_random_state(self.random_state)
+        # Pick, for every synthetic sample, a (sample, neighbour) pair from
+        # the flattened neighbour matrix and a random interpolation step.
+        samples_indices = random_state.randint(
+            low=0, high=nn_num.size, size=n_samples
+        )
+        steps = step_size * random_state.uniform(size=n_samples)
+        rows = np.floor_divide(samples_indices, nn_num.shape[1])
+        cols = np.mod(samples_indices, nn_num.shape[1])
+
+        X_new, y_new = self._generate_samples(
+            X, nn_data, nn_num, rows, cols, steps, y_type, y
+        )
+
+        return X_new, y_new
def _generate_samples(self, X, nn_data, nn_num, rows, cols, steps, y_type=None, y=None):
"""Generate a synthetic sample.
@@ -131,8 +139,23 @@ class BaseSMOTE(BaseOverSampler):
-------
X_new : {ndarray, sparse matrix} of shape (n_samples, n_features)
Synthetically generated samples.
+
+        y_new : ndarray of shape (n_samples,)
+            Target values for synthetic samples.
"""
- pass
+        n_features = X.shape[1]
+        X_new = np.zeros((steps.shape[0], n_features), dtype=X.dtype)
+
+        if sparse.issparse(X):
+            for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+                # Densify only the two rows involved in the interpolation.
+                sample = X[row].toarray().ravel()
+                neighbour = nn_data[nn_num[row, col]].toarray().ravel()
+                X_new[i] = sample + step * (neighbour - sample)
+            # Hand back the same container type as the input so that callers
+            # can stack the result with scipy.sparse.vstack.
+            X_new = sparse.csr_matrix(X_new)
+        else:
+            for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
+                X_new[i] = X[row] + step * (nn_data[nn_num[row, col]] - X[row])
+
+        y_new = np.full(steps.shape[0], fill_value=y_type)
+
+        return X_new, y_new
def _in_danger_noise(self, nn_estimator, samples, target_class, y, kind='danger'):
"""Estimate if a set of sample are in danger or noise.
@@ -166,7 +189,17 @@ class BaseSMOTE(BaseOverSampler):
output : ndarray of shape (n_samples,)
A boolean array where True refer to samples in danger or noise.
"""
- pass
+        x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:]
+        nn_label = (y[x] != target_class).astype(int).sum(axis=1)
+
+        if kind == 'danger':
+            # Samples are in danger if some, but not all, neighbours belong
+            # to a different class.
+            return (nn_label > 0) & (nn_label < x.shape[1])
+        elif kind == 'noise':
+            # Samples are noise if all neighbours belong to a different class.
+            return nn_label == x.shape[1]
+        else:
+            raise ValueError("'kind' should be either 'danger' or 'noise'.")
@Substitution(sampling_strategy=BaseOverSampler._sampling_strategy_docstring, n_jobs=_n_jobs_docstring, random_state=_random_state_docstring)
class SMOTE(BaseSMOTE):
@@ -581,4 +614,4 @@ class SMOTEN(SMOTE):
def _validate_estimator(self):
"""Force to use precomputed distance matrix."""
- pass
\ No newline at end of file
+        super()._validate_estimator()
+        # Categorical (SMOTEN) samples are compared through a precomputed
+        # distance matrix, as in upstream imbalanced-learn.
+        self.nn_k_.set_params(metric='precomputed')
diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py
index 18feb55..a507341 100644
--- a/imblearn/over_sampling/_smote/tests/test_smote.py
+++ b/imblearn/over_sampling/_smote/tests/test_smote.py
@@ -6,4 +6,44 @@ from imblearn.over_sampling import SMOTE
RND_SEED = 0
X = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [1.25192108, -0.22367336], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [-0.28162401, -2.10400981], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [0.70472253, -0.73309052], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [0.88407872, 0.35454207], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], [0.9281014, 0.53085498], [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], [0.08711622, 0.93259929], [1.70580611, -0.11219234]])
Y = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0])
-R_TOL = 0.0001
\ No newline at end of file
+R_TOL = 0.0001
+
+# Imports needed by the tests added below.
+import pytest
+from sklearn.neighbors import NearestNeighbors
+
+def test_generate_samples():
+    smote = SMOTE(random_state=RND_SEED)
+    smote._validate_estimator()
+    # One nearest-neighbour index per sample; interpolate halfway towards it.
+    nn_num = np.arange(len(X)).reshape(-1, 1)
+    rows, cols = np.array([0, 1]), np.array([0, 0])
+    steps = np.array([0.5, 0.5])
+
+    X_new, y_new = smote._generate_samples(X, X, nn_num, rows, cols, steps, y_type=0)
+
+    assert X_new.shape == (2, X.shape[1])
+    assert y_new.shape[0] == 2
+    assert np.all(y_new == 0)
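+
+# Sketch of the same interpolation exercised through the sparse branch of
+# _generate_samples; the output may come back as a sparse matrix, so it is
+# densified before being compared against the dense result.
+def test_generate_samples_sparse():
+    from scipy import sparse
+
+    smote = SMOTE(random_state=RND_SEED)
+    nn_num = np.arange(len(X)).reshape(-1, 1)
+    rows, cols = np.array([0, 1]), np.array([0, 0])
+    steps = np.array([0.5, 0.5])
+
+    X_sp = sparse.csr_matrix(X)
+    X_new_sp, _ = smote._generate_samples(X_sp, X_sp, nn_num, rows, cols, steps, y_type=0)
+    X_new, _ = smote._generate_samples(X, X, nn_num, rows, cols, steps, y_type=0)
+
+    if sparse.issparse(X_new_sp):
+        X_new_sp = X_new_sp.toarray()
+    np.testing.assert_allclose(X_new_sp, X_new, rtol=R_TOL)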
+
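+# Sketch of a check for _validate_estimator, assuming the validated
+# neighbours object is exposed as ``nn_k_`` (the attribute name used by
+# upstream imbalanced-learn).
+def test_validate_estimator_sets_nn():
+    smote = SMOTE(k_neighbors=5, random_state=RND_SEED)
+    smote._validate_estimator()
+
+    assert isinstance(smote.nn_k_, NearestNeighbors)
+    # One extra neighbour is requested because each sample is returned as
+    # its own nearest neighbour by kneighbors.
+    assert smote.nn_k_.n_neighbors == 6
+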
+def test_in_danger_noise():
+ smote = SMOTE(random_state=RND_SEED)
+ nn_estimator = NearestNeighbors(n_neighbors=6)
+ nn_estimator.fit(X)
+
+ # Test 'danger' classification
+ danger_samples = smote._in_danger_noise(nn_estimator, X[Y==0], 0, Y, kind='danger')
+ assert isinstance(danger_samples, np.ndarray)
+ assert danger_samples.dtype == bool
+
+ # Test 'noise' classification
+ noise_samples = smote._in_danger_noise(nn_estimator, X[Y==0], 0, Y, kind='noise')
+ assert isinstance(noise_samples, np.ndarray)
+ assert noise_samples.dtype == bool
+
+    # An unknown 'kind' must raise a ValueError
+    with pytest.raises(ValueError, match="'kind' should be"):
+        smote._in_danger_noise(nn_estimator, X[Y==0], 0, Y, kind='invalid')
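+
+# Sketch of a check for _make_samples: synthetic points are interpolated
+# between a minority sample and one of its nearest neighbours, so (up to a
+# small tolerance) they stay inside the bounding box of the minority class.
+# The k + 1 neighbour graph is built with NearestNeighbors and its
+# self-neighbour column is dropped, mirroring how SMOTE uses it.
+def test_make_samples_interpolation():
+    smote = SMOTE(random_state=RND_SEED)
+    X_min = X[Y == 0]
+    nn = NearestNeighbors(n_neighbors=4).fit(X_min)
+    nn_num = nn.kneighbors(X_min, return_distance=False)[:, 1:]
+
+    X_new, y_new = smote._make_samples(X_min, Y.dtype, 0, X_min, nn_num, n_samples=10)
+
+    assert X_new.shape == (10, X.shape[1])
+    assert np.all(y_new == 0)
+    assert np.all(X_new >= X_min.min(axis=0) - R_TOL)
+    assert np.all(X_new <= X_min.max(axis=0) + R_TOL)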
+
+def test_smote_fit_resample():
+ smote = SMOTE(random_state=RND_SEED)
+ X_resampled, y_resampled = smote.fit_resample(X, Y)
+
+ assert X_resampled.shape[0] > X.shape[0]
+ assert y_resampled.shape[0] > Y.shape[0]
+ assert np.sum(y_resampled == 0) == np.sum(y_resampled == 1)
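+
+# Sketch of an end-to-end sparse check: SMOTE is expected to accept a CSR
+# matrix and to balance the classes just as in the dense case, exercising
+# the sparse branch of _generate_samples.
+def test_smote_fit_resample_sparse():
+    from scipy import sparse
+
+    smote = SMOTE(random_state=RND_SEED)
+    X_resampled, y_resampled = smote.fit_resample(sparse.csr_matrix(X), Y)
+
+    assert sparse.issparse(X_resampled)
+    assert X_resampled.shape[0] == y_resampled.shape[0]
+    assert np.sum(y_resampled == 0) == np.sum(y_resampled == 1)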