Using Python3.6, TF 1.15, imblearn 0.0
I have an imbalanced data set, 3 classes, two are even, one is low. I am trying to apply SMOTE to the dataset, however, I am using flow from directory and I found out I can supposedly obtain X_train and y_train from the data generator using next(train_generator).
The problem is my generator appears to be outputting only one class to the y_train. If I use ravel it gives me the following error:
Found 22089 images belonging to 3 classes.
Found 2136 images belonging to 3 classes.
Found 792 images belonging to 3 classes.
Traceback (most recent call last):
File ".py", line 93, in <module>
X_train_smote, y_train_smote = smote.fit_sample(X_train.reshape(X_train.shape[0], -1), y_train.ravel())
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\imblearn\base.py", line 77, in fit_resample
X, y, binarize_y = self._check_X_y(X, y)
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\imblearn\base.py", line 135, in _check_X_y
X, y, reset=True, accept_sparse=accept_sparse
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\sklearn\base.py", line 432, in _validate_data
X, y = check_X_y(X, y, **check_params)
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\sklearn\utils\validation.py", line 72, in inner_f
return f(**kwargs)
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\sklearn\utils\validation.py", line 812, in check_X_y
check_consistent_length(X, y)
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\sklearn\utils\validation.py", line 256, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [2, 6]
(2, 224, 224, 3)
(2, 3)
Process finished with exit code 1
and if I just chuck in y_train without .ravel() I get this:
Found 22089 images belonging to 3 classes.
Found 2136 images belonging to 3 classes.
Found 792 images belonging to 3 classes.
(2, 224, 224, 3)
(2, 3)
Traceback (most recent call last):
File ".py", line 93, in <module>
X_train_smote, y_train_smote = smote.fit_sample(X_train.reshape(X_train.shape[0], -1), y_train)
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\imblearn\base.py", line 80, in fit_resample
self.sampling_strategy, y, self._sampling_type
File ".virtualenvs\TF15_Environment-fh5Z3l1i\lib\site-packages\imblearn\utils\_validation.py", line 533, in check_sampling_strategy
" Got {} class instead".format(np.unique(y).size)
ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead
Here is me code, appreciate any advice! Thanks :)
import datetime
import numpy as np
import cv2
import tensorflow as tf
from tensorflow.keras import backend as k
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from imblearn.over_sampling import SMOTE
smote = SMOTE()
k.clear_session()
tf.set_random_seed(42)
np.random.seed(42)
currentDay = datetime.date.today()
now = datetime.datetime.now()
t = now.strftime("%H-%M-%S")
NAME = f'{currentDay}_{t}new_model_001.h5'
tboard = TensorBoard(log_dir=f'logs\\{NAME}',
update_freq="epoch",
histogram_freq=1,
write_grads=True,
write_graph=True,
)
# config
img_width = 224
img_height = 224
INPUT_DEPTH = 3
input_shape = (img_height, img_width, INPUT_DEPTH)
TRAIN_DATA_DIR = 'dataset/train/'
VALIDATION_DATA_DIR = 'dataset/validation/'
TESTING_DATA_DIR = 'dataset/test/'
MODEL_DIR = 'h5_Models/'
EPOCHS = 500
PATIENCE = 25
BATCH_SIZE = 2
MODEL_NAME = 'new_model_001.h5'
train_datagen = ImageDataGenerator(
rescale=1/255,
# zca_whitening=True,
# zca_epsilon=0.1,
# rotation_range=5,
width_shift_range=0.1,
height_shift_range=0.1,
shear_range=0.1,
zoom_range=(0.95, 0.95),
# data_format='channels_last',
horizontal_flip=True,
# vertical_flip=True,
fill_mode='nearest'
)
validation_datagen = ImageDataGenerator(rescale=1/255)
test_datagen = ImageDataGenerator(rescale=1/255)
train_generator = train_datagen.flow_from_directory(
TRAIN_DATA_DIR,
# color_mode='grayscale',
target_size=(img_height, img_width),
batch_size=BATCH_SIZE,
class_mode='categorical',
shuffle=True)
validation_generator = validation_datagen.flow_from_directory(
VALIDATION_DATA_DIR,
# color_mode='grayscale',
target_size=(img_height, img_width),
batch_size=BATCH_SIZE,
class_mode='categorical')
testing_generator = test_datagen.flow_from_directory(
TESTING_DATA_DIR,
# color_mode='grayscale',
target_size=(img_height, img_width),
batch_size=BATCH_SIZE,
class_mode='categorical',
)
X_train, y_train = next(train_generator)
print(X_train.shape)
print(y_train.shape)
X_train_smote, y_train_smote = smote.fit_sample(X_train.reshape(X_train.shape[0], -1), y_train.ravel())
print(X_train_smote.count)
X_train_smote = X_train_smote.reshape(X_train_smote.shape[0], 224, 224, 3)
When you use next(train_generator)
, you are simply considering a single batch of the train dataset which may just have a single class of images for some batches. SMOTE, however, if to be correctly applied, is applied with the entire dataset in consideration or a sample of it that is representative of all the classes and matching in distribution.