I want to make an augmented version of my dataset. It was manually labelled and downloaded from the RIWA dataset.
First I build the class dataframe with this code:
import os
import pandas as pd

source_dir = r'./river-water-segmentation-dataset/riwa_v2'
subdir = os.listdir(source_dir)
filepaths = []
labels = []
for i in subdir:
    classpath = os.path.join(source_dir, i)
    if os.path.isdir(classpath):
        file_list = os.listdir(classpath)
        for f in file_list:
            file_path = os.path.join(classpath, f)
            filepaths.append(file_path)
            labels.append(i)  # each subfolder name is used as the class label

paths = pd.Series(filepaths, name='paths')
labels = pd.Series(labels, name='labels')
df = pd.concat([paths, labels], axis=1)
print(df.head())
print("========================")
print(df['labels'].value_counts())
print("=========================")
print('Total data: ', len(df))
Then I cap them at 700 each for a start; I might increase this later for a bigger dataset:
sample_list = []
max_size = 1500  # TODO: change this value
grouping = df.groupby('labels')
for label in df['labels'].unique():
    group = grouping.get_group(label)
    group_size = len(group)
    if group_size > max_size:
        samples = group.sample(max_size, replace=False, weights=None, axis=0).reset_index(drop=True)
    else:
        samples = group.sample(frac=1.0, replace=False, axis=0).reset_index(drop=True)
    sample_list.append(samples)
df = pd.concat(sample_list, axis=0).reset_index(drop=True)
print(df['labels'].value_counts())
print('Total data: ', len(df))
From there I create the augmented dataset with this code:
import os
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator

working_dir = r'./river-water-segmentation-dataset/riwa_v2/cropped'
aug_dir = os.path.join(working_dir, 'aug')
if os.path.isdir(aug_dir):
    shutil.rmtree(aug_dir)
os.mkdir(aug_dir)
for label in df['labels'].unique():
    dir_path = os.path.join(aug_dir, label)
    os.mkdir(dir_path)
print(os.listdir(aug_dir))

target = 700  # set the target count for each class in df
gen = ImageDataGenerator(
    rotation_range=90,
    horizontal_flip=True,
    vertical_flip=True,
)
grouping = df.groupby('labels')  # group by class
for label in df['labels'].unique():  # for every class
    group = grouping.get_group(label)  # a dataframe holding only rows with the specified label
    sample_count = len(group)  # determine how many samples there are in this class
    # if group.empty:
    #     print(f"No images found for label '{label}'. Skipping augmentation.")
    #     continue
    if sample_count < target:  # if the class has fewer than the target number of images
        aug_img_count = 0
        delta = target - sample_count  # number of augmented images to create
        target_dir = os.path.join(aug_dir, label)  # define where to write the images
        aug_gen = gen.flow_from_dataframe(
            group,
            x_col='paths',
            y_col=None,
            target_size=(1420, 1080),  # change this target size based on transfer learning model
            class_mode=None,
            batch_size=1,
            shuffle=False,
            save_to_dir=target_dir,
            save_prefix='aug-',
            save_format='jpg'
        )
        images = next(aug_gen)  # try fetching a batch
        print(f"Generated {len(images)} images.")
        while aug_img_count < delta:
            images = next(aug_gen)
            aug_img_count += len(images)
At first the import was written as from tensorflow.preprocessing.image import ImageDataGenerator, following another answer I can no longer find, but for this version of Keras the class was moved to the tensorflow.keras.preprocessing.image import path, so the code above uses that instead.
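For reference, the import that works here (assuming TensorFlow 2.x, where Keras is bundled as tensorflow.keras) is simply:

# TensorFlow 2.x ships Keras under tensorflow.keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator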
I ran the code for over 10 minutes and it still reports "Found 0 validated image filenames". Is there something I did wrong? Is it because I downloaded the CPU version of TensorFlow?
Edit 1: I thought it was a size problem, so I cropped all the dataset images to the same size, but the code still doesn't work.
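As a sanity check on the dataframe itself (a debugging sketch of my own, assuming the df built above, not part of the pipeline): flow_from_dataframe only counts a row as validated when its x_col value points to an existing file with a supported image extension (png, jpg, jpeg, bmp, tif, ...), so something like the following shows whether the paths or the extensions are the problem:

import os
# Hypothetical check: how many of the dataframe paths actually exist on disk,
# and which file extensions they carry.
print(df['paths'].map(os.path.isfile).value_counts())
print(df['paths'].str.rsplit('.', n=1).str[-1].str.lower().value_counts())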
The problem lies with the RIWA dataset, which stores images and masks in separate folders. I moved these two folders (images and masks) under the train folder, so that the train folder now contains both images and masks. After switching to the ImageDataGenerator.flow_from_directory method and making these adjustments, the code works. Please refer to this gist.
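The gist itself is not reproduced here; a minimal sketch of that kind of flow_from_directory setup (my own reconstruction under assumed paths and parameters, not the exact code from the gist) pairs two generators with the same augmentation settings and seed so each image and its mask receive identical random transforms:

import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Assumed layout after moving the folders:
#   riwa_v2/train/images/...   original photos
#   riwa_v2/train/masks/...    corresponding segmentation masks
train_dir = r'./river-water-segmentation-dataset/riwa_v2/train'
aug_args = dict(rotation_range=90, horizontal_flip=True, vertical_flip=True)
seed = 42  # same seed for both generators -> matching transforms per pair

os.makedirs('./aug/images', exist_ok=True)  # hypothetical output folders
os.makedirs('./aug/masks', exist_ok=True)

image_gen = ImageDataGenerator(**aug_args).flow_from_directory(
    train_dir,
    classes=['images'],          # restrict this generator to the images subfolder
    class_mode=None,
    target_size=(1420, 1080),
    batch_size=1,
    shuffle=False,
    seed=seed,
    save_to_dir='./aug/images',
    save_prefix='aug-',
    save_format='jpg')

mask_gen = ImageDataGenerator(**aug_args).flow_from_directory(
    train_dir,
    classes=['masks'],           # and this one to the masks subfolder
    class_mode=None,
    color_mode='grayscale',
    target_size=(1420, 1080),
    batch_size=1,
    shuffle=False,
    seed=seed,
    save_to_dir='./aug/masks',
    save_prefix='aug-',
    save_format='png')

# one pass over the dataset writes one augmented copy of every image/mask pair
for _ in range(len(image_gen)):
    next(image_gen)
    next(mask_gen)

This mirrors the image-and-mask example in the Keras ImageDataGenerator documentation; the key point is that both generators get the same seed and the same augmentation arguments.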