How to split data into 3 folds (train, validation, test) using ImageDataGenerator when the data for each class is in a different directory
As you rightly mentioned, splitting the data into 3 folds is not possible in one line of code using Keras ImageDataGenerator.
A workaround is to store the images for the test data in a separate folder and apply ImageDataGenerator to that folder, as shown below:
# Path to Training Directory
train_dir = 'Dogs_Vs_Cats_Small/train'
# Path to Test Directory
test_dir = 'Dogs_Vs_Cats_Small/test'

# Multiply every pixel value by 1/255. `rescale` has to be passed by keyword:
# the first positional parameter of ImageDataGenerator is `featurewise_center`,
# so ImageDataGenerator(1./255) would not rescale the images at all.
Train_Gen = ImageDataGenerator(rescale=1./255)
Test_Gen = ImageDataGenerator(rescale=1./255)

# Each generator reads images from one sub-folder per class, resizes them to
# 150x150 and yields batches of 20 images with binary labels.
Train_Generator = Train_Gen.flow_from_directory(
    train_dir, target_size=(150, 150), batch_size=20, class_mode='binary')
Test_Generator = Test_Gen.flow_from_directory(
    test_dir, target_size=(150, 150), class_mode='binary', batch_size=20)
Sample code that extracts some images from the original directory and places them in two separate folders, train and test, is shown below; it may help you:
import os, shutil
# Path to the directory where the original dataset was uncompressed
original_dataset_dir = 'Dogs_Vs_Cats'
# Directory where you'll store your smaller dataset
base_dir = 'Dogs_Vs_Cats_Small2'

# Directories for the training and test splits
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'test')
# One sub-directory per class inside each split
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
test_cats_dir = os.path.join(test_dir, 'cats')
test_dogs_dir = os.path.join(test_dir, 'dogs')

# makedirs also creates the parents (base_dir, train_dir, test_dir);
# exist_ok=True lets the script be re-run without failing on existing folders.
for class_dir in (train_cats_dir, train_dogs_dir, test_cats_dir, test_dogs_dir):
    os.makedirs(class_dir, exist_ok=True)


def copy_images(prefix, indices, dst_dir):
    """Copy '<prefix>.<i>.jpg' for every i in indices into dst_dir.

    The source files are read from the 'train' folder of the original dataset.
    """
    for i in indices:
        fname = '{}.{}.jpg'.format(prefix, i)
        src = os.path.join(original_dataset_dir, 'train', fname)
        dst = os.path.join(dst_dir, fname)
        shutil.copyfile(src, dst)


# Copies the first 1,000 cat images (cat.0.jpg - cat.999.jpg) to train_cats_dir
copy_images('cat', range(1000), train_cats_dir)
# Copies 500 cat images (cat.1500.jpg - cat.1999.jpg) to test_cats_dir
copy_images('cat', range(1500, 2000), test_cats_dir)
# Copies the first 1,000 dog images (dog.0.jpg - dog.999.jpg) to train_dogs_dir
copy_images('dog', range(1000), train_dogs_dir)
# Copies 500 dog images (dog.1500.jpg - dog.1999.jpg) to test_dogs_dir
copy_images('dog', range(1500, 2000), test_dogs_dir)

# Sanity check to ensure that the training and test folders have the expected number of images
print('Number of Cat Images in Training Directory is {}'.format(len(os.listdir(train_cats_dir))))
print('Number of Dog Images in Training Directory is {}'.format(len(os.listdir(train_dogs_dir))))
print('Number of Cat Images in Testing Directory is {}'.format(len(os.listdir(test_cats_dir))))
print('Number of Dog Images in Testing Directory is {}'.format(len(os.listdir(test_dogs_dir))))
Hope this helps.
Split image dataset into train-test datasets
This should do it. The script counts how many images are in each class folder and then splits them according to the given ratio, moving the test data into a different folder with the same structure.
Save the code in a file named main.py and run the command:
python3 main.py --data_path=/path1 --test_data_path_to_save=/path2 --train_ratio=0.7
import shutil
import os
import numpy as np
import argparse
def get_files_from_folder(path):
    """Return the entry names found directly inside ``path`` as a NumPy array.

    The names are exactly what ``os.listdir`` reports: bare names that are
    not joined with ``path``.
    """
    return np.asarray(os.listdir(path))
def main(path_to_data, path_to_test_data, train_ratio):
    """Move a share of every class folder's files into a parallel test tree.

    Each immediate sub-directory of ``path_to_data`` is treated as one class.
    For every class, ``round(n_files * (1 - train_ratio))`` files are moved
    (not copied) into ``path_to_test_data/<class>``; the files that remain in
    ``path_to_data/<class>`` form the training set.

    Args:
        path_to_data: Directory containing one sub-directory per class.
        path_to_test_data: Directory that receives the test files; the class
            sub-directories are created when missing.
        train_ratio: Fraction of each class kept for training, in [0, 1].

    Raises:
        ValueError: If ``train_ratio`` is outside [0, 1].
    """
    # A ratio below 0 would ask for more files than exist and a ratio above 1
    # would silently move nothing, so reject both up front.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            "train_ratio must be between 0 and 1, got {}".format(train_ratio))

    # Only the immediate sub-directories are used; each one is a class.
    _, class_names, _ = next(os.walk(path_to_data))

    for class_name in class_names:
        class_dir = os.path.join(path_to_data, class_name)
        test_class_dir = os.path.join(path_to_test_data, class_name)
        os.makedirs(test_class_dir, exist_ok=True)

        # os.listdir returns names in arbitrary order; sorting makes the
        # selection of test files reproducible across runs and platforms.
        file_names = sorted(os.listdir(class_dir))
        n_test = int(np.round(len(file_names) * (1 - train_ratio)))

        for file_name in file_names[:n_test]:
            shutil.move(os.path.join(class_dir, file_name),
                        os.path.join(test_class_dir, file_name))
def parse_args():
    """Parse the command-line options of the dataset divider.

    Returns:
        argparse.Namespace with ``data_path`` and ``test_data_path_to_save``
        as strings and ``train_ratio`` as a float.
    """
    parser = argparse.ArgumentParser(description="Dataset divider")
    parser.add_argument("--data_path", required=True,
                        help="Path to data")
    parser.add_argument("--test_data_path_to_save", required=True,
                        help="Path to test data where to save")
    # argparse applies %-formatting to help strings, so a literal percent sign
    # must be written as "%%"; a bare "%" makes `--help` raise ValueError.
    # type=float lets argparse reject a non-numeric ratio with a usage error.
    parser.add_argument("--train_ratio", required=True, type=float,
                        help="Train ratio - 0.7 means splitting data in 70 %% train and 30 %% test")
    return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: read the command-line options and run the split.
    cli_options = parse_args()
    ratio = float(cli_options.train_ratio)
    main(cli_options.data_path, cli_options.test_data_path_to_save, ratio)
YoloV4 Custom Dataset Train Test Split
OK, you can do it like this.
Function to split the images:
def split_img_label(data_train, data_test, folder_train, foler_test):
    """Copy train/test images and their ``.txt`` label files into two folders.

    Args:
        data_train: Iterable of image paths for the training set (for example
            the pandas Series returned by ``train_test_split``).
        data_test: Iterable of image paths for the test set.
        folder_train: Destination folder for the training images and labels.
        foler_test: Destination folder for the test images and labels. The
            misspelled name is kept so keyword callers keep working; the
            original body read ``folder_test`` instead, which raised NameError.
    """
    folder_test = foler_test
    os.makedirs(folder_train, exist_ok=True)
    os.makedirs(folder_test, exist_ok=True)

    # Train folder
    _copy_images_with_labels(data_train, folder_train)
    # Test folder
    _copy_images_with_labels(data_test, folder_test)


def _copy_images_with_labels(image_paths, dest_dir):
    """Copy each image in ``image_paths`` and its sibling ``.txt`` file to ``dest_dir``."""
    import shutil

    # Show a progress bar when tqdm is installed; otherwise iterate silently.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    for image_path in progress(image_paths):
        # shutil.copy instead of a shell `cp`: portable and safe for paths
        # that contain spaces or shell metacharacters.
        shutil.copy(image_path,
                    os.path.join(dest_dir, os.path.basename(image_path)))

        # The label file has the same name as the image with a .txt extension.
        label_path = os.path.splitext(image_path)[0] + '.txt'
        # An image without a label file is still copied, as before.
        if os.path.exists(label_path):
            shutil.copy(label_path,
                        os.path.join(dest_dir, os.path.basename(label_path)))
CODE
import pandas as pd
import os
# train_test_split is provided by scikit-learn
from sklearn.model_selection import train_test_split

# Folder that holds the images (.jpg) and their label files (.txt)
PATH = './TrainingsData/'
# Names of the two output folders created by split_img_label (adjust as needed)
folder_train_name = 'train'
folder_test_name = 'test'

list_img = [img for img in os.listdir(PATH) if img.endswith('.jpg')]
list_txt = [txt for txt in os.listdir(PATH) if txt.endswith('.txt')]

# Full relative path of every image, collected in a single-column DataFrame
path_img = [PATH + img for img in list_img]
df = pd.DataFrame(path_img)

# split: 80 % of the image paths for training, 20 % for testing
data_train, data_test, labels_train, labels_test = train_test_split(
    df[0], df.index, test_size=0.20, random_state=42)

# Function split
split_img_label(data_train, data_test, folder_train_name, folder_test_name)
OUTPUT
len(list_img)
583
100%|████████████████████████████████████████████████████████████████████████████████| 466/466 [00:26<00:00, 17.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [00:07<00:00, 16.61it/s]
Finally, you will have 2 folders (folder_train_name, folder_test_name) containing the split images together with their label files.
Related Topics
How to Split a CSV File Row to Columns in Python
Python | Make the Percentage of a List
How to Split Folder of Images into Test/Training/Validation Sets With Stratified Sampling
Truth Value of a Series Is Ambiguous. Use A.Empty, A.Bool(), A.Item(), A.Any() or A.All()
Replace a Word in a String by Indexing Without "String Replace Function" -Python
How to Split Vector into Columns - Using Pyspark
Why Is This Python Script With Matplotlib So Slow
Moving Position of Character Within an Item in List
Centering Text in Ipython Notebook Markdown/Heading Cells
How to Run a Function Multiple Times and Return Different Result Python
Grab a Number After a String in a File
Python, Anaconda, Spyder - Uninstalling Python Package Using Pip Does Not Work in Spyder + Ipython
Iterating Over Every Two Elements in a List
How to Disable Pylint Unused Import Error Messages in VS Code
How to Tell Python to Convert Integers into Words
Add Missing Dates to Pandas Dataframe
How to Find the Closest Values in a Pandas Series to an Input Number