I am doing multi-class classification using ML. After preprocessing the data, I am using train_test_split function to divide the data into training and testing dataset. Is there a way to know how many samples from each class are present in the training and testing dataset? For example:
Class | No. of Training Samples | No. of Testing Samples |
---|---|---|
a | 30 | 5 |
b | 20 | 10 |
c | 25 | 5 |
My Code:
classes = ['a','b','c']
def pp():
data_list=[]
for index,label in enumerate(classes):
class_list=[]
if label=='silence':
silence_path = os.path.join(C["dire"],'silence')
if not os.path.exists(silence_path):
os.mkdir(silence_path)
silence_stride = 2000
#sample_rate = 16000
folder = os.path.join(C["dire"],'_background_noise_')
for file_ in os.listdir(folder):
if '.wav' in file_:
load_path = os.path.join(folder,file_)
sample_rate,y = wavfile.read(load_path)
for i in range(0,len(y)-sample_rate,silence_stride):
file_path = "silence/{}_{}.wav".format(file_[:-4],i)
y_slice = y[i:i+sample_rate]
wavfile.write(os.path.join(C["dire"],file_path),sample_rate,y_slice)
class_list.append(file_path)
else:
folder = os.path.join(C["dire"],label)
for file_ in os.listdir(folder):
file_path = '{}/{}'.format(label,file_)
class_list.append(file_path)
random.shuffle(class_list)
data_list.append(class_list)
X = []
Y = []
preemphasis = 0.985
print("Feature Extraction Started")
for i,class_list in enumerate(data_list):
for j,samples in enumerate(class_list):
if(samples.endswith('.wav')):
sample_rate,audio = wavfile.read(os.path.join(C["dire"],samples))
if(audio.size<sample_rate):
audio = np.pad(audio,(sample_rate-audio.size,0),mode="constant")
coeff = mfccforconfidence.mfcc(audio,sample_rate,preemphasis)
X.append(coeff)
#print(X)
if(samples.split('/')[0] in classes):
Y.append(samples.split('/')[0])
elif(samples.split('/')[0]=='_background_noise_'):
Y.append('silence')
A = np.zeros((len(X),X[0].shape[0],X[0][0].shape[0]),dtype='object')
for i in range(0,len(X)):
A[i] = np.array(X[i]) #Converting list X into array A
# print(A.shape)
end1 = time.time()
print("Time taken for feature extraction:{}sec".format(end1-start))
MLB = MultiLabelBinarizer() # one hot encoding for converting labels into binary form
MLB.fit(pd.Series(Y).fillna("missing").str.split(', '))
Y_MLB = MLB.transform(pd.Series(Y).fillna("missing").str.split(', '))
MLB.classes_ #Same like classes array
print(Y_MLB.shape)
Y = Y_MLB
X = tf.keras.utils.normalize(X)
X_train,X_valtest,Y_train,Y_valtest = train_test_split(X,Y,test_size=0.2,random_state=37)
X_val,X_test,Y_val,Y_test = train_test_split(X_valtest,Y_valtest,test_size=0.5,random_state=37)
print(X_train.shape,X_val.shape,X_test.shape,Y_train.shape,Y_val.shape,Y_test.shape)
So, basically I am using ML for audio classification. After extracting the features, I divide the data into training and testing dataset.
I hope that this piece of code will be useful to answer the question.
If you have a "3D numpy array", here's a demonstration of one way you could do it.
import numpy as np
from random import randint,choices
# Create some data
my_data = np.array(list(zip(
(randint(0,100) for _ in range(100)),
(choices(["a","b","c"], k=100)),
(randint(0,100) for _ in range(100))
))
)
# Show the first 5 elements
print(my_data[0:5,:])
# [['69' 'a' '38']
# ['18' 'c' '73']
# ['57' 'a' '50']
# ['35' 'a' '60']
# ['52' 'b' '1']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(my_data[:,[0,1]], my_data[:,2])
from collections import Counter
print(Counter(X_train[:,1]))
# Counter({'c': 31, 'b': 26, 'a': 18})
print(Counter(X_train[:,1])["a"])
# 18
print(Counter(X_test[:,1]))
# Counter({'b': 12, 'c': 7, 'a': 6})