scikit-learn · training-data · train-test-split

Identifying the contents of training and testing dataset after using train_test_split()


I am doing multi-class classification using ML. After preprocessing the data, I am using train_test_split function to divide the data into training and testing dataset. Is there a way to know how many samples from each class are present in the training and testing dataset? For example:

Class No. of Training Samples No. of Testing Samples
a 30 5
b 20 10
c 25 5

My Code:

classes = ['a','b','c']  # class labels; one sub-folder per label is expected under C["dire"]

def pp() -> None:
  """Preprocess the audio dataset end-to-end.

  Steps (all driven by module-level state defined elsewhere in the file):
    1. Collect per-class lists of wav file paths (relative to C["dire"]).
    2. Extract MFCC features for every file into X and labels into Y.
    3. One-hot encode Y with MultiLabelBinarizer.
    4. Split into train / validation / test (80 / 10 / 10) and print shapes.

  Returns None; results are only printed, not returned.

  NOTE(review): relies on externals not visible in this chunk — `C` is
  presumably a config dict whose "dire" key is the data directory, and
  `start` a timestamp taken before this call; TODO confirm.
  """
  data_list=[]  # one list of relative file paths per class, index-aligned with `classes`
  for index,label in enumerate(classes):
    class_list=[]
    # NOTE(review): 'silence' is not in `classes` (['a','b','c']), so this
    # branch is dead with the current label set — it only runs if 'silence'
    # is added to `classes`.
    if label=='silence': 
      # Synthesize silence samples by slicing the background-noise wavs
      # into 1-second windows every `silence_stride` frames.
      silence_path = os.path.join(C["dire"],'silence')
      if not os.path.exists(silence_path):
        os.mkdir(silence_path)
      silence_stride = 2000
      #sample_rate = 16000
      folder = os.path.join(C["dire"],'_background_noise_') 

      for file_ in os.listdir(folder):
        if '.wav' in file_:
          load_path = os.path.join(folder,file_)
          sample_rate,y = wavfile.read(load_path)
          # Slide a window of `sample_rate` frames (1 s) across the noise clip.
          for i in range(0,len(y)-sample_rate,silence_stride):
            file_path = "silence/{}_{}.wav".format(file_[:-4],i)
            y_slice = y[i:i+sample_rate]
            # Write each slice out as its own wav so it can be re-read below.
            wavfile.write(os.path.join(C["dire"],file_path),sample_rate,y_slice)
            class_list.append(file_path)
            
    else:
      # Normal class: every file under the class's folder becomes one sample.
      folder = os.path.join(C["dire"],label)
      for file_ in os.listdir(folder):
        file_path = '{}/{}'.format(label,file_)    
        class_list.append(file_path)

    random.shuffle(class_list)              
    data_list.append(class_list)


  X = []  # MFCC coefficient matrices, one per wav file
  Y = []  # string label per entry of X
  preemphasis = 0.985  # pre-emphasis filter coefficient passed to the MFCC extractor
  print("Feature Extraction Started")
  for i,class_list in enumerate(data_list): 
    for j,samples in enumerate(class_list):    
      if(samples.endswith('.wav')):
        sample_rate,audio = wavfile.read(os.path.join(C["dire"],samples))
        # Left-pad short clips with zeros so every clip is >= 1 s of frames.
        if(audio.size<sample_rate):
            audio = np.pad(audio,(sample_rate-audio.size,0),mode="constant")
        # NOTE(review): `mfccforconfidence` is a module defined elsewhere;
        # coeff is assumed to be a 2-D (frames x coefficients) array — confirm.
        coeff = mfccforconfidence.mfcc(audio,sample_rate,preemphasis) 
        X.append(coeff)
          #print(X)
        # Label comes from the top-level folder name of the sample path;
        # background-noise slices are relabelled as 'silence'.
        if(samples.split('/')[0] in classes):
            Y.append(samples.split('/')[0])
        elif(samples.split('/')[0]=='_background_noise_'):
            Y.append('silence')
        
  # NOTE(review): `A` is built here but never used afterwards — the
  # normalize/split below operate on the list `X`, not on `A`.
  A = np.zeros((len(X),X[0].shape[0],X[0][0].shape[0]),dtype='object')
  
  for i in range(0,len(X)):
    A[i] = np.array(X[i])      #Converting list X into array A
    # print(A.shape)
    
  
  end1 = time.time()
  # NOTE(review): `start` must be set by the caller before invoking pp().
  print("Time taken for feature extraction:{}sec".format(end1-start))

  
  MLB = MultiLabelBinarizer() # one hot encoding for converting labels into binary form
  
  # Each Y entry holds a single label, so the ', ' split yields one-element
  # lists; MultiLabelBinarizer then acts as a plain one-hot encoder.
  MLB.fit(pd.Series(Y).fillna("missing").str.split(', '))
  Y_MLB = MLB.transform(pd.Series(Y).fillna("missing").str.split(', '))
  MLB.classes_        #Same like classes array
  print(Y_MLB.shape)
  Y = Y_MLB

  
  X = tf.keras.utils.normalize(X)
  # 80% train, then split the remaining 20% evenly into validation and test.
  X_train,X_valtest,Y_train,Y_valtest = train_test_split(X,Y,test_size=0.2,random_state=37)
  X_val,X_test,Y_val,Y_test = train_test_split(X_valtest,Y_valtest,test_size=0.5,random_state=37)

  print(X_train.shape,X_val.shape,X_test.shape,Y_train.shape,Y_val.shape,Y_test.shape)


So, basically I am using ML for audio classification. After extracting the features, I divide the data into training and testing dataset.

I hope this piece of code provides enough context to answer the question.


Solution

  • If your data is in a NumPy array (the demo below builds a 2D array of strings, since mixing numbers and labels coerces everything to one dtype), here's a demonstration of one way you could do it.

    import numpy as np
    from random import randint, choices
    from sklearn.model_selection import train_test_split
    from collections import Counter

    # Build a toy dataset: 100 rows of (number, class label, number).
    # NumPy coerces the mixed columns into a single string dtype.
    left_feature = (randint(0, 100) for _ in range(100))
    labels = choices(["a", "b", "c"], k=100)
    right_feature = (randint(0, 100) for _ in range(100))
    my_data = np.array(list(zip(left_feature, labels, right_feature)))

    # Peek at the first five rows, e.g.:
    # [['69' 'a' '38']
    #  ['18' 'c' '73']
    #  ['57' 'a' '50']
    #  ['35' 'a' '60']
    #  ['52' 'b' '1']]
    print(my_data[0:5, :])

    # Columns 0 and 1 are the features, column 2 the target
    # (default split: 75% train / 25% test).
    X_train, X_test, y_train, y_test = train_test_split(my_data[:, [0, 1]], my_data[:, 2])

    # Count how many samples of each label landed in each split.
    print(Counter(X_train[:, 1]))
    # e.g. Counter({'c': 31, 'b': 26, 'a': 18})

    print(Counter(X_train[:, 1])["a"])
    # e.g. 18
    print(Counter(X_test[:, 1]))
    # e.g. Counter({'b': 12, 'c': 7, 'a': 6})