During my hyperparameter-search loop, my model seems to persist across the init phase, so training resumes from the last epoch instead of starting a fresh new cycle.
I'm trying to solve a regression problem with a simple dense network on a CPU first; then I'll move to GPU.
I'm running Python 3.10.4 with PyTorch 2.3.0+cu118.
import gc
import math
import os
import random
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import ParameterGrid
from torch.utils.data import TensorDataset, DataLoader
def set_seeds(seed):
    """Seed every RNG used by the stack so runs are reproducible.

    Args:
        seed: integer seed applied to Python, NumPy and PyTorch RNGs.
    """
    # NOTE: setting PYTHONHASHSEED at runtime only affects *future*
    # interpreter processes (e.g. subprocesses); it does not change hash
    # randomization of the current process.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every GPU, not just the current one
    np.random.seed(seed)
    random.seed(seed)
    # Uncomment for fully deterministic (but slower) cuDNN kernels:
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
# Seed every RNG once at import time.
# NOTE(review): SEED is not defined anywhere in this chunk -- confirm it is
# assigned earlier in the full file, otherwise this line raises NameError.
set_seeds(SEED)
class EarlyStopping:
    """Signals when training should halt because validation loss stalled.

    Tracks the best validation loss seen so far; once `patience`
    consecutive calls fail to improve on it by more than `min_delta`,
    `should_stop` returns True.
    """

    def __init__(self, patience=25, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_val_loss = float("inf")
        self.counter = 0

    def should_stop(self, val_loss):
        """Record one validation result; return True when patience runs out."""
        improved = val_loss < self.best_val_loss - self.min_delta
        if not improved:
            self.counter += 1
            return self.counter >= self.patience
        self.best_val_loss = val_loss
        self.counter = 0
        return False
def weights_init(module):
    """Initialise Linear layers: U(0, 1/sqrt(fan_in)) weights, zero bias.

    Intended for ``model.apply(weights_init)``; any module that is not an
    ``nn.Linear`` is left untouched.
    """
    # isinstance is more robust than matching the class-name string, which
    # would also catch unrelated classes such as Bilinear or LazyLinear.
    if isinstance(module, nn.Linear):
        bound = 1.0 / math.sqrt(module.in_features)
        with torch.no_grad():  # in-place init without autograd tracking
            module.weight.uniform_(0.0, bound)
            if module.bias is not None:  # Linear(..., bias=False) has no bias
                module.bias.fill_(0)
class Dense_Regressor_RELU(nn.Module):
    """Plain MLP regressor: input block, stacked hidden blocks, linear head.

    Every block is Linear -> BatchNorm1d -> ReLU -> Dropout, all sized by
    ``Model_params['Neurons_number']``; the head maps to ``Target_Number``
    regression outputs.
    """

    def __init__(self, input_Features_nb, Model_params, Target_Number=3):
        super().__init__()
        self.params = Model_params
        width = Model_params['Neurons_number']
        drop = Model_params['Drop_rate']
        # First block projects the raw features onto the hidden width.
        self.input_layer = nn.Sequential(
            nn.Linear(input_Features_nb, width),
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Dropout(drop),
        )
        # Hidden blocks, named explicitly so state_dict keys stay stable.
        hidden = OrderedDict()
        for idx in range(1, Model_params['layer_number']):
            hidden["Dense" + str(idx)] = nn.Linear(width, width)
            hidden["Norm" + str(idx)] = nn.BatchNorm1d(width)
            hidden["Act" + str(idx)] = nn.ReLU()
            hidden["Drop" + str(idx)] = nn.Dropout(drop)
        self.dense_layer = nn.Sequential(hidden)
        # Regression head.
        self.decision_layer = nn.Linear(width, Target_Number)

    def forward(self, x):
        """Run the three stages in order and return raw predictions."""
        return self.decision_layer(self.dense_layer(self.input_layer(x)))
# ---- Run configuration ---------------------------------------------------
support="ELEC"
Test_Window=25
Model_Verbose=2  # 0 = silent, 1 = early-stop summary only, 2 = per-epoch logs
# Hyper-parameters for one training run.
# NOTE(review): the RNN_* entries are never read by Dense_Regressor_RELU --
# presumably shared with another model type; confirm before removing.
Loop_params={ 'RNN_ACT':'',
'RNN_neurons': 0,
'RNN_layers': 0,
'RNN_drop': 0,
"RNN_bidir":False,
'Time_Layering':0,#4,8,12
'Time_Layer_Order':"Act_to_Past",#"Past_to_Act"
"layer_number":4,
"Neurons_number":128,
"Drop_rate":0.3,
"Epochs_number":500,
"batch_size":128,
"Cutter__Compo":100,
'Min_Max_Loss_Weight':5,
'OPTI_LR':0.001
}
Params_New={
'Model_type':"DL",
'Model_output_type':'multiple', #or Multiple if only one model to predict close min & max
'Model_pred_type':'evolution', # or absolute, evolution if prediction on % evolution, full_evolution if dropping all absolute values
'Inputs':["Evolution_Past","Add_FinancialV3","Add_Period","Add_Croisement","Add_DELTA","Add_Correlation"],
'Model_Params':{}
}
Params_New['Model_Params'].update(**Loop_params)
#Oracle is my object containing data loading, cleaning transforming etc...
# NOTE(review): Oracle_custom is defined elsewhere in the project; not
# visible from this chunk.
Oracle_Support=Oracle_custom(support,Custom_dict=Params_New,API_CALL=False,Test_Window=Test_Window)
Oracle_Support.Load_And_Clean_Data()
Oracle_Support.Pipe_Fit()
# Train/test splits, already transformed and converted to torch tensors.
X_Train,Y_Train,X_Test,Y_Test=Oracle_Support.Return_Train_Test(To_Tensor=True,Transform=True)
#Model Init
# NOTE(review): plain alias, not a copy -- mutating Model_params also
# mutates Loop_params.
Model_params=Loop_params
model=Dense_Regressor_RELU(input_Features_nb=X_Train.shape[1],Model_params=Model_params)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=Model_params['OPTI_LR'])
early_stopping = EarlyStopping(patience=25)
### Training Loop
best_val_loss = float("inf")
best_epoch=0
best_model_state_dict = None  # guard: stays None if no epoch ever improves
model.apply(weights_init)
# https://pytorch.org/torcheval/main/metric_example.html
for epoch in range(Model_params['Epochs_number']):
    model.train()
    for i in range(0, len(X_Train), Model_params['batch_size']):
        Xbatch = X_Train[i:i+Model_params['batch_size']]
        ybatch = Y_Train[i:i+Model_params['batch_size']]
        y_pred = model(Xbatch)
        # NOTE(review): column 2 appears twice and column 0 never does --
        # the last term was probably meant to be loss_fn(y_pred[:,0], ybatch[:,0]).
        # Left as-is pending confirmation of the intended target columns.
        loss = Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,1], ybatch[:,1])+Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,2], ybatch[:,2])+loss_fn(y_pred[:,2], ybatch[:,2])
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        y_pred = model(X_Test)
        # NOTE(review): same duplicated column-2 term as the training loss.
        val_loss = loss_fn(y_pred[:,1], Y_Test[:,1])+loss_fn(y_pred[:,2], Y_Test[:,2])+loss_fn(y_pred[:,2], Y_Test[:,2])
        # NOTE(review): MSELoss already averages over samples; dividing by
        # the sample count again only rescales val_loss (harmless for
        # comparisons, but the printed value is not a plain MSE).
        val_loss /= X_Test.shape[0]
    if Model_Verbose>1:
        print(f"Époque {epoch}, latest loss {loss}, val_loss: {val_loss:.4f}")
    # Best Model Saving
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch  # record the epoch the best model actually occurred
        # BUG FIX: state_dict() returns live references to the model's
        # tensors; without cloning, the "best" snapshot silently keeps
        # training along with the model and the restore below is a no-op.
        best_model_state_dict = {k: v.detach().clone()
                                 for k, v in model.state_dict().items()}
        if Model_Verbose>1:
            print("Meilleur modèle trouvé")
    #Early Stopping
    if early_stopping.should_stop(val_loss):
        if Model_Verbose>0:
            print(f'{support} Arrêt anticipé, Meilleure Epoch : {best_epoch} // Avec Val Loss : {best_val_loss}')
        break
# Restore the best model
if best_model_state_dict:
    model.load_state_dict(best_model_state_dict)
    # self.Best_Epoch=best_epoch
    if Model_Verbose>1:
        print("Meilleur modèle chargé !")
else:
    print("Aucun modèle à récupérer.")
# Drop every reference so the next hyper-parameter run starts from scratch.
del best_model_state_dict,best_epoch
del X_Train,X_Test,Y_Train,Y_Test
del Xbatch, ybatch,loss_fn,optimizer,early_stopping,loss
del model
gc.collect()
#### Fin Model TRAINING
When I launch this code, I get these results:
First time lauching =>
NVIDIA Arrêt anticipé, Meilleure Epoch : 25 // Avec Val Loss : 0.0004996684729121625
Second =>
NVIDIA Arrêt anticipé, Meilleure Epoch : 24 // Avec Val Loss : 0.0005961030256003141
Third=>
NVIDIA Arrêt anticipé, Meilleure Epoch : 61 // Avec Val Loss : 0.0003799324913416058
If I restart the kernel:
First time lauching =>
NVIDIA Arrêt anticipé, Meilleure Epoch : 25 // Avec Val Loss : 0.0004996684729121625
Second =>
NVIDIA Arrêt anticipé, Meilleure Epoch : 24 // Avec Val Loss : 0.0005961030256003141
Third=>
NVIDIA Arrêt anticipé, Meilleure Epoch : 61 // Avec Val Loss : 0.0003799324913416058
I don't understand why the model keeps training instead of restarting. Only restarting the kernel takes me back to the first-run result.
Please help ! :)
I've tried moving from GPU to CPU and destroying my containers — only restarting the kernel between loops works...
How can I ensure the model and all hidden variables are destroyed between loops, so that hyperparameter-tuning loops are safe?
I've solved my problem!
Here is the point: the random seeds need to be set inside the hyperparameter loop.
That means:
Hyperparams dict set
For params in hyper params :
set_seeds(SEED)
#Model Init
Model_params=Loop_params
model=Dense_Regressor_RELU(input_Features_nb=X_Train.shape[1],Model_params=Model_params)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=Model_params['OPTI_LR'])
early_stopping = EarlyStopping(patience=25)
### Trainning Loop
best_val_loss = float("inf")
best_epoch=0
model.apply(weights_init)
# https://pytorch.org/torcheval/main/metric_example.html
for epoch in range(Model_params['Epochs_number']):
model.train()
for i in range(0, len(X_Train), Model_params['batch_size']):
Xbatch = X_Train[i:i+Model_params['batch_size']]
ybatch = Y_Train[i:i+Model_params['batch_size']]
y_pred = model(Xbatch)
loss = Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,1], ybatch[:,1])+Model_params['Min_Max_Loss_Weight']*loss_fn(y_pred[:,2], ybatch[:,2])+loss_fn(y_pred[:,2], ybatch[:,2])
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()