I am trying to implement and train an SVM multi-class classifier from scratch using python and numpy in jupyter notebooks.
I have been using the CS231n course as my base of knowledge, especially this page: https://cs231n.github.io/optimization-1/ which discusses gradient descent. I have implemented a class, SVM, that I believe is on the right track.
Here is the basic profile for that class:
class SVM:
def __init__(self):
self.weights = np.random.randn(len(labels), X_train.shape[1]) * 0.1
self.history = []
def predict(self, X):
'''
returns class predictions in np array of size
n x num_classes, where n is the number of examples in X
'''
#matrix multiplication to apply weights to X
bounds = self.weights @ X.T
#return the predictions
return np.array(bounds).T
def loss(self, scores, y, delta=1):
'''computes the loss'''
#calculate and return the loss for a prediction and corresponding truth label
#hinge loss in this case
total_loss = 0
#compute loss for each example...
for i in range(len(scores)):
#extract values for this example
scores_of_x = scores[i]
label = y[i]
correct_score = scores_of_x[label]
incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))
#use the scores for example x to compute the loss at x
wj_xi = correct_score #these should be a vector of INCORRECT scores
wyi_xi = incorrect_scores #this should be a vector of the CORRECT score
wy_xi = wj_xi - wyi_xi + delta #core of the hinge loss formula
losses = np.maximum(0, wy_xi) #lower bound the losses at 0
loss = np.sum(losses) #sum the losses
#add to the total loss
total_loss += loss
#return the loss
avg_loss = total_loss / len(scores)
return avg_loss
def gradient(self, scores, X, y, delta=1):
'''computes the gradient'''
#calculate the loss and the gradient of the loss function
#gradient of hinge loss function
gradient = np.zeros(self.weights.shape)
#calculate the gradient in each example in x
for i in range(len(X)):
#extract values for this example
scores_of_x = scores[i]
label = y[i]
x = X[i]
correct_score = scores_of_x[label]
incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))
#
##
### start by computing the gradient of the weights of the correct classifier
##
#
wj_xi = correct_score #these should be a vector of INCORRECT scores
wyi_xi = incorrect_scores #this should be a vector of the CORRECT score
wy_xi = wj_xi - wyi_xi + delta #core of the hinge loss formula
losses = np.maximum(0, wy_xi) #lower bound the losses at 0
#get number of nonzero losses, and scale data vector by them to get the loss
num_contributing_classifiers = np.count_nonzero(losses)
#print(f"Num loss contributors: {num_contributing_classifiers}")
g = -1 * x * num_contributing_classifiers #NOTE the -, very important here, doesn't apply to other scores
#add the gradient of the correct classifier to the gradient
gradient[label] += g #because arrays are 0-indexed, but the labels are 1-indexed
# print(f"correct label: {label}")
#print(f"gradient:\n{gradient}")
#
##
### then, compute the gradient of the weights for each incorrect classifier
##
#
for j in range(len(scores_of_x)):
#skip the correct score, since we already did it
if j == label:
continue
wj_xi = scores_of_x[j] #should be a vector containing the score of the CURRENT classifier
wyi_xi = correct_score #should be a vector containing the score of the CORRECT classifier
wy_xi = wj_xi - wyi_xi + delta #core of the hinge loss formula
loss = np.maximum(0, wy_xi) #lower bound the loss at 0
#get whether this classifier contributed to the loss, and scale the data vector by that to get the gradient
contributed_to_loss = 0
if loss > 0:
contributed_to_loss = 1
g = x * contributed_to_loss #either times 1 or times 0
#add the gradient of the incorrect classifier to the gradient
gradient[j] += g
#divide the gradient by number of examples to get the average gradient
return gradient / len(X)
def fit(self, X, y, epochs = 1000, batch_size = 256, lr=1e-2, verbose=True):
#gradient descent loop
for epoch in range(epochs):
self.history.append({'epoch': epoch})
#create a batch of samples to calculate the gradient
#NOTE: this significantly boosts the speed of training
indices = np.random.choice(len(X), batch_size, replace=False)
X_batch = X.iloc[indices]
y_batch = y.iloc[indices]
X_batch = X_batch.to_numpy()
y_batch = y_batch.to_numpy()
#evaluate class scores on training set
predictions = self.predict(X_batch)
predicted_classes = np.argmax(predictions, axis=1)
#compute the loss: average hinge loss
loss = self.loss(predictions, y_batch)
self.history[-1]['loss'] = loss
#compute accuracy on the test set, for an intuitive metric
accuracy = np.mean(predicted_classes == y_batch)
self.history[-1]['accuracy'] = accuracy
#print progress
if epoch%50 == 0 and verbose:
print(f"Epoch: {epoch} | Loss: {loss} | Accuracy: {accuracy} | LR: {lr} \n")
#compute the gradient on the scores assigned by the classifier
gradient = self.gradient(predictions, X_batch, y_batch)
#backpropagate the gradient to the weights + bias
step = gradient * lr
#perform a parameter update, in the negative??? direction of the gradient
self.weights += step
That is my implementation. The fit() method is the one that trains the weights on the data passed in. I am at a stage where loss tends to decrease from one iteration to the next.
But, the problem is, accuracy drops down to zero even as loss decreases.
I know that they are not directly related, but shouldn't my accuracy generally trend upwards as loss goes down? This makes me think I have done something wrong in the loss() and gradient() methods. But, I can't seem to find where I went wrong. Also, sometimes, my loss will increase from one epoch to the next. This could be an impact of my batched evaluation of the gradient, but I am not certain.
Here is a link to my Jupyter notebook, which should let you run my code in its current state:
https://colab.research.google.com/drive/12z4DevKDicmT4iE6AlMGrRiN6He8R9_4#scrollTo=uBTUQlscWksP And here is a link to the data set I am using: https://www.kaggle.com/datasets/taweilo/fish-species-sampling-weight-and-height-data/code
To anyone who runs across this thread, I solved my problem. Turns out, I was misreading the formula, and had the locations of two of the terms mixed up. The comments in my original code were actually correct. The variables wj_xi and wyi_xi should actually be defined like this (in both the gradient and the loss methods):
wj_xi = incorrect_scores #these should be a vector of INCORRECT scores
wyi_xi = correct_score #this should be a vector of the CORRECT score
I had them flipped around. Also, as was mentioned in the replies, it is important to update the weights in the negative direction of the gradient, like so:
self.weights -= step
Full code:
class SVM:
def __init__(self):
self.weights = np.random.randn(len(labels), X_train.shape[1]) * 0.1 #9 sets of weights (9 classes) and 4 entries per set (3 features + 1 bias, shape= (9x4))
self.history = []
def predict(self, X):
'''
returns class predictions in np array of size
n x 10, where n is the number of examples in X
'''
#matrix multiplication to apply weights to X
bounds = self.weights @ X.T
#return the predictions
return np.array(bounds).T
def loss(self, scores, y, delta=1):
'''
returns the average hinge loss of the batch
'''
#calculate and return the loss for a prediction and corresponding truth label
#hinge loss in this case
total_loss = 0
#compute loss for each example...
for i in range(len(scores)):
#extract values for this example
scores_of_x = scores[i]
label = y[i]
correct_score = scores_of_x[label]
incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))
#use the scores for example x to compute the loss at x
wj_xi = incorrect_scores #these should be a vector of INCORRECT scores
wyi_xi = correct_score #this should be a vector of the CORRECT score
wy_xi = wj_xi - wyi_xi + delta #core of the hinge loss formula
losses = np.maximum(0, wy_xi) #lower bound the losses at 0
loss = np.sum(losses) #sum the losses
#add to the total loss
total_loss += loss
#return the loss
avg_loss = total_loss / len(scores) #divide by the number of examples to fund the average hinge loss per-example
return avg_loss
def gradient(self, scores, X, y, delta=1):
'''
returns the gradient of the loss function
'''
#calculate the loss and the gradient of the loss function
#gradient of hinge loss function
gradient = np.zeros(self.weights.shape)
#calculate the gradient in each example in x
for i in range(len(X)):
#extract values for this example
scores_of_x = scores[i]
label = y[i]
x = X[i]
correct_score = scores_of_x[label] #because arrays are 0-indexed, but the labels are 1-indexed
incorrect_scores = np.concatenate((scores_of_x[:label], scores_of_x[label+1:]))
#
##
### start by computing the gradient of the weights of the correct classifier
##
#
wj_xi = incorrect_scores #these should be a vector of INCORRECT scores
wyi_xi = correct_score #this should be a vector of the CORRECT score
wy_xi = wj_xi - wyi_xi + delta #core of the hinge loss formula
losses = np.maximum(0, wy_xi) #lower bound the losses at 0
#get number of nonzero losses, and scale data vector by them to get the loss
num_contributing_classifiers = np.count_nonzero(losses)
#print(f"Num loss contributors: {num_contributing_classifiers}")
g = -1 * x * num_contributing_classifiers #NOTE the -, very important here, doesn't apply to other scores
#add the gradient of the correct classifier to the gradient
gradient[label] += g #because arrays are 0-indexed, but the labels are 1-indexed
# print(f"correct label: {label}")
#print(f"gradient:\n{gradient}")
#
##
### then, compute the gradient of the weights for each incorrect classifier
##
#
for j in range(len(scores_of_x)):
#skip the correct score, since we already did it
if j == label:
continue
wj_xi = scores_of_x[j] #should be a vector containing the score of the CURRENT classifier
wyi_xi = correct_score #should be a vector containing the score of the CORRECT classifier
wy_xi = wj_xi - wyi_xi + delta #core of the hinge loss formula
loss = np.maximum(0, wy_xi) #lower bound the loss at 0
#get whether this classifier contributed to the loss, and scale the data vector by that to get the gradient
contributed_to_loss = 0
if loss > 0:
contributed_to_loss = 1
g = x * contributed_to_loss #either times 1 or times 0
#add the gradient of the incorrect classifier to the gradient
gradient[j] += g
#divide the gradient by number of examples to get the average gradient
return gradient / len(X)
def fit(self, X, y, epochs = 1000, batch_size = 256, lr=1e-2, verbose=True):
'''
trains the model on the training set
'''
#gradient descent loop
for epoch in range(epochs):
self.history.append({'epoch': epoch})
#create a batch of samples to calculate the gradient
#NOTE: this significantly boosts the speed of training
indices = np.random.choice(len(X), batch_size, replace=False)
X_batch = X.iloc[indices]
y_batch = y.iloc[indices]
X_batch = X_batch.to_numpy()
y_batch = y_batch.to_numpy()
#evaluate class scores on training set
predictions = self.predict(X_batch)
predicted_classes = np.argmax(predictions, axis=1)
if epoch%50 == 0 and verbose:
print(f"pred: {predicted_classes[:10]}")
print(f"true: {y_batch[:10]}")
#compute the loss: average hinge loss
loss = self.loss(predictions, y_batch)
self.history[-1]['loss'] = loss
#compute accuracy on the test set, for an intuitive metric
accuracy = np.mean(predicted_classes == y_batch)
self.history[-1]['accuracy'] = accuracy
#reduce the learning rate as training progresses
# lr *= 0.999
# self.history[-1]['lr'] = lr
if epoch%50 == 0 and verbose:
print(f"Epoch: {epoch} | Loss: {loss} | Accuracy: {accuracy} | LR: {lr} \n")
#compute the gradient on the scores assigned by the classifier
gradient = self.gradient(predictions, X_batch, y_batch)
#print(gradient)
#backpropagate the gradient to the weights + bias
step = gradient * lr
#perform a parameter update, in the negative direction of the gradient
self.weights -= step
sm = SVM()
pred = sm.predict(np.array(X_train[0:1]))
sm.fit(X_train, y_train)