I'm working on an SVM model for a homework assignment. No matter what I do, the model picks the wrong data points as support vectors:
[scatter plot: svm]
This is my data:
[csv data]
This is my code:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split

def decision_hyperplane(clf, x, y=None, dimension=2):
    """
    Return a decision line (dimension 2, return y based on x) or a
    decision plane (dimension 3, return z based on x and y).
    Decision plane equation is wx + b = 0, so in the 2d case:
    w.dot(x) + b = w_x * x + w_y * y + b = 0
    y = (-w_x * x - b) / w_y
    """
    if dimension == 2:
        return (-clf.intercept_[0] - clf.coef_[0][0] * x) / clf.coef_[0][1]
    elif dimension == 3:
        return (-clf.intercept_[0] - clf.coef_[0][0] * x - clf.coef_[0][1] * y) / clf.coef_[0][2]
file_path = 'cell_samples.csv' # Replace with the actual file path
df = pd.read_csv(file_path)
x_column = 'UnifSize'
y_column = 'UnifShape'
class_name = 'Class'
X = df[[x_column, y_column]]
y = df[class_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)
clf = svm.SVC(kernel='linear', C=100000)
# fit the model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
#scatter data points
plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2], label='Type 1', c='blue', marker='o')
plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4], label='Type 2', c='red', marker='s')
w = clf.coef_[0]  # parameters of the hyperplane
b = clf.intercept_  # hyperplane intercept
a = -(w[0] / w[1])  # slope of the boundary line (not used below)
xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
yy = a * xx - (b[0] / w[1])  # corresponding y-points (not used below)
# plot the decision boundary (hyperplane)
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')
# Highlight only the closest support vectors
vectors = clf.support_vectors_[np.abs(clf.decision_function(clf.support_vectors_)).argsort()]
# get unique vectors (no duplicates)
vectors = np.unique(vectors, axis=0)
plt.scatter(vectors[:, 0],
            vectors[:, 1],
            s=100, facecolors='none',
            linewidth=1,
            edgecolors='k', alpha=.5,
            marker='o', label='Support Vectors'
            )
# plot description
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.legend(loc='upper right')
# Show the plot
plt.show()
From the scatter plot, you can easily see that there are only 4 support vectors.
I tried using a C value of 100000, but then the program lags and takes too long to finish.
The support vectors are learned from the training data, but in your code only the test points are plotted. I also corrected the part that pulls out the closest unique vectors.
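In scikit-learn, clf.support_ holds the indices of the support vectors within the data passed to fit(), so every support vector is a row of X_train and never of X_test. A minimal sanity check, assuming clf and X_train are the fitted model and training split from the script below:

import numpy as np
# support_ indexes into the training data passed to fit(),
# so the support vectors must all be training rows
assert np.allclose(clf.support_vectors_, X_train.to_numpy()[clf.support_])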
You can run the SVC with its default settings (C=1) and get the results below, which include both the train and test data:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
def decision_hyperplane(clf, x, y=None, dimension=2):
    """
    Return a decision line (dimension 2, return y based on x) or a
    decision plane (dimension 3, return z based on x and y).
    Decision plane equation is wx + b = 0, so in the 2d case:
    w.dot(x) + b = w_x * x + w_y * y + b = 0
    y = (-w_x * x - b) / w_y
    """
    if dimension == 2:
        return (-clf.intercept_[0] - clf.coef_[0][0] * x) / clf.coef_[0][1]
    elif dimension == 3:
        return (-clf.intercept_[0] - clf.coef_[0][0] * x - clf.coef_[0][1] * y) / clf.coef_[0][2]
file_path = '../cell_samples.csv' # Replace with the actual file path
df = pd.read_csv(file_path)
x_column = 'UnifSize'
y_column = 'UnifShape'
class_name = 'Class'
X = df[[x_column, y_column]]
y = df[class_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)
clf = svm.SVC(kernel='linear', C=1)
# fit the model
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)
# train: scatter data points
plt.scatter(X_train[x_column][y_pred_train == 2], X_train[y_column][y_pred_train == 2],
            label='train Type 1', c='darkblue', marker='o', s=50)
plt.scatter(X_train[x_column][y_pred_train == 4], X_train[y_column][y_pred_train == 4],
            label='train Type 2', c='darkred', marker='s', s=50)
# test: scatter data points
plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2],
            label='test Type 1', c='dodgerblue', marker='o', s=13)
plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4],
            label='test Type 2', c='tomato', marker='s', s=13)
w = clf.coef_[0]  # parameters of the hyperplane
b = clf.intercept_  # hyperplane intercept
a = -(w[0] / w[1])  # slope of the boundary line (not used below)
xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
yy = a * xx - (b[0] / w[1])  # corresponding y-points (not used below)
# plot the decision boundary (hyperplane)
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')
# Highlight only the closest support vectors
unique_svs = np.unique(clf.support_vectors_, axis=0)
sv_distances = np.abs(clf.decision_function(unique_svs))
sv_ixs_sorted = sv_distances.argsort()
closest_vectors = unique_svs[sv_ixs_sorted[:20]]  # the 20 closest
plt.scatter(
    closest_vectors[:, 0],
    closest_vectors[:, 1],
    s=220,
    # color='black', alpha=0.3,  # support vectors in a single colour
    c=abs(clf.decision_function(closest_vectors)), cmap='bone_r',  # shade by distance
    linewidth=3,
    edgecolors='none',
    marker='o', label='Support Vectors',
    zorder=-1
)
# plot description
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.gcf().legend(loc='lower left', ncols=3, bbox_to_anchor=(0.12, 0.1), fontsize=9)
plt.gcf().set_size_inches(8, 3.5)
# Show the plot
plt.show()
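As for the lag with C=100000: a very large C pushes SVC towards a hard margin, and on overlapping, integer-valued data like this the solver can take a very long time to converge, so there is rarely a reason to go that high. A rough sketch of how you could compare the size of the support set for a few C values (the values here are only examples, reusing X_train and y_train from the script above):

for C in (0.1, 1, 100):
    model = svm.SVC(kernel='linear', C=C).fit(X_train, y_train)
    print(f'C={C}: {len(model.support_vectors_)} support vectors')

The smaller the C, the softer the margin and the more training points end up inside it as support vectors.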