Tags: python, scikit-learn, svm

SVM model highlights the wrong data points as support vectors


I'm working on an SVM model for a homework assignment. No matter what I do, the model picks the wrong data points as support vectors:

[scatter plot: SVM output with the wrongly highlighted support vectors]

This is my data: csv data

This is my code:

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split


def decision_hyperplane(clf, x, y=None, dimension=2):
    """
    Return a decision line (dimension 2, return y based on x) or a
    decision plane (dimension 3, return z based on x and y).

    Decision plane equation is wx + b = 0, so in 2d case:
    w.dot(x) + b = w_x * x + w_y * y + b = 0
    y = (-w_x * x - b) / w_y
    """
    if dimension == 2:
        return (-clf.intercept_[0] - clf.coef_[0][0] * x) / clf.coef_[0][1]
    elif dimension == 3:
        return (-clf.intercept_[0] - clf.coef_[0][0] * x - clf.coef_[0][1] * y) / clf.coef_[0][2]


file_path = 'cell_samples.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)
x_column = 'UnifSize'
y_column = 'UnifShape'
class_name = 'Class'
X = df[[x_column, y_column]]
y = df[class_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)

clf = svm.SVC(kernel='linear', C=100000)
# fit the model
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#scatter data points
plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2], label='Type 1', c='blue', marker='o')
plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4], label='Type 2', c='red', marker='s')

w = clf.coef_[0]  # weight vector of the hyperplane
b = clf.intercept_  # hyperplane intercept
a = -(w[0] / w[1])  # slope of the 2D boundary: y = -(w_x/w_y) x - b/w_y
xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
yy = a * xx - (b / w[1])  # corresponding y-points (unused; decision_hyperplane is used below)

# plot the decision boundary (hyperplane)
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')

# Highlight only the closest support vectors
vectors = clf.support_vectors_[np.abs(clf.decision_function(clf.support_vectors_)).argsort()]
# get unique vectors (no duplicates)
vectors = np.unique(vectors, axis=0)

plt.scatter(vectors[:, 0],
            vectors[:, 1],
            s=100, facecolors='none',
            linewidth=1,
            edgecolors='k', alpha=.5,
            marker='o', label='Support Vectors'
            )

# plot description
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.legend(loc='upper right')

# Show the plot
plt.show()

From the scatter plot you can easily see that only 4 points end up highlighted as support vectors.

I tried to use a C value of 100000, but then the program lags and takes too long to finish.


Solution

  • The support vectors are learned from the training data, but the code was only plotting the test points. I also corrected the part that pulls out the closest unique support vectors.

    You can run the SVC with its default settings (C=1) and get the results below, which include both the train and test data:

    [resulting plot: train and test points, decision boundary, support vectors shaded by distance]
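
    For reference, the reason the original highlighting misbehaved: np.unique sorts its output, so applying it after argsort throws away the closest-first ordering. Below is a minimal sketch of the effect, using made-up points and a stand-in for clf.decision_function (fake_decision_function is illustrative only, not part of scikit-learn):

    import numpy as np

    pts = np.array([[5., 1.], [1., 1.], [3., 1.], [1., 1.]])

    def fake_decision_function(p):  # stand-in for clf.decision_function
        return p[:, 0] - 4.0        # signed distance from the line x = 4

    # original approach: sort by distance, then deduplicate -- np.unique re-sorts
    ordered = pts[np.abs(fake_decision_function(pts)).argsort()]
    print(np.unique(ordered, axis=0))  # lexicographic order; distance order is lost

    # fixed approach: deduplicate first, then sort the unique points by distance
    # (this is what the full code below does)
    uniq = np.unique(pts, axis=0)
    print(uniq[np.abs(fake_decision_function(uniq)).argsort()])  # closest first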

    import pandas as pd
    import numpy as np
    from matplotlib import pyplot as plt
    from sklearn import svm
    from sklearn.model_selection import train_test_split
    
    def decision_hyperplane(clf, x, y=None, dimension=2):
        """
        Return a decision line (dimension 2, return y based on x) or a
        decision plane (dimension 3, return z based on x and y).
    
        Decision plane equation is wx + b = 0, so in 2d case:
        w.dot(x) + b = w_x * x + w_y * y + b = 0
        y = (-w_x * x - b) / w_y
        """
        if dimension == 2:
            return (-clf.intercept_[0] - clf.coef_[0][0] * x) / clf.coef_[0][1]
        elif dimension == 3:
            return (-clf.intercept_[0] - clf.coef_[0][0] * x - clf.coef_[0][1] * y) / clf.coef_[0][2]
    
    
    file_path = '../cell_samples.csv'  # Replace with the actual file path
    df = pd.read_csv(file_path)
    x_column = 'UnifSize'
    y_column = 'UnifShape'
    class_name = 'Class'
    
    X = df[[x_column, y_column]]
    y = df[class_name]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)
    
    clf = svm.SVC(kernel='linear', C=1)
    # fit the model
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred = clf.predict(X_test)
    
    #train: scatter data points
    plt.scatter(X_train[x_column][y_pred_train == 2], X_train[y_column][y_pred_train == 2],
                label='train Type 1', c='darkblue', marker='o', s=50)
    plt.scatter(X_train[x_column][y_pred_train == 4], X_train[y_column][y_pred_train == 4],
                label='train Type 2', c='darkred', marker='s', s=50)
    
    #test: scatter data points
    plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2],
                label='test Type 1', c='dodgerblue', marker='o', s=13)
    plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4],
                label='test Type 2', c='tomato', marker='s', s=13)
    
    w = clf.coef_[0]  # weight vector of the hyperplane
    b = clf.intercept_  # hyperplane intercept
    a = -(w[0] / w[1])  # slope of the 2D boundary: y = -(w_x/w_y) x - b/w_y
    xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
    yy = a * xx - (b / w[1])  # corresponding y-points (unused; decision_hyperplane is used below)
    
    # plot the decision boundary (hyperplane)
    hyperplane = decision_hyperplane(clf, xx)
    plt.plot(xx, hyperplane, linewidth=2, color='black')
    
    # Highlight only the closest support vectors
    unique_svs = np.unique(clf.support_vectors_, axis=0)
    sv_distances = np.abs(clf.decision_function(unique_svs))
    sv_ixs_sorted = sv_distances.argsort()
    closest_vectors = unique_svs[sv_ixs_sorted[:20]] #the 20 closest
    
    plt.scatter(
        closest_vectors[:, 0],
        closest_vectors[:, 1],
        s=220,
        # color='black', alpha=0.3, #support vectors single colour
        c=abs(clf.decision_function(closest_vectors)), cmap='bone_r', #shade by distance
        linewidth=3,
        edgecolors='none',
        marker='o', label='Support Vectors',
        zorder=-1
    )
    
    # plot description
    plt.title('Scatter plot with SVM Decision Boundary')
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.gcf().legend(loc='lower left', ncols=3, bbox_to_anchor=(0.12, 0.1), fontsize=9)
    plt.gcf().set_size_inches(8, 3.5)
    
    # Show the plot
    plt.show()
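
    As a side note on the C=100000 lag: a very large C pushes the SVC toward a hard margin, and on overlapping data like these integer-valued features the solver needs far more iterations to converge; it also tends to keep fewer support vectors, which may be part of why only a handful were highlighted. A rough sketch to see both effects, assuming the same X_train/y_train as above (exact counts and timings depend on the data):

    import time

    for C in (1, 100, 100000):
        clf_c = svm.SVC(kernel='linear', C=C)
        t0 = time.perf_counter()
        clf_c.fit(X_train, y_train)
        print(f"C={C}: {clf_c.n_support_.sum()} support vectors, "
              f"fit took {time.perf_counter() - t0:.2f} s")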