support vector machine

Posted by neverset on December 29, 2020

Support vector machines (SVMs) are a supervised machine learning method. Application areas include:

  • classification
  • regression

Their biggest advantage is that they can define either a linear or a non-linear decision boundary by using kernel functions. Support vector machines also draw a margin around the decision boundary, which increases prediction confidence. They allow some misclassification during the learning process (a soft margin), so they can do a better job of classifying most vectors in the testing set.
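
A minimal sketch of how these two knobs appear in scikit-learn's SVC (parameter values here are arbitrary, chosen only for illustration): the kernel picks the shape of the boundary, and C controls the soft margin, i.e. how heavily misclassified training points are penalized.

    from sklearn import svm

    # a small C tolerates more misclassification (wider, softer margin);
    # a large C penalizes it heavily (tighter fit to the training data)
    linear_model = svm.SVC(kernel='linear', C=0.1)  # linear decision boundary
    rbf_model = svm.SVC(kernel='rbf', C=10.0)       # non-linear (RBF) boundary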

usage

    import random
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    def generate_random_dataset(size):
        """Generate a random dataset that follows a quadratic distribution."""
        x = []
        y = []
        target = []
        for i in range(size):
            # class zero
            x.append(np.round(random.uniform(0, 2.5), 1))
            y.append(np.round(random.uniform(0, 20), 1))
            target.append(0)
            # class one
            x.append(np.round(random.uniform(1, 5), 2))
            y.append(np.round(random.uniform(20, 25), 2))
            target.append(1)
            x.append(np.round(random.uniform(3, 5), 2))
            y.append(np.round(random.uniform(5, 25), 2))
            target.append(1)
        # assemble x, y, and target into a single labeled DataFrame
        df_x = pd.DataFrame(data=x)
        df_y = pd.DataFrame(data=y)
        df_target = pd.DataFrame(data=target)
        data_frame = pd.concat([df_x, df_y], ignore_index=True, axis=1)
        data_frame = pd.concat([data_frame, df_target], ignore_index=True, axis=1)
        data_frame.columns = ['x', 'y', 'target']
        return data_frame

    # Generate dataset
    size = 100
    dataset = generate_random_dataset(size)
    features = dataset[['x', 'y']]
    label = dataset['target']
    # Hold out 20% of the dataset for testing
    test_size = int(np.round(size * 0.2, 0))
    # Split dataset into training and testing sets
    x_train = features[:-test_size].values
    y_train = label[:-test_size].values
    x_test = features[-test_size:].values
    y_test = label[-test_size:].values
    # Plotting the training set
    fig, ax = plt.subplots(figsize=(12, 7))
    # removing the top, left, and right borders
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # adding major gridlines
    ax.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)
    ax.scatter(features[:-test_size]['x'], features[:-test_size]['y'], color="#8C7298")
    plt.show()  
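
The slice-based hold-out above keeps the rows in generation order. As an alternative sketch (not what the original code does), scikit-learn's train_test_split shuffles before splitting, which is safer when the data is not already in random order:

    from sklearn.model_selection import train_test_split

    # shuffle, then hold out 20% for testing; random_state fixes the shuffle
    x_train, x_test, y_train, y_test = train_test_split(
        features.values, label.values, test_size=0.2, random_state=42)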

    #train a 2nd-degree polynomial-kernel SVM
    from sklearn import svm
    model = svm.SVC(kernel='poly', degree=2)
    model.fit(x_train, y_train)    
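
The 2nd-degree polynomial kernel is a natural fit for the quadratic shape of the generated data. As an optional sketch, other kernels can be compared on the same split to confirm that choice:

    from sklearn.metrics import accuracy_score

    # train one SVC per kernel and compare accuracy on the held-out set
    for kernel in ('linear', 'poly', 'rbf'):
        candidate = svm.SVC(kernel=kernel, degree=2)  # degree only affects 'poly'
        candidate.fit(x_train, y_train)
        score = accuracy_score(y_test, candidate.predict(x_test))
        print(kernel, score)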

    #plot decision boundary
    fig, ax = plt.subplots(figsize=(12, 7))
    # Removing the top, left, and right borders
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Create grid to evaluate model
    xx = np.linspace(-1, max(features['x']) + 1, len(x_train))
    yy = np.linspace(0, max(features['y']) + 1, len(y_train))
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    train_size = len(features[:-test_size]['x'])
    # Assigning different colors to the classes
    colors = y_train
    colors = np.where(colors == 1, '#8C7298', '#4786D1')
    # Plot the dataset
    ax.scatter(features[:-test_size]['x'], features[:-test_size]['y'], c=colors)
    # Get the separating hyperplane
    Z = model.decision_function(xy).reshape(XX.shape)
    # Draw the decision boundary and margins
    ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
    # Highlight support vectors with a circle around them
    ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100, linewidth=1, facecolors='none', edgecolors='k')
    plt.show()  
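
The solid contour at level 0 is the decision boundary, and the dashed contours at -1 and +1 are the margins; decision_function returns a signed distance to the boundary, and its sign agrees with the class that predict returns. A small sketch classifying a new point (coordinates chosen arbitrarily):

    # classify a single new point
    new_point = np.array([[4.0, 15.0]])        # arbitrary (x, y) pair
    print(model.predict(new_point))            # predicted class label
    print(model.decision_function(new_point))  # signed distance to the boundary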

    #run test
    from sklearn.metrics import accuracy_score
    predictions_poly = model.predict(x_test)
    accuracy_poly = accuracy_score(y_test, predictions_poly)
    print("2nd degree polynomial Kernel\nAccuracy (normalized): " + str(accuracy_poly))