This notebook is derived from (https://zoo.cs.yale.edu/classes/cs470/lectures/machinelearning.html)
Here's how it works. In a classification task, you are given the following.
The training data are sets of points in the 2-d space, indicating the bumpiness and grade of the terrain. Associated with each point is a label indicating if the car should drive slow (red) or fast (blue).
Generated with code: prep_terrain_data.py
import random
def makeTerrainData(n_points=1000):
###############################################################################
### make the toy dataset
random.seed(42)
grade = [random.random() for ii in range(0,n_points)]
bumpy = [random.random() for ii in range(0,n_points)]
error = [random.random() for ii in range(0,n_points)]
y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]
for ii in range(0, len(y)):
if grade[ii]>0.8 or bumpy[ii]>0.8:
y[ii] = 1.0
### split into train/test sets
X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
split = int(0.75*n_points)
X_train = X[0:split]
X_test = X[split:]
y_train = y[0:split]
y_test = y[split:]
grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0]
bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0]
grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1]
bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1]
training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
, "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
test_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
, "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
return X_train, y_train, X_test, y_test
features_train, labels_train, features_test, labels_test = makeTerrainData()
We will visualize the training data.
import matplotlib.pyplot as plt
%matplotlib inline
XX = [n[0] for n in features_train]
YY = [n[1] for n in features_train]
try:
#plt.plot(X, color="blue")
plt.scatter(XX, YY)
plt.ylabel('grade')
plt.xlabel('bumpiness')
except NameError:
pass
# plt.show()
colors = ["b", "r"]
for ii, pp in enumerate(XX):
plt.scatter(XX[ii],YY[ii], color = colors[int(labels_train[ii])])
plt.show()
For hw6, you might find it useful to visualize your data. Here we are using 2D. In the iris notebook, we showed 3D.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
We create an object clf which is an instance of the Naive Bayes classifier. Each sklearn classifier has a fit() method which has parameters for the training features and labels.
clf.fit(features_train, labels_train)
The classifier creates a decision boundary which segregates the data into the different classes or categories. In the image above the area with the blue background indicates the conditions the program has learned under which the car can drive fast and the red background indicates the conditions where the car should slow down.
clf.predict(features_test)
clf.score(features_test, labels_test)
You can also create a separate set of predictions for the test data and independently assess the accuracy.
pred = clf.predict(features_test)
from sklearn.metrics import accuracy_score
accuracy_score(labels_test, pred)
We added some timing code to see how fast the classification and training goes.
from time import time
print ("\nNaive bayes classifier: \n")
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
t0 = time()
clf.fit(features_train, labels_train)
print ("training time:", round(time()-t0, 3), "s")
t1 = time()
print (clf.score(features_test, labels_test))
print ("scoring time:", round(time()-t1, 3), "s")
The accuracy is 88.4%. You probably want your self-driving car to do better than that.
'''Classifier comparison
Classifier comparison
A comparison of a several classifiers in scikit-learn on synthetic
datasets. The point of this example is to illustrate the nature of
decision boundaries of different classifiers. This should be taken
with a grain of salt, as the intuition conveyed by these examples does
not necessarily carry over to real datasets. Particularly in
high-dimensional spaces, data can more easily be separated linearly
and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other
classifiers. The plots show training points in solid colors and
testing points semi-transparent. The lower right shows the
classification accuracy on the test set.
../../_images/sphx_glr_plot_classifier_comparison_001.png
'''
print(__doc__)
%matplotlib inline
# Code source: Gael Varoquaux
# Andreas Muller
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
h = .02 # step size in the mesh
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
"Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
"Naive Bayes", "QDA"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1),
AdaBoostClassifier(),
GaussianNB(),
QuadraticDiscriminantAnalysis()]
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = [make_moons(noise=0.3, random_state=0),
make_circles(noise=0.2, factor=0.5, random_state=1),
linearly_separable
]
figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
# preprocess dataset, split into training and test part
X, y = ds
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=.4, random_state=42)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# just plot the dataset first
cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
if ds_cnt == 0:
ax.set_title("Input data")
# Plot the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
i += 1
# iterate over classifiers
for name, clf in zip(names, classifiers):
ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
if hasattr(clf, "decision_function"):
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
else:
Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
# Put the result into a color plot
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
# Plot also the training points
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
# and testing points
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
alpha=0.6)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())
if ds_cnt == 0:
ax.set_title(name)
ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
size=15, horizontalalignment='right')
i += 1
plt.tight_layout()
plt.show()