1. Classifying the Iris dataset

1.1 Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline

1.2 Load dataset

Attribute Information:

  1. sepal length in cm
  2. sepal width in cm
  3. petal length in cm
  4. petal width in cm
  5. class: Iris Setosa, Iris Versicolour, Iris Virginica
In [2]:
filename = "iris.csv"
attribute_names = ["sepal-length", "sepal-width", "petal-length", "petal-width", "class"]
dataset = pd.read_csv(filename, header=None, names=attribute_names)
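
If iris.csv is not on disk, the same data ships with scikit-learn. A minimal fallback sketch, assuming scikit-learn >= 0.23 (for as_frame) and noting that the bundled copy spells the labels "setosa" rather than "Iris-setosa":

from sklearn.datasets import load_iris

# Hypothetical fallback: rebuild the same DataFrame from sklearn's bundled copy
iris = load_iris(as_frame=True)
dataset = iris.frame.rename(columns={
    "sepal length (cm)": "sepal-length",
    "sepal width (cm)": "sepal-width",
    "petal length (cm)": "petal-length",
    "petal width (cm)": "petal-width",
    "target": "class",
})
# Map the numeric targets (0/1/2) back to species names
dataset["class"] = dataset["class"].map(dict(enumerate(iris.target_names)))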

1.3 Summarize dataset

In [3]:
# Dimensions
print("Dimensions: ", dataset.shape)
Dimensions:  (150, 5)
In [4]:
# Dataset structure
print(dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal-length    150 non-null float64
sepal-width     150 non-null float64
petal-length    150 non-null float64
petal-width     150 non-null float64
class           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
None
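
info() already reports 150 non-null values per column; an explicit missing-value check, as a sketch (not part of the original run):

# Count missing values per column (all zeros here)
print(dataset.isnull().sum())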
In [5]:
# Peek at the data
print(dataset.head(10))
   sepal-length  sepal-width  petal-length  petal-width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
5           5.4          3.9           1.7          0.4  Iris-setosa
6           4.6          3.4           1.4          0.3  Iris-setosa
7           5.0          3.4           1.5          0.2  Iris-setosa
8           4.4          2.9           1.4          0.2  Iris-setosa
9           4.9          3.1           1.5          0.1  Iris-setosa
In [6]:
# Statistical summary
print(dataset.describe())
       sepal-length  sepal-width  petal-length  petal-width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
In [7]:
# Class distribution
print(dataset.groupby("class").size())
class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

1.4 Data visualization

In [8]:
# Box and Whisker plots
sns.set_style("white")
sns.set_palette("Paired")
dataset.plot(kind="box", subplots=True, layout=(2,2), sharex=False, sharey=False, figsize=(8, 8))
plt.show()
In [9]:
# Histogram plots
sns.set_style("whitegrid")
dataset.plot(kind="hist", subplots=True, layout=(2,2), figsize=(8, 8), sharex=True, sharey=False)
plt.show()
In [10]:
# Multivariate plots
scatter_matrix(dataset, figsize=(12, 12))
plt.show()
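
Since seaborn is already imported, a class-coloured alternative to scatter_matrix is pairplot; a sketch assuming seaborn >= 0.9 (for the height parameter):

# Pairwise scatter plots, coloured by species
sns.pairplot(dataset, hue="class", height=2.5)
plt.show()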

1.5 Evaluate some algorithms

1.5.1 Create validation / test set

In [11]:
# Split the dataset into 80% training and 20% validation/test
X = dataset[dataset.columns[:4]].copy()
y = dataset[dataset.columns[4]].copy()

# 20% validation/test set
test_set_size = 0.2

# random seed
seed = 8

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=seed)
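
With only 30 test rows the class balance of the split can drift by chance; train_test_split accepts a stratify argument to prevent that. A variation on the split above (not used for the results below):

# Stratified variant: keep the 50/50/50 class proportions in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_set_size, random_state=seed, stratify=y)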

1.5.2 Build models

1.5.2.1 Logistic regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Fit the model and calculate accuracy
model = LogisticRegression()
model.fit(X_train, y_train)
score = model.score(X_test, y_test)

print("Logistic Regression accuracy: ", round(score, 3))

# Use cross validation and recalculate accuracy
kfold = KFold(n_splits=10)  # random_state only applies when shuffle=True
cval_score = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")

print ("Logistic Regression cross validation score: ", cval_score.mean(), cval_score.std())
Logistic Regression accuracy:  0.933
Logistic Regression cross validation score:  0.9583333333333333 0.10034662148993581
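
For reference, cross_val_score is roughly equivalent to fitting a fresh model on each fold and scoring it on the held-out part. A hand-rolled sketch of the same loop:

# Manual 10-fold loop, equivalent in spirit to cross_val_score above
scores = []
for train_idx, val_idx in kfold.split(X_train):
    fold_model = LogisticRegression()
    fold_model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    scores.append(fold_model.score(X_train.iloc[val_idx], y_train.iloc[val_idx]))
print("Manual CV score: ", np.mean(scores), np.std(scores))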

1.5.2.2 Generalize to evaluate several models

  • Linear Discriminant Analysis (LDA)
  • K-Nearest Neighbours (KNN)
  • Classification and Regression Trees (CART)
  • Gaussian Naive Bayes (NB)
  • Support Vector Machines (SVM)
In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Create models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(name, "\tmean: ", round(cv_results.mean(),4), "\tstd: ", round(cv_results.std(),4))
LR 	mean:  0.9583 	std:  0.1003
LDA 	mean:  0.9833 	std:  0.05
KNN 	mean:  0.9833 	std:  0.0333
CART 	mean:  0.9917 	std:  0.025
NB 	mean:  0.975 	std:  0.0382
SVM 	mean:  0.975 	std:  0.0382

1.5.3 Comparing models

In [14]:
fig = plt.figure()
fig.suptitle('Algorithm comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

1.6 Make predictions

1.6.1 Make predictions with KNN

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_predictions = knn.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, knn_predictions))
print("\nConfusion matrix:\n", confusion_matrix(knn_predictions, y_test))
print("\nClassification report:\n", classification_report(y_test, knn_predictions))
Accuracy:  0.9

Confusion matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  2  9]]

Classification report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.80      0.89      0.84         9
 Iris-virginica       0.90      0.82      0.86        11

    avg / total       0.90      0.90      0.90        30
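
KNeighborsClassifier defaults to n_neighbors=5; a natural follow-up is a small grid search over k. A sketch (the 1-15 grid is an arbitrary choice):

from sklearn.model_selection import GridSearchCV

# Search k = 1..15 with 10-fold CV on the training set
param_grid = {"n_neighbors": list(range(1, 16))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring="accuracy")
grid.fit(X_train, y_train)
print("Best k: ", grid.best_params_["n_neighbors"])
print("Best CV accuracy: ", round(grid.best_score_, 3))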

1.6.2 Make predictions with CART

In [16]:
cart = DecisionTreeClassifier()
cart.fit(X_train, y_train)
cart_predictions = cart.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, cart_predictions))
print("\nConfusion matrix:\n", confusion_matrix(cart_predictions, y_test))
print("\nClassification report:\n", classification_report(y_test, cart_predictions))
Accuracy:  0.9

Confusion matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  2  9]]

Classification report:
                  precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.80      0.89      0.84         9
 Iris-virginica       0.90      0.82      0.86        11

    avg / total       0.90      0.90      0.90        30
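
One advantage of CART is that the fitted tree can be inspected directly. A sketch using sklearn's plot_tree, available from scikit-learn 0.21:

from sklearn.tree import plot_tree

# Visualize the fitted tree; class_names must be in sorted label order
plt.figure(figsize=(12, 8))
plot_tree(cart, feature_names=attribute_names[:4],
          class_names=sorted(y.unique()), filled=True)
plt.show()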