Predict Unknown

 
# PREDICT UNKNOWN - KNN
# ---------------------
# Concept:
#   - We provide training dataset points (features) and labels (targets).
#   - We train the model (with a k=3 nearest-neighbors constraint).
#   - We are able to predict the label (y) for a new (unknown) data point.
# ------------------------------------------------

from sklearn.neighbors import KNeighborsClassifier

# Training dataset
# ----------------
# Four 2-D sample points and one class label per point (same order).
X = [[0, 0], [1, 1], [2, 2], [3, 3]]
y = [0, 1, 0, 1]

# Train the model
# ---------------
# fit() returns the fitted estimator, so construction and training can be
# chained into a single expression.
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)

# Make predictions for unknown
# ----------------------------
# predict() expects a list of samples, hence the extra pair of brackets
# around the single query point.
x_unknown = [1, 2]
y_pred = knn.predict([x_unknown])

# NOTE(review): assuming the default Euclidean metric, [1, 2] is at distance
# 1 from [1, 1] and [2, 2], and at the same distance (sqrt(5)) from both
# [0, 0] (label 0) and [3, 3] (label 1). The third neighbour is therefore a
# tie, and the recorded result below presumably depends on how the library
# breaks that tie (training-data order) -- confirm before relying on it.

# Output results
# --------------
print(x_unknown)  # [1, 2]
print(y_pred)     # [0]

Predict from dataset

 
# PREDICT from DATASET - KNN
# --------------------------
# We use the pandas library to transform a dictionary dataset into a DataFrame.
# Fruits dataset contains heights, widths and labels (fruit name).
# ------------------------------------------------

from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from icecream import ic

# Training dataset (dictionary)
# ----------------------------
# Three parallel lists of 20 entries: position i in each list describes the
# same fruit sample (five samples per row below).
data = {
    'height': [
        3.91, 7.09, 10.48, 9.21, 7.95,
        7.62, 7.95, 4.69, 7.50, 7.11,
        4.15, 7.29, 8.49, 7.44, 7.86,
        3.93, 4.40, 5.5, 8.10, 8.69,
    ],
    'width': [
        5.76, 7.69, 7.32, 7.20, 5.90,
        7.51, 5.32, 6.19, 5.99, 7.02,
        5.60, 8.38, 6.52, 7.89, 7.60,
        6.12, 5.90, 4.5, 6.15, 5.82,
    ],
    'fruit': [
        'Mandarin', 'Apple', 'Lemon', 'Lemon', 'Lemon',
        'Apple', 'Mandarin', 'Mandarin', 'Lemon', 'Apple',
        'Mandarin', 'Apple', 'Lemon', 'Apple', 'Apple',
        'Apple', 'Mandarin', 'Lemon', 'Lemon', 'Lemon',
    ],
}

# Transform dataset
# -----------------
# Build the DataFrame and sort it by label, then width, then height, so the
# printed table groups the samples of each fruit together.
df = pd.DataFrame(data).sort_values(by=['fruit', 'width', 'height'])
ic(df)

# Features are the two numeric columns; the target is the fruit name.
X = df[['height', 'width']].values
y = df['fruit'].values
ic(X, y)

# Train the model
# ---------------
# fit() returns the fitted estimator, so it can be chained to the constructor.
knn = KNeighborsClassifier(n_neighbors=3).fit(X, y)

# Make predictions
# ----------------
# One single [height, width] sample and a batch of five samples.
unknown_item = [9, 3]
unknown_items = [[9, 3], [4, 5], [2, 5], [8, 9], [5, 7]]

# predict() always takes a list of samples, so the single item is wrapped.
prediction = knn.predict([unknown_item])
predictions = knn.predict(unknown_items)


# Output results
# --------------
ic(unknown_item, prediction)
ic(unknown_items, predictions)

"""
ic| df:     height  width     fruit
        15    3.93   6.12     Apple
        9     7.11   7.02     Apple
        5     7.62   7.51     Apple
        14    7.86   7.60     Apple
        1     7.09   7.69     Apple
        13    7.44   7.89     Apple
        11    7.29   8.38     Apple
        17    5.50   4.50     Lemon
        19    8.69   5.82     Lemon
        4     7.95   5.90     Lemon
        8     7.50   5.99     Lemon
        18    8.10   6.15     Lemon
        12    8.49   6.52     Lemon
        3     9.21   7.20     Lemon
        2    10.48   7.32     Lemon
        6     7.95   5.32  Mandarin
        10    4.15   5.60  Mandarin
        0     3.91   5.76  Mandarin
        16    4.40   5.90  Mandarin
        7     4.69   6.19  Mandarin
ic| X: array([[ 3.93,  6.12],
              [ 7.11,  7.02],
              [ 7.62,  7.51],
              [ 7.86,  7.6 ],
              [ 7.09,  7.69],
              [ 7.44,  7.89],
              [ 7.29,  8.38],
              [ 5.5 ,  4.5 ],
              [ 8.69,  5.82],
              [ 7.95,  5.9 ],
              [ 7.5 ,  5.99],
              [ 8.1 ,  6.15],
              [ 8.49,  6.52],
              [ 9.21,  7.2 ],
              [10.48,  7.32],
              [ 7.95,  5.32],
              [ 4.15,  5.6 ],
              [ 3.91,  5.76],
              [ 4.4 ,  5.9 ],
              [ 4.69,  6.19]])
    y: array(['Apple', 'Apple', 'Apple', 'Apple', 'Apple', 'Apple', 'Apple',
              'Lemon', 'Lemon', 'Lemon', 'Lemon', 'Lemon', 'Lemon', 'Lemon',
              'Lemon', 'Mandarin', 'Mandarin', 'Mandarin', 'Mandarin',
              'Mandarin'], dtype=object)
ic| unknown_item: [9, 3], prediction: array(['Lemon'], dtype=object)
ic| unknown_items: [[9, 3], [4, 5], [2, 5], [8, 9], [5, 7]]
    predictions: array(['Lemon', 'Mandarin', 'Mandarin', 'Apple', 'Mandarin'], dtype=object)
"""

Model Evaluation

 
# MODEL EVALUATION - KNN
# ----------------------
# The dataset is split into two datasets (training and test) that
# will be used to evaluate the model.
# The score (accuracy) is the fraction of labels that were predicted correctly.
# A score of 1.0 means that the model correctly predicted all labels (100%).
# ------------------------------------------------

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Training and TEST datasets
# --------------------------
# Training dataset: 20 fruit samples (five samples per row below).
D1 = pd.DataFrame({
    'height': [
        3.91, 7.09, 10.48, 9.21, 7.95,
        7.62, 7.95, 4.69, 7.50, 7.11,
        4.15, 7.29, 8.49, 7.44, 7.86,
        3.93, 4.40, 5.5, 8.10, 8.69,
    ],
    'width': [
        5.76, 7.69, 7.32, 7.20, 5.90,
        7.51, 5.32, 6.19, 5.99, 7.02,
        5.60, 8.38, 6.52, 7.89, 7.60,
        6.12, 5.90, 4.5, 6.15, 5.82,
    ],
    'fruit': [
        'Mandarin', 'Apple', 'Lemon', 'Lemon', 'Lemon',
        'Apple', 'Mandarin', 'Mandarin', 'Lemon', 'Apple',
        'Mandarin', 'Apple', 'Lemon', 'Apple', 'Apple',
        'Apple', 'Mandarin', 'Lemon', 'Lemon', 'Lemon',
    ],
})

# Test dataset: 5 labelled samples that are not part of the training data.
D2 = pd.DataFrame({
    'height': [4, 4.47, 6.49, 7.51, 8.34],
    'width': [6.5, 7.13, 7, 5.01, 4.23],
    'fruit': ['Mandarin', 'Mandarin', 'Apple', 'Lemon', 'Lemon'],
})


# Transform datasets
# ------------------
# Split each DataFrame into a feature matrix (X) and a label vector (y).
X1, y1 = D1[['height', 'width']].values, D1['fruit'].values
X2, y2 = D2[['height', 'width']].values, D2['fruit'].values

# Train the model
# ---------------
# Only the training dataset is used to fit the classifier.
knn = KNeighborsClassifier(n_neighbors=3).fit(X1, y1)


# Evaluate the model
# ------------------
# Predict the labels of both datasets, then compare them with the actual
# labels; accuracy_score is called with the true labels first.
predictions1 = knn.predict(X1)
predictions2 = knn.predict(X2)

score1 = metrics.accuracy_score(y1, predictions1)
score2 = metrics.accuracy_score(y2, predictions2)

# Scores are multiplied by 100 to print them as percentages.
print("Model score on training dataset:", score1 * 100)
print("Model score on test dataset:", score2 * 100)

# -----------------------------------
# Model score on training dataset: 85.0
# Model score on test dataset: 100.0

Score Graph

 
# KNN - SCORE GRAPH
# -----------------
# By plotting the results graph we can see that the model
# performs optimally for k between 3 and 8 (for test set).
# ------------------------------------------------

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

# Training and TEST datasets
# --------------------------
# Training dataset: 20 fruit samples (five samples per row below).
D1 = pd.DataFrame({
    'height': [
        3.91, 7.09, 10.48, 9.21, 7.95,
        7.62, 7.95, 4.69, 7.50, 7.11,
        4.15, 7.29, 8.49, 7.44, 7.86,
        3.93, 4.40, 5.5, 8.10, 8.69,
    ],
    'width': [
        5.76, 7.69, 7.32, 7.20, 5.90,
        7.51, 5.32, 6.19, 5.99, 7.02,
        5.60, 8.38, 6.52, 7.89, 7.60,
        6.12, 5.90, 4.5, 6.15, 5.82,
    ],
    'fruit': [
        'Mandarin', 'Apple', 'Lemon', 'Lemon', 'Lemon',
        'Apple', 'Mandarin', 'Mandarin', 'Lemon', 'Apple',
        'Mandarin', 'Apple', 'Lemon', 'Apple', 'Apple',
        'Apple', 'Mandarin', 'Lemon', 'Lemon', 'Lemon',
    ],
})

# Test dataset: 5 labelled samples that are not part of the training data.
D2 = pd.DataFrame({
    'height': [4, 4.47, 6.49, 7.51, 8.34],
    'width': [6.5, 7.13, 7, 5.01, 4.23],
    'fruit': ['Mandarin', 'Mandarin', 'Apple', 'Lemon', 'Lemon'],
})


# Transform datasets
# ------------------
# Split each DataFrame into a feature matrix (X) and a label vector (y).
X1 = D1[['height', 'width']].values
y1 = D1['fruit'].values

X2 = D2[['height', 'width']].values
y2 = D2['fruit'].values


# Evaluate the score for different params
# ---------------------------------------
k = []       # tested n_neighbors values
score1 = []  # accuracy (%) on the training dataset, one entry per k
score2 = []  # accuracy (%) on the test dataset, one entry per k

# Try every k from 1 up to the number of training samples.
for _k in range(1, len(X1) + 1):
    knn = KNeighborsClassifier(n_neighbors=_k)
    knn.fit(X1, y1)

    _score1 = metrics.accuracy_score(y1, knn.predict(X1))
    _score2 = metrics.accuracy_score(y2, knn.predict(X2))  # test dataset

    k.append(_k)
    score1.append(_score1 * 100)
    score2.append(_score2 * 100)

    # Output accuracy for each _k (n_neighbors)
    print(f"k={_k} | score1: {score1[-1]} | score2: {score2[-1]}")

# Recorded output of the loop above:
#   k=1 | score1: 100.0 | score2: 40.0
#   k=2 | score1: 95.0 | score2: 60.0
#   k=3 | score1: 85.0 | score2: 100.0
#   k=4 | score1: 85.0 | score2: 100.0
#   k=5 | score1: 85.0 | score2: 100.0
#   k=6 | score1: 85.0 | score2: 100.0
#   k=7 | score1: 85.0 | score2: 100.0
#   k=8 | score1: 85.0 | score2: 100.0
#   k=9 | score1: 85.0 | score2: 80.0
#   k=10 | score1: 85.0 | score2: 60.0
#   k=11 | score1: 80.0 | score2: 60.0
#   k=12 | score1: 90.0 | score2: 60.0
#   k=13 | score1: 65.0 | score2: 60.0
#   k=14 | score1: 55.00000000000001 | score2: 60.0
#   k=15 | score1: 55.00000000000001 | score2: 60.0
#   k=16 | score1: 45.0 | score2: 60.0
#   k=17 | score1: 50.0 | score2: 60.0
#   k=18 | score1: 50.0 | score2: 60.0
#   k=19 | score1: 40.0 | score2: 40.0
#   k=20 | score1: 40.0 | score2: 40.0


# Plot results
# ------------

# Plot train score
plt.scatter(k, score1)  # data points
plt.plot(k, score1, '-', label='train')  # connecting line

# Plot test score
plt.scatter(k, score2)  # data points
plt.plot(k, score2, '-', label='test')  # connecting line

# Plot configurations
# The x-axis is deliberately reversed (largest k on the left). The lower
# limit is min(k) so that the k=1 result stays inside the plotted range.
plt.axis([max(k), min(k), 0, 100])
plt.xlabel('number of nearest neighbours (k)', size=13)
plt.ylabel('accuracy score', size=13)
plt.title('Model Performance vs Complexity', size=20)
plt.legend()

# Output
plt.show()

Decision Boundaries

Decision boundaries of KNN on a graph (optimal fit for k=5)

Requirements

 
Create and activate a venv.

    cd .\developments\python\mlearning\
    python -m venv .\venv
    .\venv\Scripts\activate
    (venv) PS

Requirements file for packages.

    numpy>=1.24,<3.0
    pandas>=2.0,<3.0
    matplotlib>=3.7,<4.0
    scikit-learn>=1.3,<2.0
    icecream>=2.1,<3.0

Upgrade pip/setuptools/wheel (helps with binary wheels).

    python -m pip install --upgrade pip setuptools wheel

Install your requirements.

    pip install -r requirements.txt




References: