Euclidean Distance
The algorithm works by finding the k nearest neighbors to the input data point.
The predicted label is assigned based on the majority vote of the k nearest neighbors.
The nearest neighbors are found using the Euclidean distance between two points.
$$ d(p, q) = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2} $$
import matplotlib.pyplot as plt
from icecream import ic
import math
# Training set: three clusters of five 2-D points each, labelled 1-3.
cluster_1 = [[2, 2], [2, 2.5], [2.5, 2.5], [2.5, 2], [2.25, 2.25]]
cluster_2 = [[3, 3], [3, 3.5], [3.5, 3.5], [3.5, 3], [3.25, 3.25]]
cluster_3 = [[4, 4], [4, 4.5], [4.5, 4.5], [4.5, 4], [4.25, 4.25]]
X = cluster_1 + cluster_2 + cluster_3

# One class label per training point, in the same order as X.
y = [1] * 5 + [2] * 5 + [3] * 5

# Query point to classify, and the number of neighbours to consult.
x_unknown = [3.6, 1.8]
k = 3
def euclidean_distance(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2).

    Args:
        x1, y1: coordinates of the first point.
        x2, y2: coordinates of the second point.

    Returns:
        float: the straight-line distance between the two points.
    """
    # math.hypot computes sqrt(dx**2 + dy**2) in one call, is robust
    # against intermediate overflow, and avoids shadowing the builtin
    # `sum` as the previous implementation did.
    return math.hypot(x2 - x1, y2 - y1)
# Distance from the query point to every training sample.
# The original passed the coordinates in swapped order (x[1], x[0], ...);
# Euclidean distance is symmetric in its coordinates so the result was the
# same, but the natural order is clearer. The append loop is replaced by a
# comprehension.
D = [euclidean_distance(x[0], x[1], x_unknown[0], x_unknown[1]) for x in X]
ic(D)

# The k smallest distances.
D_nearest = sorted(D)[:k]
ic(D_nearest)
# Indices of the k nearest training samples, nearest first.
# Sorting the *indices* by their distance is robust to tied distances:
# the previous implementation matched distance values back to indices and
# could emit the same index more than once when a distance appeared
# multiple times among the k smallest, silently dropping a real neighbour.
knn_keys = sorted(range(len(D)), key=D.__getitem__)[:k]
ic(knn_keys)

# Class labels of the k nearest neighbours.
knn_targets = [y[i] for i in knn_keys]
ic(knn_targets)
# Tally how often each class label appears among the k nearest neighbours.
target_counts = {}
for label in knn_targets:
    target_counts[label] = target_counts.get(label, 0) + 1

# The prediction is the most frequent label (first-seen label wins a tie,
# matching dict insertion order).
knn_class = max(target_counts, key=target_counts.get)
print("Class prediction for", x_unknown, "=", knn_class)
# --- Visualise the 2-D result ---
fig, ax = plt.subplots()
ax.set_xlabel('x1')
ax.set_ylabel('x2')
# Short alias for the query point, used repeatedly below.
z = x_unknown
# Split the training points into coordinate lists for scatter().
x_values = [point[0] for point in X]
y_values = [point[1] for point in X]
# Training points, coloured by class label.
plt.scatter(x_values, y_values, c=y)
# Query point drawn as a red 'x', labelled with the predicted class.
plt.scatter(z[0], z[1], marker='x', color='r', label='Class =%s' %knn_class)
# Dashed grey line from the query point to each of its k nearest neighbours.
for i in knn_keys:
    plt.plot((z[0], X[i][0]), (z[1], X[i][1]), color='gray', linestyle='--')
plt.title('K-nearest Neigbors')
plt.xlim(0, 6)
plt.ylim(0, 6)
plt.legend()
plt.show()
Three dimensions
Extending the Euclidean distance formula from two dimensions to three dimensions involves
adding another term for the third dimension.
The Euclidean distance between two points in three dimensions is computed as:
$$ d(p, q) = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2 + (z_2 - z_1)^2} $$
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# Training set: three clusters of five 3-D points each, labelled 1-3.
cluster_a = [[2, 2, 2], [2, 2.5, 2.5], [2.5, 2.5, 2], [2.5, 2, 2.5], [2.25, 2.25, 2.25]]
cluster_b = [[3, 3, 3], [3, 3.5, 3.5], [3.5, 3.5, 3], [3.5, 3, 3.5], [3.25, 3.25, 3.25]]
cluster_c = [[4, 4, 4], [4, 4.5, 4.5], [4.5, 4.5, 4], [4.5, 4, 4.5], [4.25, 4.25, 4.25]]
X = np.array(cluster_a + cluster_b + cluster_c)

# One class label per training point, in the same order as X.
y = [1] * 5 + [2] * 5 + [3] * 5

# Query point to classify, and the number of neighbours to consult.
x_unknown = np.array([3.6, 1.8, 3.6])
k = 3
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Works for vectors of any dimension, and accepts plain sequences as
    well as NumPy arrays.

    Args:
        point1, point2: array-likes of equal shape.

    Returns:
        The Euclidean distance as a NumPy float.
    """
    # np.linalg.norm of the difference is the canonical NumPy spelling
    # of sqrt(sum((p1 - p2) ** 2)).
    return np.linalg.norm(np.asarray(point1) - np.asarray(point2))
# Distance from the query point to every training sample.
D = []
for point in X:
    D.append(euclidean_distance(point, x_unknown))

# Indices of the k closest samples (argsort gives ascending distance order).
D_nearest = np.argsort(D)[:k]

# Labels of those neighbours, then a frequency count per label.
knn_targets = [y[i] for i in D_nearest]
target_counts = {}
for label in np.unique(knn_targets):
    target_counts[label] = knn_targets.count(label)

# Majority vote among the k nearest neighbours.
knn_class = max(target_counts, key=target_counts.get)
print("Class prediction for", x_unknown, "=", knn_class)
# --- Visualise the 3-D result ---
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel('X1')
ax.set_ylabel('X2')
ax.set_zlabel('X3')
# Fixed colour per class label.
colors = {1: 'r', 2: 'g', 3: 'b'}
# Training points, coloured by class.
for i, label in enumerate(y):
    ax.scatter(X[i, 0], X[i, 1], X[i, 2], c=colors[label])
# Query point drawn as a black 'x'.
ax.scatter(x_unknown[0], x_unknown[1], x_unknown[2], marker='x', c='black')
# Grey line from the query point to each of its k nearest neighbours.
for i in D_nearest:
    ax.plot([x_unknown[0], X[i, 0]], [x_unknown[1], X[i, 1]], [x_unknown[2], X[i, 2]], color='gray')
plt.show()
KNN Class
The algorithm doesn't work by running separate KNN calculations for each feature pair independently.
Instead, it considers all features simultaneously for each data point.
When you calculate KNN for a new data point with multiple features, the algorithm measures the distance
between this new point and all other points in the dataset using all features.
Let's encapsulate the algorithm in a class that classifies items.
The class implements a k-nearest neighbors algorithm for classification.
The prediction is based on the majority class of its nearest neighbors in the training dataset.
$$ d(\mathbf{p}, \mathbf{q}) = \sqrt{\sum_{i=1}^{n} (q_i - p_i)^2} $$
import numpy as np
import matplotlib.pyplot as plt
# Training set for the classifier: three clusters of five 2-D points,
# labelled 1-3 (the same toy data as the first example).
group_a = [[2, 2], [2, 2.5], [2.5, 2.5], [2.5, 2], [2.25, 2.25]]
group_b = [[3, 3], [3, 3.5], [3.5, 3.5], [3.5, 3], [3.25, 3.25]]
group_c = [[4, 4], [4, 4.5], [4.5, 4.5], [4.5, 4], [4.25, 4.25]]
X = group_a + group_b + group_c
y = [1] * 5 + [2] * 5 + [3] * 5
class KNeighborsClassifier:
    """k-nearest-neighbours classifier using Euclidean distance.

    Mirrors the scikit-learn fit/predict interface: store the training
    data with fit(), then classify a single sample with predict().
    """

    def __init__(self, n_neighbors):
        # Number of neighbours consulted for the majority vote.
        self.k = n_neighbors

    def fit(self, X_train, y_train):
        """Memorise the training data (KNN is a lazy learner; no work here)."""
        self.X = np.array(X_train)
        self.y = np.array(y_train)

    def predict(self, x_unknown):
        """Predict the class label for one sample.

        Args:
            x_unknown: array-like with the same number of features as the
                training samples.

        Returns:
            The majority class label among the k nearest neighbours.
            Ties are broken in favour of the smallest label.
        """
        z = np.array(x_unknown)
        # Euclidean distance from z to every training sample, vectorised
        # over the feature axis.
        SD = np.sqrt(np.sum((self.X - z)**2, axis=1))
        # Indices of the k closest samples.
        keys_knn = np.argsort(SD)[:self.k]
        targets_knn = self.y[keys_knn]
        # np.unique returns sorted labels with their counts; argmax picks
        # the first maximum, i.e. the smallest label on a tie — the same
        # tie-break as np.bincount(...).argmax(), but unlike bincount this
        # also works for labels that are not small non-negative integers.
        labels, counts = np.unique(targets_knn, return_counts=True)
        return labels[counts.argmax()]
# Build, train and query the classifier on the 2-D toy data.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y)

x_unknown = [3.6, 1.8]
knn_class = model.predict(x_unknown)
print("Prediction for", x_unknown, "= class", knn_class)