Basis Expansion
What is basis expansion? How does it improve the model's accuracy? How can we detect overfitting?
Polynomial
We can use polynomial basis expansion to add non-linear features.
""" Basis Expansion / Polynomial Model
Basis expansion is a technique used to transform a dataset
by creating new features from existing ones, in order to improve
the accuracy of the model.
By expanding the set of input variables using basis functions,
the model can capture nonlinear relationships between the inputs
and the target variable, which might not be possible with linear models.
Even though the fifth-degree polynomial model has the lowest
SSR_training, it also has a huge SSR_test, a clear sign of overfitting.
Another sign of overfitting is the size of the coefficients (weights):
the larger the sum of their absolute values, the more the model tends to overfit.
"""
import numpy as np
import matplotlib.pyplot as plt
# Training set: the polynomial is fitted to these points.
#   X1 -> area in m^2
#   y1 -> price in units of 10,000$
X1 = [30, 46, 60, 65, 77, 95]
y1 = [31, 30, 80, 49, 70, 118]

# Test set: never used for fitting, only for plotting and for the test SSR.
X2 = [17, 40, 55, 57, 70, 85]
y2 = [19, 50, 60, 32, 90, 110]
# N-degree polynomial
def _format_polynomial(p, degree):
    """Return the fitted polynomial as text, highest power first.

    Produces e.g. ``"1.3x + -18.0"`` for degree 1 and
    ``"0.0x^2 + -0.5x + 31.9"`` for degree 2; works for any degree >= 0.
    """
    # Indexing a poly1d with k yields the coefficient of x**k.
    terms = [f"{p[k]:.1f}x^{k}" for k in range(degree, 1, -1)]
    if degree >= 1:
        terms.append(f"{p[1]:.1f}x")
    terms.append(f"{p[0]:.1f}")
    return " + ".join(terms)


def pred_polinomial(degree, x_unknown, X1, y1):
    """Fit a polynomial of the given degree, plot it and print its metrics.

    A least-squares polynomial is fitted to the training points
    ``(X1, y1)``. A new matplotlib figure is created showing the training
    set, the module-level test set ``(X2, y2)``, the fitted curve and the
    prediction at ``x_unknown``. The fitted polynomial, the sum of squared
    residuals on both sets and the sum of absolute coefficients are printed.

    Parameters
    ----------
    degree : int
        Degree of the polynomial to fit.
    x_unknown : number
        Input value for which a prediction is plotted and annotated.
    X1, y1 : sequence of numbers
        Training inputs and targets.

    Returns
    -------
    None
        The figure is created but not shown; the caller invokes
        ``plt.show()``.
    """
    # Basis expansion: least-squares fit of a `degree`-degree polynomial.
    p = np.poly1d(np.polyfit(X1, y1, degree))
    t = np.linspace(0, 100, 100)

    # Plot the train/test data and the fitted curve.
    plt.figure(figsize=(6, 4))
    plt.scatter(X1, y1, color='blue', label='Training set')
    plt.scatter(X2, y2, color='red', label='Test set')
    plt.plot(t, p(t), color='orange')

    # Evaluate the model: sum of squared residuals on train (1) and test (2).
    SSR1 = np.sum((p(X1) - np.asarray(y1)) ** 2).round()
    SSR2 = np.sum((p(X2) - np.asarray(y2)) ** 2).round()

    # Model "weight": sum of absolute coefficients (intercept included).
    weight = round(sum(abs(p.coef)))

    # Mark and annotate the prediction for the unknown input.
    xa = x_unknown
    ya = round(p(xa), 2)
    plt.scatter(xa, ya, color='r', marker='x')
    plt.annotate(
        f'({xa}, {ya}), SSR1 = {SSR1}, SSR2 = {SSR2}',
        (xa + 0.1, ya - 10),
    )

    plt.title(f'{degree}-degree polynomial')
    plt.legend(loc='best')
    plt.xlabel('Area (m^2)')
    plt.ylabel('Price (10,000$)')
    plt.xlim((0, 100))
    plt.ylim((0, 130))

    # Report the fitted function and its metrics; any degree is supported.
    print("p(x) = " + _format_polynomial(p, degree))
    print('SSR1 =', SSR1, ' / ', 'SSR2 =', SSR2)
    print('weight = ', weight, '\n')
    return
# Fit and plot polynomials of degree 1 through 5 on the training set,
# each one predicting the price for the same unknown area.
x_unknown = 50
for poly_degree in range(1, 6):
    pred_polinomial(poly_degree, x_unknown, X1, y1)

# Display every figure created above.
plt.show()
"""
p(x) = 1.3x + -18.0
SSR1 = 1248.0 / SSR2 = 1681.0
weight = 19
p(x) = 0.0x^2 + -0.5x + 31.9
SSR1 = 995.0 / SSR2 = 1530.0
weight = 32
p(x) = 0.0x^3 + -0.0x^2 + 3.0x + -29.5
SSR1 = 967.0 / SSR2 = 1671.0
weight = 33
p(x) = 0.0x^4 + -0.0x^3 + 1.8x^2 + -66.5x + 876.9
SSR1 = 651.0 / SSR2 = 29011.0
weight = 945
p(x) = -0.0x^5 + 0.0x^4 + -1.1x^3 + 66.8x^2 + -1866.2x + 19915.1
SSR1 = 0.0 / SSR2 = 6719065.0
weight = 21849
"""
$$ f(x) = w_0 + w_1 x $$
$$ f(x) = w_0 + w_1 x + w_2 x^2 $$
$$ f(x) = w_0 + w_1 x + w_2 x^2 + w_3 x^3 $$
$$ f(x) = w_0 + w_1 x + w_2 x^2 + w_3 x^3 + w_4 x^4 $$
$$ f(x) = w_0 + w_1 x + w_2 x^2 + w_3 x^3 + w_4 x^4 + w_5 x^5 $$
Last update: 403 days ago