Linear Regression / One Variable
\(
h(x) = ax + b
\)
\(
h_\theta(x) = \theta_0 + \theta_1 x_1
\)
# LINEAR REGRESSION - ONE VARIABLE
# --------------------------------
# Concept:
# - From a training dataset (one variable) we find the line that fits the
#   data best and use that line to make predictions.
# - It is one of the most popular tools in statistics.
#
# h(x) = ax + b
# --------------
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Training Dataset
# ----------------
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1) # feature matrix, shape (6, 1)
Y = np.array([31, 30, 80, 49, 70, 118])             # targets, shape (6,)
# Learn a prediction function
# ---------------------------
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)
# Predict unknown
# ---------------
x1 = 80
y1 = a*x1 + b
# Output result and Draw graphics
# -------------------------------
print(f"h(x) = {a}x + {b}") # h(x) = 1.3x + -18.0
print(f"h({x1}) = {y1}") # h(80) = 86.0
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'x', color='g', label='training data') # Dataset points
ax.plot(x1, y1, 'o', color='r', label=f'h({x1}) = {y1}') # Unknown point
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x') # Function line
plt.legend()
plt.show()
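As a sanity check, the same line can be recovered without scikit-learn: NumPy's degree-1 polynomial fit solves the same least-squares problem. A minimal, self-contained sketch:
import numpy as np

X = np.array([30, 46, 60, 65, 77, 95])
Y = np.array([31, 30, 80, 49, 70, 118])

# A degree-1 polynomial fit is ordinary least squares on a line
a, b = np.polyfit(X, Y, deg=1)
print(f"h(x) = {a.round(1)}x + {b.round(1)}")  # should match sklearn: h(x) = 1.3x + -18.0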
Linear Regression / Two Variables
\(
h(x, y) = ax + by + c
\)
\(
h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2
\)
# LINEAR REGRESSION - TWO VARIABLES
# ---------------------------------
# With multiple regression we can throw in more variables (like the weight of a car).
# Example:
# Predict the CO2 emission of a car based on the
# SIZE (volume) of the engine and the WEIGHT of the car.
#
# h(x, y) = ax + by + c
# ----------------------
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pathlib
DIR = pathlib.Path(__file__).resolve().parent
# Training dataset
# ----------------
with open(DIR / 'data/cars.csv') as file:
    df = pd.read_csv(file)
X = df[['Weight', 'Volume']].values  # feature order: [Weight, Volume]
y = df['CO2'].values
# Learn prediction function
# -------------------------
r = LinearRegression().fit(X, y)
# Predictions
# -----------
X1 = [1252, 1600]  # Honda Civic: Weight 1252, Volume 1600 / actual CO2: 94
y1 = r.predict([X1])  # CO2: 101.6
X2 = [780, 1200]   # Unknown car: Weight 780, Volume 1200
y2 = r.predict([X2])  # CO2: 95.0
# Results & Graphics
# ------------------
print(df, "\n")
print("Honda Civic, 1600, 1252 / CO2:", y1.round(1).item())
print("Unknow car, 1200, 780 / CO2:", y2.round(1).item())
# Draw surface
fig = plt.figure()
Ax, Ay = np.meshgrid(
    np.linspace(df.Weight.min(), df.Weight.max(), 100),
    np.linspace(df.Volume.min(), df.Volume.max(), 100)
)
onlyX = pd.DataFrame({'Weight': Ax.ravel(), 'Volume': Ay.ravel()})
fittedY = r.predict(onlyX.values)  # model was fitted on a plain array, so pass values
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['Weight'], df['Volume'], df['CO2'], c='g', marker='x', alpha=0.5)
ax.plot_surface(Ax, Ay, fittedY.reshape(Ax.shape), color='b', alpha=0.3)
ax.set_xlabel('Weight')
ax.set_ylabel('Volume')
ax.set_zlabel('CO2')
ax.plot(X1[0], X1[1], y1[0], 'o', color='r')
ax.plot(X2[0], X2[1], y2[0], 's', color='g')
plt.show()
"""
Car Model Volume Weight CO2
0 Toyoty Aygo 1000 790 99
1 Mitsubishi Space Star 1200 1160 95
2 Skoda Citigo 1000 929 95
3 Fiat 500 900 865 90
4 Mini Cooper 1500 1140 105
5 VW Up! 1000 929 105
6 Skoda Fabia 1400 1109 90
7 Mercedes A-Class 1500 1365 92
8 Ford Fiesta 1500 1112 98
9 Audi A1 1600 1150 99
10 Hyundai I20 1100 980 99
11 Suzuki Swift 1300 990 101
12 Ford Fiesta 1000 1112 99
13 Honda Civic 1600 1252 94
14 Hundai I30 1600 1326 97
15 Opel Astra 1600 1330 97
16 BMW 1 1600 1365 99
17 Mazda 3 2200 1280 104
18 Skoda Rapid 1600 1119 104
19 Ford Focus 2000 1328 105
20 Ford Mondeo 1600 1584 94
21 Opel Insignia 2000 1428 99
22 Mercedes C-Class 2100 1365 99
23 Skoda Octavia 1600 1415 99
24 Volvo S60 2000 1415 99
25 Mercedes CLA 1500 1465 102
26 Audi A4 2000 1490 104
27 Audi A6 2000 1725 114
28 Volvo V70 1600 1523 109
29 BMW 5 2000 1705 114
30 Mercedes E-Class 2100 1605 115
31 Volvo XC70 2000 1746 117
32 Ford B-Max 1600 1235 104
33 BMW 216 1600 1390 108
34 Opel Zafira 1600 1405 109
35 Mercedes SLK 2500 1395 120
Honda Civic, Weight 1252, Volume 1600 / CO2: 101.6
Unknown car, Weight 780, Volume 1200 / CO2: 95.0
"""
Linear Regression / Multiple Variables
\(
h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n
\)
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
DIR = pathlib.Path(__file__).resolve().parent
with open(DIR / 'data/real_estate.csv') as file:
df = pd.read_csv(file)
# Features
X = df[[
'X1 transaction date',
'X2 house age',
'X3 distance to the nearest MRT station',
'X4 number of convenience stores',
'X5 latitude',
'X6 longitude',
]].values
# Label
y = df['Y house price of unit area'].values
# Train the model
r = LinearRegression().fit(X, y)
# Predictions
X1 = [2013.17, 13, 732.85, 0, 24.98, 121.53] # price: 39 (train data)
X2 = [2013.58, 16.6, 323.69, 6, 24.98, 121.54] # price: 51 (train data)
X3 = [2013.17, 33, 732.85, 0, 24.98, 121.53] # ?
print(df.head())
print('Predict training item1, price =', r.predict([X1]).round(1).item())
print('Predict training item2, price =', r.predict([X2]).round(1).item())
print('Predict unknown item, price =', r.predict([X3]).round(1).item())
"""
No X1 transaction date X2 house age ...
0 1 2012.917 32.0 ...
1 2 2012.917 19.5 ...
2 3 2013.583 13.3 ...
3 4 2013.500 13.3 ...
4 5 2012.833 5.0 ...
[5 rows x 8 columns]
Predict training item1, price = 38.8
Predict training item2, price = 48.5
Predict unknown item, price = 33.4
"""
Residuals
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)
Y = np.array([31, 30, 80, 49, 70, 118])
# Learn a prediction function
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)
b = r.intercept_.round(1)
# Evaluate the model
P = a*X.ravel() + b   # Predictions (on training dataset)
R = Y - P             # Residuals
SSR = (R**2).sum()    # Sum of squared residuals
print(f'Prediction function: f(x) = {a}x + {b}')
print('Residuals:', R)
print(f'SSR = {SSR.round(2).item()}')
# Draw graphics
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)
ax.plot(X, Y, 'x', color='g', label='training data') # Dataset points
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x') # Function line
for i in range(len(X)): # Residuals
ax.plot([X[i], X[i]], [P[i], Y[i]], '-', color='c')
plt.legend()
plt.show()
"""
Prediction function: f(x) = 1.3x + -18.0
Residuals: [10. -11.8 20. -17.5 -12.1 12.5]
SSR = 1248.15
"""
Polynomial Features
# POLYNOMIAL FEATURES - Example (Degree = 3)
# ------------------------------------------
# PolynomialFeatures expands simple input features into polynomial terms
# so a linear model can learn nonlinear patterns.
#
# We start with very simple data:
# X = [[2],
# [3]]
#
# Degree = 3 means:
# create [1, x, x^2, x^3]
# -------------------------
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
# Original input feature (1 feature, 2 samples)
X = np.array([[2],
[3]])
# Create the transformer: degree = 3
poly = PolynomialFeatures(degree=3)
# Expand X into polynomial features
X_poly = poly.fit_transform(X)
print("Original X:")
print(X)
print("Polynomial-expanded X (degree=3):")
print(X_poly)
"""
Original X:
[[2]
[3]]
Polynomial-expanded X (degree=3):
[[ 1. 2. 4. 8.]
[ 1. 3. 9. 27.]]
"""
Polynomial Regression
# POLYNOMIAL REGRESSION (Degree = 3)
# ----------------------------------
# Goal:
# Fit a nonlinear relation using LinearRegression on
# polynomial-expanded features (degree=3).
#
# Steps:
# 1) Define small, hardcoded 1D feature x
# 2) Define target y from a cubic function
# 3) Expand features with PolynomialFeatures(degree=3)
# 4) Fit LinearRegression on the expanded features
# 5) Predict a few values to see it in action
# ---------------------------------------------
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Training data
x = np.array([
[-3.0],
[-2.0],
[-1.0],
[ 0.0],
[ 1.0],
[ 2.0],
[ 3.0]
])
# Target from a cubic function:
y = 2 + 0.5*x - 1.2*(x**2) + 0.3*(x**3)
# Polynomial expansion to degree 3
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(x) # columns are [1, x, x^2, x^3]
# Fit linear regression on the expanded features
model = LinearRegression()
model.fit(X_poly, y)
# Evaluate fit quality (should be R^2 = 1.0 with noiseless data)
y_pred = model.predict(X_poly)
print("R^2 on training data:", round(r2_score(y, y_pred), 4))
# Predict a value not in the training set and compare with the true cubic
x_new = 1.5
y_new = 2 + 0.5*x_new - 1.2*(x_new**2) + 0.3*(x_new**3)
print("Cubic function:", f"{y_new:.4f}")
pred = model.predict(poly.transform([[x_new]]))
print("Prediction:", pred[0])
# --------------------
# Cubic function: 1.0625
# Prediction: [1.0625]
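Since the targets were generated from a known cubic, the fitted parameters should recover it. A quick sketch inspecting model from above (PolynomialFeatures already emits a bias column, so with fit_intercept=True its weight should land near 0 while intercept_ absorbs the constant term):
print("intercept:", model.intercept_)  # should land close to 2
print("coefficients:", model.coef_)    # should land close to [0, 0.5, -1.2, 0.3]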