Linear Regression / One Variable

\( h(x) = ax + b \)
\( h_\theta(x) = \theta_0 + \theta_1 x_1 \)
 
# LINEAR REGRESSION - ONE VARIABLE
# --------------------------------
# Concept:
#   - Given a training dataset with a single feature, find the straight
#     line that best fits the data and use that line to make predictions.
#   - One of the most popular tools in statistics.
#
# h(x) = ax + b
# -------------

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training Dataset
# ----------------
X = np.array([30, 46, 60, 65, 77, 95]).reshape(-1, 1)  # features, one column per variable
Y = np.array([31, 30, 80, 49, 70, 118])                # targets

# Learn a prediction function
# ---------------------------
model = LinearRegression().fit(X, Y)
a = model.coef_[0].round(1)       # slope
b = model.intercept_.round(1)     # intercept

# Predict unknown
# ---------------
x1 = 80
y1 = a*x1 + b

# Output result and Draw graphics
# -------------------------------
print(f"h(x) = {a}x + {b}")     # h(x) = 1.3x + -18.0
print(f"h({x1}) = {y1}")        # h(80) = 86.0

fig, ax = plt.subplots()
plt.xlim(0, 140)
plt.ylim(0, 140)

ax.plot(X,  Y,  'x', color='g', label='training data')      # Dataset points
ax.plot(x1, y1, 'o', color='r', label=f'h({x1}) = {y1}')    # Unknown point
ax.plot(X, a*X + b,  label=f'h(x) = {b} + {a}x')            # Function line

plt.legend()
plt.show()

Linear Regression / Two Variables

\( h(x, y) = ax + by + c \)
\( h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 \)
 
# LINEAR REGRESSION - TWO VARIABLES
# ---------------------------------
# Multiple regression lets us throw in more variables (like the weight of a car).
# Example:
#     Predict the CO2 emission of a car based on the
#     SIZE (volume) of the engine and the WEIGHT of the car.
#
# h(x) = ax + by + c
# ------------------

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pathlib

DIR = pathlib.Path(__file__).resolve().parent

# Training dataset
# ----------------
with open(DIR / 'data/cars.csv') as file:
    df = pd.read_csv(file)
    X = df[['Weight', 'Volume']].values   # two features per sample
    y = df['CO2'].values                  # label

# Learn prediction function
# -------------------------
model = LinearRegression().fit(X, y)


# Predictions
# -----------
X1 = [1600, 1252]         # Honda Civic, 1600, 1252 / CO2: 94
y1 = model.predict([X1])  # CO2: 101.5

X2 = [1200, 780]          # Unknown car
y2 = model.predict([X2])  # CO2: 94.8


# Results & Graphics
# ------------------
print(df, "\n")
print("Honda Civic, 1600, 1252 / CO2:", y1.round(1).item())
print("Unknow car, 1200, 780 / CO2:", y2.round(1).item())

# Draw the fitted plane over a grid covering the observed Weight/Volume range
fig = plt.figure()
Ax, Ay = np.meshgrid(
    np.linspace(df.Weight.min(), df.Weight.max(), 100),
    np.linspace(df.Volume.min(), df.Volume.max(), 100)
)
grid = pd.DataFrame({'Weight': Ax.ravel(), 'Volume': Ay.ravel()})
Az = np.array(model.predict(grid))

ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['Weight'], df['Volume'], df['CO2'], c='g', marker='x', alpha=0.5)
ax.plot_surface(Ax, Ay, Az.reshape(Ax.shape), color='b', alpha=0.3)
ax.set_xlabel('Weight')
ax.set_ylabel('Volume')
ax.set_zlabel('CO2')

ax.plot(X1[0], X1[1], y1[0], 'o', color='r')  # known car
ax.plot(X2[0], X2[1], y2[0], 's', color='g')  # unknown car

plt.show()

"""
           Car       Model  Volume  Weight  CO2
0       Toyoty        Aygo    1000     790   99
1   Mitsubishi  Space Star    1200    1160   95
2        Skoda      Citigo    1000     929   95
3         Fiat         500     900     865   90
4         Mini      Cooper    1500    1140  105
5           VW         Up!    1000     929  105
6        Skoda       Fabia    1400    1109   90
7     Mercedes     A-Class    1500    1365   92
8         Ford      Fiesta    1500    1112   98
9         Audi          A1    1600    1150   99
10     Hyundai         I20    1100     980   99
11      Suzuki       Swift    1300     990  101
12        Ford      Fiesta    1000    1112   99
13       Honda       Civic    1600    1252   94
14      Hundai         I30    1600    1326   97
15        Opel       Astra    1600    1330   97
16         BMW           1    1600    1365   99
17       Mazda           3    2200    1280  104
18       Skoda       Rapid    1600    1119  104
19        Ford       Focus    2000    1328  105
20        Ford      Mondeo    1600    1584   94
21        Opel    Insignia    2000    1428   99
22    Mercedes     C-Class    2100    1365   99
23       Skoda     Octavia    1600    1415   99
24       Volvo         S60    2000    1415   99
25    Mercedes         CLA    1500    1465  102
26        Audi          A4    2000    1490  104
27        Audi          A6    2000    1725  114
28       Volvo         V70    1600    1523  109
29         BMW           5    2000    1705  114
30    Mercedes     E-Class    2100    1605  115
31       Volvo        XC70    2000    1746  117
32        Ford       B-Max    1600    1235  104
33         BMW         216    1600    1390  108
34        Opel      Zafira    1600    1405  109
35    Mercedes         SLK    2500    1395  120

Honda Civic, 1600, 1252 / CO2: 101.5
Unknow car, 1200, 780 / CO2: 94.8
"""

Multiple variables

\( h_\theta(x) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + \cdots + \theta_n x_n \)
 
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np, sys
import pandas as pd

import pathlib
DIR = pathlib.Path(__file__).resolve().parent

# Feature columns used to predict the house price
FEATURES = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]

with open(DIR / 'data/real_estate.csv') as file:
    df = pd.read_csv(file)
    X = df[FEATURES].values                       # features
    y = df['Y house price of unit area'].values   # label

# Train the model
model = LinearRegression().fit(X, y)

# Predictions
X1 = [2013.17, 13, 732.85, 0, 24.98, 121.53]     # price: 39 (train data)
X2 = [2013.58, 16.6, 323.69, 6, 24.98, 121.54]   # price: 51 (train data)
X3 = [2013.17, 33, 732.85, 0, 24.98, 121.53]     # ?

print(df.head())
print('Predict training item1, price =', model.predict([X1]).round(1).item())
print('Predict training item2, price =', model.predict([X2]).round(1).item())
print('Predict unknow item, price =',    model.predict([X3]).round(1).item())

"""
          No  X1 transaction date  X2 house age  ...
    0      1             2012.917          32.0  ...
    1      2             2012.917          19.5  ...
    2      3             2013.583          13.3  ...
    3      4             2013.500          13.3  ...
    4      5             2012.833           5.0  ...

    [5 rows x 8 columns]

    Predict training item1, price = 38.8
    Predict training item2, price = 48.5
    Predict unknow item,    price = 33.4
"""

Residuals

 
# RESIDUALS
# ---------
# Evaluate a fitted line with the sum of squared residuals (SSR): a
# residual is the vertical distance between an observed value and the
# line's prediction for the same input.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Training Dataset
X = np.array([30, 46, 60, 65, 77, 95]).reshape(6,1)   # features (column vector)
Y = np.array([31, 30, 80, 49, 70, 118])               # targets

# Learn a prediction function
r = LinearRegression().fit(X, Y)
a = r.coef_[0].round(1)      # slope
b = r.intercept_.round(1)    # intercept

# Evaluate the model.
# Fixed: use the learned coefficients a/b instead of re-hard-coding
# them as literals (-18 + 1.3*x), and vectorize instead of growing
# arrays one element at a time with np.append.
P = a*X.ravel() + b     # Predictions (on training dataset)
R = Y - P               # Residuals
SSR = (R**2).sum()      # Sum of squared residuals

print(f'Prediction function: f(x) = {a}x + {b}')
print('Residuals:', R)
print(f'SSR = {SSR.round(2).item()}')

# Draw graphics
fig, ax = plt.subplots()
plt.ylim(0, 140)
plt.xlim(0, 140)

ax.plot(X, Y, 'x', color='g', label='training data')     # Dataset points
ax.plot(X, a*X + b, label=f'h(x) = {b} + {a}x')          # Function line
for i in range(len(X)):                                  # Residual segments
    ax.plot([X[i], X[i]], [P[i], Y[i]], '-', color='c')

plt.legend()
plt.show()

"""
    Prediction function: f(x) = 1.3x + -18.0
    Residuals: [10.  -11.8  20.  -17.5 -12.1  12.5]
    SSR = 1248.15
"""

Polynomial Features

 
# POLYNOMIAL FEATURES - Example (Degree = 3)
# ------------------------------------------
# PolynomialFeatures expands a simple input feature into polynomial terms
# so a linear model can learn nonlinear patterns.
#
# Starting data:
#       X = [[2],
#            [3]]
#
# With degree = 3, each sample x becomes [1, x, x^2, x^3].
# --------------------------------------------------------

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Original input feature (1 feature, 2 samples)
X = np.array([[2],
              [3]])

# Build the degree-3 transformer and expand X in one fit/transform pass
transformer = PolynomialFeatures(degree=3)
X_poly = transformer.fit_transform(X)

print("Original X:")
print(X)
print("Polynomial-expanded X (degree=3):")
print(X_poly)

"""
Original X:
[[2]
 [3]]
Polynomial-expanded X (degree=3):
[[ 1.  2.  4.  8.]
 [ 1.  3.  9. 27.]]
"""

Polynomial Regression

 
# POLYNOMIAL REGRESSION (Degree = 3)
# ----------------------------------
# Goal:
#   Fit a nonlinear relation using LinearRegression on
#   polynomial-expanded features (degree=3).
#
# Steps:
#   1) Define small, hardcoded 1D feature x
#   2) Define target y from a cubic function
#   3) Expand features with PolynomialFeatures(degree=3)
#   4) Fit LinearRegression on the expanded features
#   5) Predict a few values to see it in action
# ---------------------------------------------

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Training data (one feature, seven samples)
x = np.array([
    [-3.0],
    [-2.0],
    [-1.0],
    [ 0.0],
    [ 1.0],
    [ 2.0],
    [ 3.0]
])

# Target from a (noiseless) cubic function:
y = 2 + 0.5*x - 1.2*(x**2) + 0.3*(x**3)

# Polynomial expansion to degree 3
poly = PolynomialFeatures(degree=3)
X_poly = poly.fit_transform(x)  # columns are [1, x, x^2, x^3] (bias term included)

# Fit linear regression on the expanded features
model = LinearRegression()
model.fit(X_poly, y)


# Evaluate fit quality (should be R^2 = 1.0 with noiseless data)
y_pred = model.predict(X_poly)
print("R^2 on training data:", round(r2_score(y, y_pred), 4))


# Predict a value not in training, and compare against the true cubic.
# Fixed: the duplicated assignment `y = y = ...`, and the scalar reuse of
# x/y which shadowed the training arrays defined above.
X_unknown = [1.5]

x_new = X_unknown[0]
y_true = 2 + 0.5*x_new - 1.2*(x_new**2) + 0.3*(x_new**3)
print("Cubic function:", f"{y_true:.4f}")

X_val_poly = poly.transform(np.array([X_unknown]))
pred = model.predict(X_val_poly)
print("Prediction:", pred[0])

# --------------------
# Cubic function: 1.0625
# Prediction: [1.0625]




References: