Nielsen's Figures
ntfs (note-to-future-self): these are all SVGs, created in Python using matplotlib.
I could not get my dirty little paws on Nielsen's TikZ code for the neural net diagrams; he compiled them to PNGs on his own site. He also used MathJax to typeset his mathematics.
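The recipe is the same everywhere: build the figure, write it out with savefig(..., format='svg'), close it so nothing pops up. A minimal sketch of that pattern (the filename here is just a placeholder, not one of the figures below):
code
import numpy as np
import matplotlib.pyplot as plt
z = np.linspace(-1, 1, 100)
fig, ax = plt.subplots()
ax.plot(z, z**2)
plt.savefig('example.svg', format='svg')  # write the vector graphic
plt.close(fig)  # close instead of show, so nothing is displayed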
relu
code
import numpy as np
import matplotlib.pyplot as plt
z = np.arange(-2, 2, .1)
zero = np.zeros(len(z))
y = np.max([zero, z], axis=0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, y)
ax.set_ylim([-2.0, 2.0])
ax.set_xlim([-2.0, 2.0])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('Rectified linear unit')
# Save as SVG instead of displaying
plt.savefig('relu.svg', format='svg')
plt.close(fig) # Close the figure to prevent display
sigmoid
CLOSED: [2025-04-12 Sat 17:22]
- State "DONE" from [2025-04-12 Sat 17:22]
code
import numpy
import matplotlib.pyplot as plt
z = numpy.arange(-5, 5, .1)
sigma_fn = numpy.vectorize(lambda z: 1/(1+numpy.exp(-z)))
sigma = sigma_fn(z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, sigma)
ax.set_ylim([-0.5, 1.5])
ax.set_xlim([-5,5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('sigmoid function')
plt.savefig('sigmoid.svg', format='svg')
step
CLOSED: [2025-04-12 Sat 17:22]
- State "DONE" from [2025-04-12 Sat 17:22]
code
import numpy
import matplotlib.pyplot as plt
z = numpy.arange(-5, 5, .02)
step_fn = numpy.vectorize(lambda z: 1.0 if z >= 0.0 else 0.0)
step = step_fn(z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, step)
ax.set_ylim([-0.5, 1.5])
ax.set_xlim([-5,5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('step function')
plt.savefig('step.svg', format='svg')
tanh
CLOSED: [2025-04-12 Sat 17:22]
- State "DONE" from [2025-04-12 Sat 17:22]
code
import numpy as np
import matplotlib.pyplot as plt
z = np.arange(-5, 5, .1)
t = np.tanh(z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, t)
ax.set_ylim([-1.0, 1.0])
ax.set_xlim([-5,5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('tanh function')
plt.savefig('tanh.svg', format='svg')
diff sigmoid
CLOSED: [2025-04-15 Tue 12:07]
- State "DONE" from [2025-04-15 Tue 12:07]
code
import numpy as np
import matplotlib.pyplot as plt
# Define sigmoid and its derivative
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def sigmoid_prime(z):
s = sigmoid(z)
return s * (1 - s)
# z values
z = np.arange(-5, 5, 0.1)
s_prime = sigmoid_prime(z)
# Plot
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, s_prime)
ax.set_ylim([0.0, 0.3])
ax.set_xlim([-5, 5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('Derivative of the sigmoid function')
plt.savefig('sigmoid_prime.svg', format='svg')
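For the record, the identity the code relies on (standard calculus, not a quote from Nielsen):
\[ \sigma'(z) = \sigma(z)\bigl(1-\sigma(z)\bigr), \qquad \max_z \sigma'(z) = \sigma'(0) = \tfrac{1}{4}, \]
which is why the y-axis tops out just above 0.25.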
valley
CLOSED: [2025-04-12 Sat 17:50]
- State "DONE" from [2025-04-12 Sat 17:50]
code
"""
valley - Plots a function of two variables to minimize.
The function is a fairly generic valley function.
"""
# Third party libraries
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED: create a 3D axis properly
X = np.arange(-1, 1, 0.1)
Y = np.arange(-1, 1, 0.1)
X, Y = np.meshgrid(X, Y)
Z = X**2 + Y**2
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors,
linewidth=0)
ax.set_xlim3d(-1, 1)
ax.set_ylim3d(-1, 1)
ax.set_zlim3d(0, 2)
ax.xaxis.set_major_locator(LinearLocator(3))
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
ax.text(1.79, 0, 1.62, "$C$", fontsize=20)
ax.text(0.05, -1.8, 0, "$v_1$", fontsize=20)
ax.text(1.5, -0.25, 0, "$v_2$", fontsize=20)
plt.savefig('valley.svg', format='svg')
plt.close(fig)  # close instead of show, so only the SVG is produced
valley2
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
valley2 - Plots a function of two variables to minimize.
This is the second valley function visualization.
"""
# Third party libraries
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED: Use add_subplot for 3D
X = np.arange(-1, 1, 0.1)
Y = np.arange(-1, 1, 0.1)
X, Y = np.meshgrid(X, Y)
Z = X**2 + 10*Y**2
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors,
linewidth=0)
ax.set_xlim3d(-1, 1)
ax.set_ylim3d(-1, 1)
ax.set_zlim3d(0, 10)
ax.xaxis.set_major_locator(LinearLocator(3))
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
ax.text(1.79, 0, 8.4, "$C$", fontsize=20)
ax.text(0.05, -1.8, 0, "$v_1$", fontsize=20)
ax.text(1.5, -0.25, 0, "$v_2$", fontsize=20)
plt.savefig('valley2.svg', format='svg')
plt.close(fig)
false_minima
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
false_minima - Plots a function of two variables with many false minima.
"""
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
X = np.arange(-5, 5, 0.1)
Y = np.arange(-5, 5, 0.1)
X, Y = np.meshgrid(X, Y)
Z = np.sin(X) * np.sin(Y) + 0.2 * X
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors, linewidth=0)
ax.set_xlim3d(-5, 5)
ax.set_ylim3d(-5, 5)
ax.set_zlim3d(-2, 2)
ax.xaxis.set_major_locator(LinearLocator(3)) # FIXED
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
plt.savefig('false_minima.svg', format='svg')
plt.close(fig)
misleading_gradient_contours
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
misleading_gradient_contours - Plots the contours of a function with misleading gradients
"""
# Third party libraries
import matplotlib.pyplot as plt
import numpy as np
X = np.arange(-1, 1, 0.02)
Y = np.arange(-1, 1, 0.02)
X, Y = np.meshgrid(X, Y)
Z = X**2 + 10*Y**2
plt.figure()
CS = plt.contour(X, Y, Z, levels=[0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
plt.xlabel("$w_1$", fontsize=16)
plt.ylabel("$w_2$", fontsize=16)
plt.savefig('misleading_gradient_contours.svg', format='svg')
plt.close() # Close the figure to prevent display
misleading_gradient
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
misleading_gradient - Plots a function which misleads the gradient descent algorithm.
"""
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
X = np.arange(-1, 1, 0.025)
Y = np.arange(-1, 1, 0.025)
X, Y = np.meshgrid(X, Y)
Z = X**2 + 10*Y**2
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors, linewidth=0)
ax.set_xlim3d(-1, 1)
ax.set_ylim3d(-1, 1)
ax.set_zlim3d(0, 12)
ax.xaxis.set_major_locator(LinearLocator(3)) # FIXED
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
ax.text(0.05, -1.8, 0, "$w_1$", fontsize=20)
ax.text(1.5, -0.25, 0, "$w_2$", fontsize=20)
ax.text(1.79, 0, 9.62, "$C$", fontsize=20)
plt.savefig('misleading_gradient.svg', format='svg')
plt.close(fig)
pca_limitations
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
pca_limitations - Plot graphs to illustrate the limitations of PCA.
"""
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
# Data points only
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
z = np.linspace(-2, 2, 20)
theta = np.linspace(-4 * np.pi, 4 * np.pi, 20)
x = np.sin(theta) + 0.03 * np.random.randn(20)
y = np.cos(theta) + 0.03 * np.random.randn(20)
ax.plot(x, y, z, 'ro')
plt.savefig('pca_limitations_data.svg', format='svg')
plt.close(fig)
# Helix + data
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
z_helix = np.linspace(-2, 2, 100)
theta_helix = np.linspace(-4 * np.pi, 4 * np.pi, 100)
x_helix = np.sin(theta_helix)
y_helix = np.cos(theta_helix)
ax.plot(x, y, z, 'ro') # replotting noisy data
ax.plot(x_helix, y_helix, z_helix, 'b-')
plt.savefig('pca_limitations_helix.svg', format='svg')
plt.close(fig)
backprop_magnitude_nabla
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
backprop_magnitude_nabla - Plotting the magnitude of gradient terms during backpropagation
"""
# Third-party libraries
import matplotlib.pyplot as plt
# Data from backpropagation in a 784-30-30-30-30-30-10 network
nw1 = [0.129173436407863, 0.4242933114455002,
1.6154682713449411, 7.5451567587160069]
nw2 = [0.12571016850457151, 0.44231149185805047,
1.8435833504677326, 7.61973813981073]
nw3 = [0.15854489503205446, 0.70244235144444678,
2.6294803575724157, 10.427062019753425]
plt.figure()
plt.plot(range(1, 5), nw1, "ro-", range(1, 5), nw2, "go-",
range(1, 5), nw3, "bo-")
plt.xlabel('Layer $l$')
plt.ylabel(r"$\Vert\nabla C^l_w\Vert$")
plt.xticks([1, 2, 3, 4])
plt.savefig('backprop_magnitude_nabla.svg', format='svg')
plt.close() # Close the figure to prevent display
softmax
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
softmax - Plot the softmax activation function for different temperature values
"""
import numpy as np
import matplotlib.pyplot as plt
# Define the softmax function
def softmax(x, temperature=1.0):
"""Compute softmax values for array of logits with temperature scaling."""
# Subtract max for numerical stability (prevents overflow)
x = x / temperature
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
# Create input values
x = np.array([0.1, 0.2, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
# Plot softmax with different temperature values
fig, ax = plt.subplots(figsize=(10, 5))
temps = [0.5, 1.0, 2.0]
bar_width = 0.25
index = np.arange(len(x))
for i, temp in enumerate(temps):
y = softmax(x, temperature=temp)
offset = (i - 1) * bar_width
ax.bar(index + offset, y, bar_width, label=f'T={temp}')
ax.set_xlabel('Class')
ax.set_ylabel('Probability')
ax.set_title('Softmax function with varying temperature')
ax.set_xticks(index)
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True, axis='y', alpha=0.3)
plt.savefig('softmax.svg', format='svg')
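What the bars show, written out (my own note, with T the temperature parameter from the code):
\[ \mathrm{softmax}(x;T)_i = \frac{e^{x_i/T}}{\sum_j e^{x_j/T}} \]
Smaller T sharpens the distribution toward the largest logit; larger T flattens it toward uniform.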
leaky_relu
CLOSED: [2025-04-12 Sat 17:52]
- State "DONE" from [2025-04-12 Sat 17:52]
code
"""
leaky_relu - Plot the leaky ReLU activation function
"""
import numpy as np
import matplotlib.pyplot as plt
z = np.arange(-2, 2, .1)
alpha = 0.1
y = np.maximum(alpha * z, z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, y)
ax.set_ylim([-0.5, 2.0])
ax.set_xlim([-2.0, 2.0])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('Leaky Rectified Linear Unit (alpha=0.1)')
plt.savefig('leaky_relu.svg', format='svg')
gradient_descent
CLOSED: [2025-04-12 Sat 17:52]
- State "DONE" from [2025-04-12 Sat 17:52]
code
"""
gradient_descent - Visualize gradient descent optimization in 2D
"""
import numpy as np
import matplotlib.pyplot as plt
# Create a simple quadratic function
def f(x, y):
return x**2 + 10*y**2
# Create grid of x, y values
x = np.linspace(-2, 2, 100)
y = np.linspace(-0.7, 0.7, 100)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
# Gradient descent path (simulated)
start_x, start_y = -1.8, 0.6
learning_rate = 0.1
points = [(start_x, start_y)]
for _ in range(15):
grad_x = 2 * points[-1][0]
grad_y = 20 * points[-1][1]
new_x = points[-1][0] - learning_rate * grad_x
new_y = points[-1][1] - learning_rate * grad_y
points.append((new_x, new_y))
# Create plot
fig, ax = plt.subplots(figsize=(10, 6))
# Plot contour
CS = plt.contour(X, Y, Z, levels=np.logspace(0, 2, 10))
plt.clabel(CS, inline=True, fontsize=8)
# Plot gradient descent path
path_x, path_y = zip(*points)
ax.plot(path_x, path_y, 'ro-', markersize=6, linewidth=1.5,
label='Gradient Descent Path', alpha=0.7)
# Annotate start and finish points
ax.annotate('Start', xy=(start_x, start_y), xytext=(start_x-0.4, start_y+0.1),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax.annotate('End', xy=(points[-1][0], points[-1][1]),
xytext=(points[-1][0]+0.3, points[-1][1]+0.1),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_title('Gradient Descent Optimization')
ax.grid(True)
ax.legend(loc='upper right')
plt.savefig('gradient_descent.svg', format='svg')
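The update the loop above simulates, stated once for my future self:
\[ w \;\to\; w - \eta\,\nabla C, \qquad \nabla C = (2w_1,\; 20w_2) \text{ for } C = w_1^2 + 10 w_2^2, \]
with learning rate η = 0.1, which is why the path zig-zags along the steep w_2 direction.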
simple_neural_network
CLOSED: [2025-04-12 Sat 17:52]
- State "DONE" from [2025-04-12 Sat 17:52]
code
"""
simple_neural_network - Visualize a simple neural network architecture with clearer structure
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, FancyArrowPatch
# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 7))
# Network parameters
layer_sizes = [4, 5, 3] # Number of neurons per layer
n_layers = len(layer_sizes)
layer_names = ['Input\nLayer', 'Hidden\nLayer', 'Output\nLayer']
# Colors and sizes
node_colors = ['#b3e0ff', '#d9d9d9', '#b3ffb3'] # Light blue, light gray, light green
node_size = 0.15
layer_spacing = 2.0
vertical_spacing = 0.7
# Positions for each layer
layer_positions = [i * layer_spacing for i in range(n_layers)]
# Draw the network
for l, layer_size in enumerate(layer_sizes):
# Calculate vertical positions for this layer
y_positions = np.linspace(0, (layer_size-1) * vertical_spacing, layer_size)
# Center the layer vertically
y_positions = y_positions - np.mean(y_positions)
# Draw the nodes
for i, y in enumerate(y_positions):
# Create and draw the neuron circle
circle = Circle((layer_positions[l], y), node_size,
color=node_colors[l], ec='black', zorder=4)
ax.add_patch(circle)
# Label the neurons
if l == 0: # Input layer
ax.text(layer_positions[l] - 0.1, y, f'$x_{i+1}$',
ha='right', va='center', fontsize=12)
elif l == n_layers - 1: # Output layer
ax.text(layer_positions[l] + 0.1, y, f'$y_{i+1}$',
ha='left', va='center', fontsize=12)
# Add layer label
ax.text(layer_positions[l], -layer_sizes[0]*vertical_spacing/1.7,
layer_names[l], ha='center', va='top', fontsize=14,
bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.5'))
# Draw connections to next layer
if l < n_layers - 1:
next_y_positions = np.linspace(0, (layer_sizes[l+1]-1) * vertical_spacing, layer_sizes[l+1])
next_y_positions = next_y_positions - np.mean(next_y_positions)
for i, y_start in enumerate(y_positions):
for j, y_end in enumerate(next_y_positions):
# Draw an arrow from this node to the next
arrow = FancyArrowPatch(
(layer_positions[l] + node_size, y_start),
(layer_positions[l+1] - node_size, y_end),
connectionstyle=f"arc3,rad=0.1",
arrowstyle="-|>", linewidth=0.8, color='gray', alpha=0.6, zorder=1
)
ax.add_patch(arrow)
# Set limits and remove axes
ax.set_xlim(-0.5, layer_positions[-1] + 0.5)
ax.set_ylim(-layer_sizes[0]*vertical_spacing/1.5, layer_sizes[0]*vertical_spacing/1.5)
ax.axis('off')
ax.set_title('Neural Network Architecture', fontsize=16)
plt.tight_layout()
plt.savefig('simple_neural_network.svg', format='svg')
vanishing_gradient
CLOSED: [2025-04-12 Sat 17:53]
- State "DONE" from [2025-04-12 Sat 17:53]
code
"""
vanishing_gradient - Visualize the vanishing gradient problem in deep networks
"""
import numpy as np
import matplotlib.pyplot as plt
# Sigmoid function and its derivative
def sigmoid(z):
return 1.0/(1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
# Create input values
z = np.linspace(-10, 10, 1000)
sigmoid_z = sigmoid(z)
derivative = sigmoid_prime(z)
# Plot the sigmoid and its derivative
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Sigmoid function
ax1.plot(z, sigmoid_z, 'b-', linewidth=2)
ax1.set_title('Sigmoid Function')
ax1.set_xlabel('z')
ax1.set_ylabel('σ(z)')
ax1.grid(True)
# Derivative of sigmoid
ax2.plot(z, derivative, 'r-', linewidth=2)
ax2.set_title('Derivative of Sigmoid')
ax2.set_xlabel('z')
ax2.set_ylabel("σ'(z)")
ax2.grid(True)
# Add annotation to show vanishing gradient
ax2.annotate('Vanishing gradient\nregions', xy=(-8, 0.0004), xytext=(-7, 0.05),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax2.annotate('Vanishing gradient\nregions', xy=(8, 0.0004), xytext=(7, 0.05),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
plt.tight_layout()
plt.savefig('vanishing_gradient.svg', format='svg')
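Reminder of why the two panels matter (my paraphrase of the chapter-5 argument for the toy one-neuron-per-layer chain, not a quote): backprop multiplies one factor of w_l σ'(z_l) per layer, e.g.
\[ \frac{\partial C}{\partial b_1} = \sigma'(z_1)\, w_2\, \sigma'(z_2)\, w_3\, \sigma'(z_3)\, \cdots\, \frac{\partial C}{\partial a_L}, \]
and since σ'(z) ≤ 1/4 each factor is small unless the weights are large, so the gradient shrinks roughly geometrically with depth.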
learning_rate_effects
CLOSED: [2025-04-12 Sat 17:53]
- State "DONE" from [2025-04-12 Sat 17:53]
code
"""
learning_rate_effects - Visualize the effect of different learning rates in gradient descent
"""
import numpy as np
import matplotlib.pyplot as plt
# Function to optimize
def f(x):
return 0.1 * x**4 - 0.5 * x**3 - 0.2 * x**2 + 2 * x + 2
# Derivative of the function
def df(x):
return 0.4 * x**3 - 1.5 * x**2 - 0.4 * x + 2
# Create x values
x = np.linspace(-3, 3, 1000)
y = f(x)
# Define different learning rates and starting points
learning_rates = [0.01, 0.05, 0.2]
start_x = 2.5
iterations = 20
# Plot function
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y, 'b-', linewidth=2, label='f(x)')
ax.grid(True)
# Colors for different learning rates
colors = ['green', 'orange', 'red']
markers = ['o', 's', '^']
# Run gradient descent with different learning rates
for i, lr in enumerate(learning_rates):
path_x = [start_x]
path_y = [f(start_x)]
current_x = start_x
for _ in range(iterations):
# Gradient descent update
gradient = df(current_x)
current_x = current_x - lr * gradient
# Store points for plotting
path_x.append(current_x)
path_y.append(f(current_x))
# Plot path
ax.plot(path_x, path_y, color=colors[i], marker=markers[i], markersize=6,
linewidth=1.5, alpha=0.7, label=f'η = {lr}')
# Add annotation for the final point
ax.annotate(f'Final (η={lr})', xy=(path_x[-1], path_y[-1]),
xytext=(path_x[-1] + 0.3, path_y[-1] + 0.5),
arrowprops=dict(facecolor=colors[i], shrink=0.05, width=1.5))
# Annotate starting point
ax.annotate('Start', xy=(start_x, f(start_x)), xytext=(start_x + 0.3, f(start_x) + 1.5),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.set_title('Effect of Learning Rate on Gradient Descent')
ax.legend(loc='upper right')
plt.savefig('learning_rate_effects.svg', format='svg')
dropout_regularization
CLOSED: [2025-04-12 Sat 17:54]
- State "DONE" from [2025-04-12 Sat 17:54]
code
"""
dropout_regularization - Visualize dropout regularization in neural networks with improved clarity
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, FancyArrowPatch
# Function to draw a neural network with optional dropout
def draw_network(ax, title, dropout=False):
# Network parameters
layer_sizes = [3, 8, 8, 2] # Number of neurons per layer
n_layers = len(layer_sizes)
layer_names = ['Input', 'Hidden 1', 'Hidden 2', 'Output']
# Colors and sizes
active_color = '#b3e0ff' # Light blue for active neurons
dropout_color = '#ffcccc' # Light red for dropped out neurons
node_size = 0.15
layer_spacing = 2.0
vertical_spacing = 0.5
# Set random seed for reproducibility
np.random.seed(42)
# Generate dropout masks for hidden layers
dropout_masks = []
for l in range(1, n_layers-1): # Only for hidden layers
# 50% dropout rate
mask = np.random.rand(layer_sizes[l]) > 0.5 if dropout else np.ones(layer_sizes[l])
dropout_masks.append(mask)
# Positions for each layer
layer_positions = [i * layer_spacing for i in range(n_layers)]
# Store node positions for connection drawing
node_positions = {}
# Draw the network
for l in range(n_layers):
# Calculate vertical positions for this layer
y_positions = np.linspace(0, (layer_sizes[l]-1) * vertical_spacing, layer_sizes[l])
# Center the layer vertically
y_positions = y_positions - np.mean(y_positions)
# Draw the nodes
for i, y in enumerate(y_positions):
# Determine if this neuron is dropped out
is_dropout = False
if dropout and l > 0 and l < n_layers-1:
is_dropout = not dropout_masks[l-1][i]
# Store position for connections
node_positions[(l, i)] = (layer_positions[l], y)
# Create and draw the neuron circle
if not is_dropout:
# Active neuron
circle = Circle((layer_positions[l], y), node_size,
color=active_color, ec='black', zorder=4)
ax.add_patch(circle)
else:
# Dropped out neuron - draw with dashed lines
circle = Circle((layer_positions[l], y), node_size,
color=dropout_color, ec='red',
linestyle='dashed', alpha=0.7, zorder=4)
ax.add_patch(circle)
# Add a slash through dropped neurons
ax.plot([layer_positions[l]-node_size, layer_positions[l]+node_size],
[y+node_size, y-node_size], 'r-', linewidth=1.5, zorder=5)
# Add layer label
ax.text(layer_positions[l], -2.5,
layer_names[l], ha='center', va='center', fontsize=12,
bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.3'))
# Draw connections between layers
for l in range(n_layers-1):
for i in range(layer_sizes[l]):
# Skip connections from dropped out neurons
if dropout and l > 0 and l < n_layers-1 and not dropout_masks[l-1][i]:
continue
for j in range(layer_sizes[l+1]):
# Skip connections to dropped out neurons
if dropout and l+1 < n_layers-1 and not dropout_masks[l][j]:
continue
# Get node positions
start_pos = node_positions[(l, i)]
end_pos = node_positions[(l+1, j)]
# Draw an arrow from this node to the next
arrow = FancyArrowPatch(
(start_pos[0] + node_size, start_pos[1]),
(end_pos[0] - node_size, end_pos[1]),
connectionstyle=f"arc3,rad=0.1",
arrowstyle="-", linewidth=0.8,
color='gray', alpha=0.6, zorder=1
)
ax.add_patch(arrow)
# Set limits and remove axes
ax.set_xlim(-0.5, layer_positions[-1] + 0.5)
ax.set_ylim(-3, 2)
ax.axis('off')
ax.set_title(title, fontsize=14)
# Create the figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
# Draw the standard network
draw_network(ax1, 'Standard Neural Network')
# Draw the network with dropout
draw_network(ax2, 'Network with Dropout (50%)', dropout=True)
plt.tight_layout()
plt.savefig('dropout_regularization.svg', format='svg')
momentum_optimization
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
momentum_optimization - Visualization of gradient descent with momentum
"""
import numpy as np
import matplotlib.pyplot as plt
# Create a function with a ravine - common challenge for optimization
def f(x, y):
return 0.1 * x**2 + y**2
# Create grid of x, y values
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 1, 100)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
# Run standard gradient descent
start_x, start_y = -1.8, 0.6
learning_rate = 0.1
std_points = [(start_x, start_y)]
for _ in range(20):
grad_x = 0.2 * std_points[-1][0] # Partial derivative with respect to x
grad_y = 2 * std_points[-1][1] # Partial derivative with respect to y
new_x = std_points[-1][0] - learning_rate * grad_x
new_y = std_points[-1][1] - learning_rate * grad_y
std_points.append((new_x, new_y))
# Run gradient descent with momentum
beta = 0.9 # Momentum parameter
momentum_points = [(start_x, start_y)]
v_x, v_y = 0, 0 # Initialize velocity
for _ in range(20):
grad_x = 0.2 * momentum_points[-1][0]
grad_y = 2 * momentum_points[-1][1]
# Update velocity with momentum
v_x = beta * v_x - learning_rate * grad_x
v_y = beta * v_y - learning_rate * grad_y
# Update position
new_x = momentum_points[-1][0] + v_x
new_y = momentum_points[-1][1] + v_y
momentum_points.append((new_x, new_y))
# Create plot
fig, ax = plt.subplots(figsize=(10, 6))
# Plot contour
CS = plt.contour(X, Y, Z, levels=np.logspace(-1, 1, 10))
plt.clabel(CS, inline=True, fontsize=8)
# Plot paths
std_x, std_y = zip(*std_points)
mom_x, mom_y = zip(*momentum_points)
ax.plot(std_x, std_y, 'r.-', markersize=8, linewidth=1.5,
label='Standard Gradient Descent', alpha=0.7)
ax.plot(mom_x, mom_y, 'b.-', markersize=8, linewidth=1.5,
label='Gradient Descent with Momentum', alpha=0.7)
# Add annotations
ax.annotate('Start', xy=(start_x, start_y), xytext=(start_x-0.4, start_y+0.2),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Highlight oscillations in standard GD
oscillation_idx = 10
ax.annotate('Oscillation', xy=(std_x[oscillation_idx], std_y[oscillation_idx]),
xytext=(std_x[oscillation_idx]-0.7, std_y[oscillation_idx]-0.2),
arrowprops=dict(facecolor='red', shrink=0.05, width=1.5))
# Highlight momentum's smoother path
smooth_idx = 10
ax.annotate('Smoother path', xy=(mom_x[smooth_idx], mom_y[smooth_idx]),
xytext=(mom_x[smooth_idx]+0.5, mom_y[smooth_idx]),
arrowprops=dict(facecolor='blue', shrink=0.05, width=1.5))
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_title('Gradient Descent With and Without Momentum')
ax.grid(True)
ax.legend(loc='upper right')
plt.savefig('momentum_optimization.svg', format='svg')
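The two update rules being compared (as implemented in the loops above):
\[ \text{plain GD: } w \to w - \eta \nabla C, \qquad \text{momentum: } v \to \beta v - \eta \nabla C,\; w \to w + v, \]
with β = 0.9 and η = 0.1; the velocity term damps the oscillation across the narrow axis of the ravine.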
batch_normalization
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
batch_normalization - Visualization of how batch normalization affects feature distributions
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
# Set random seed for reproducibility
np.random.seed(42)
# Generate original feature distribution (skewed and shifted)
n_samples = 1000
original_data = np.random.randn(n_samples, 2)
# Apply a transformation to make data non-standard
original_data[:, 0] = 3 * original_data[:, 0] + 2 # Mean=2, Std=3
original_data[:, 1] = 0.5 * original_data[:, 1] - 1 # Mean=-1, Std=0.5
# Apply batch normalization
def batch_normalize(data):
# Calculate mean and std along first axis (across samples)
mean = np.mean(data, axis=0)
std = np.std(data, axis=0)
# Normalize
normalized_data = (data - mean) / (std + 1e-8)
return normalized_data, mean, std
normalized_data, mean, std = batch_normalize(original_data)
# Create the plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Plot original data
ax1.scatter(original_data[:, 0], original_data[:, 1], alpha=0.5, color='red')
ax1.set_title('Before Batch Normalization')
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.grid(True)
ax1.set_xlim(-10, 14)
ax1.set_ylim(-4, 4)
# Add an annotation about mean and variance
ax1.text(0.05, 0.95, f'Feature 1: μ={mean[0]:.1f}, σ={std[0]:.1f}\nFeature 2: μ={mean[1]:.1f}, σ={std[1]:.1f}',
transform=ax1.transAxes, va='top', bbox=dict(boxstyle='round,pad=0.5'))
# Add ellipse to show the spread
ellipse = Ellipse(xy=(mean[0], mean[1]), width=2*std[0], height=2*std[1],
angle=0, alpha=0.2, color='red')
ax1.add_patch(ellipse)
# Plot normalized data
ax2.scatter(normalized_data[:, 0], normalized_data[:, 1], alpha=0.5, color='blue')
ax2.set_title('After Batch Normalization')
ax2.set_xlabel('Feature 1')
ax2.set_ylabel('Feature 2')
ax2.grid(True)
ax2.set_xlim(-4, 4)
ax2.set_ylim(-4, 4)
# Add an annotation about mean and variance
norm_mean = np.mean(normalized_data, axis=0)
norm_std = np.std(normalized_data, axis=0)
ax2.text(0.05, 0.95, f'Feature 1: μ={norm_mean[0]:.1f}, σ={norm_std[0]:.1f}\nFeature 2: μ={norm_mean[1]:.1f}, σ={norm_std[1]:.1f}',
transform=ax2.transAxes, va='top', bbox=dict(boxstyle='round,pad=0.5))
# Add ellipse to show the spread
ellipse = Ellipse(xy=(0, 0), width=2, height=2,
angle=0, alpha=0.2, color='blue')
ax2.add_patch(ellipse)
plt.tight_layout()
plt.savefig('batch_normalization.svg', format='svg')
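The per-feature normalization being illustrated (the code adds the small ε to the standard deviation directly, rather than inside the square root as in the textbook form):
\[ \hat{x} = \frac{x - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}} \]
Full batch norm also learns a scale γ and shift β afterwards; this figure only shows the whitening step.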
convolutional_layer
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
convolutional_layer - Visualization of how convolutional filters work
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
# Create a simple 8x8 input image with a pattern
input_image = np.zeros((8, 8))
input_image[2:6, 2:6] = 1 # A small square in the middle
# Define a few different 3x3 convolutional filters
edge_detect_filter = np.array([
[-1, -1, -1],
[-1, 8, -1],
[-1, -1, -1]
])
horizontal_filter = np.array([
[-1, -1, -1],
[ 2, 2, 2],
[-1, -1, -1]
])
vertical_filter = np.array([
[-1, 2, -1],
[-1, 2, -1],
[-1, 2, -1]
])
# Apply convolution
def apply_convolution(image, kernel):
# Get dimensions
image_height, image_width = image.shape
kernel_height, kernel_width = kernel.shape
# Calculate output dimensions
output_height = image_height - kernel_height + 1
output_width = image_width - kernel_width + 1
# Initialize output
output = np.zeros((output_height, output_width))
# Apply convolution
for i in range(output_height):
for j in range(output_width):
output[i, j] = np.sum(image[i:i+kernel_height, j:j+kernel_width] * kernel)
return output
# Apply filters
edge_output = apply_convolution(input_image, edge_detect_filter)
horiz_output = apply_convolution(input_image, horizontal_filter)
vert_output = apply_convolution(input_image, vertical_filter)
# Create a custom colormap for better visualization
custom_cmap = LinearSegmentedColormap.from_list(
'custom_divergent',
['blue', 'white', 'red'],
N=256
)
# Create the visualization
fig, axs = plt.subplots(2, 4, figsize=(16, 8))
# Helper function to plot an image with consistent settings
def plot_image(ax, data, title, is_filter=False):
if is_filter:
im = ax.imshow(data, cmap=custom_cmap, vmin=-2, vmax=8)
else:
im = ax.imshow(data, cmap='viridis')
ax.set_title(title)
ax.set_xticks([])
ax.set_yticks([])
return im
# First row - the process for edge detection
plot_image(axs[0, 0], input_image, 'Input Image')
plot_image(axs[0, 1], edge_detect_filter, 'Edge Detection Filter', True)
axs[0, 2].text(0.5, 0.5, 'Convolution\nOperation', ha='center', va='center', fontsize=12)
axs[0, 2].set_xticks([])
axs[0, 2].set_yticks([])
axs[0, 2].add_patch(plt.Rectangle((0.2, 0.3), 0.6, 0.4, fill=False, edgecolor='black'))
axs[0, 2].arrow(0.35, 0.5, 0.25, 0, head_width=0.1, head_length=0.05, fc='black', ec='black')
plot_image(axs[0, 3], edge_output, 'Edge Detection Output')
# Second row - comparison of different filters
plot_image(axs[1, 0], input_image, 'Input Image')
plot_image(axs[1, 1], horizontal_filter, 'Horizontal Filter', True)
plot_image(axs[1, 2], vertical_filter, 'Vertical Filter', True)
combined = np.stack([edge_output, horiz_output, vert_output], axis=2)
combined = (combined - combined.min()) / (combined.max() - combined.min())  # rescale to [0, 1] so imshow treats it as valid RGB
plot_image(axs[1, 3], combined, 'Combined Output\n(RGB Channels)')
plt.tight_layout()
plt.savefig('convolutional_layer.svg', format='svg')
recurrent_neural_network
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
recurrent_neural_network - Visualization of RNN unfolding over time
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, FancyArrowPatch, Rectangle
# Create the figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# First plot: Compact RNN representation
def draw_compact_rnn(ax):
# Colors
input_color = '#b3e0ff' # Light blue
hidden_color = '#d9d9d9' # Light gray
output_color = '#b3ffb3' # Light green
# Node positions
input_pos = (0.3, 0.5)
hidden_pos = (0.5, 0.5)
output_pos = (0.7, 0.5)
# Draw nodes
input_node = Circle(input_pos, 0.1, color=input_color, ec='black', zorder=4)
hidden_node = Circle(hidden_pos, 0.1, color=hidden_color, ec='black', zorder=4)
output_node = Circle(output_pos, 0.1, color=output_color, ec='black', zorder=4)
ax.add_patch(input_node)
ax.add_patch(hidden_node)
ax.add_patch(output_node)
# Node labels
ax.text(input_pos[0], input_pos[1], "$x$", ha='center', va='center', fontsize=12, zorder=5)
ax.text(hidden_pos[0], hidden_pos[1], "$h$", ha='center', va='center', fontsize=12, zorder=5)
ax.text(output_pos[0], output_pos[1], "$y$", ha='center', va='center', fontsize=12, zorder=5)
# Draw connections
# Input to hidden
arrow = FancyArrowPatch(
(input_pos[0] + 0.1, input_pos[1]),
(hidden_pos[0] - 0.1, hidden_pos[1]),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1.5, color='black'
)
ax.add_patch(arrow)
# Hidden to output
arrow = FancyArrowPatch(
(hidden_pos[0] + 0.1, hidden_pos[1]),
(output_pos[0] - 0.1, output_pos[1]),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1.5, color='black'
)
ax.add_patch(arrow)
# Recurrent connection
arrow = FancyArrowPatch(
(hidden_pos[0] + 0.05, hidden_pos[1] + 0.08),
(hidden_pos[0] - 0.05, hidden_pos[1] + 0.08),
connectionstyle="arc3,rad=-1.4",
arrowstyle="-|>", linewidth=1.5, color='red', zorder=3
)
ax.add_patch(arrow)
# Add layer labels
ax.text(input_pos[0], 0.2, "Input", ha='center', va='center', fontsize=12)
ax.text(hidden_pos[0], 0.2, "Hidden\nState", ha='center', va='center', fontsize=12)
ax.text(output_pos[0], 0.2, "Output", ha='center', va='center', fontsize=12)
# Add title
ax.set_title("Compact RNN Representation", fontsize=14)
# Second plot: Unfolded RNN over time
def draw_unfolded_rnn(ax):
# Colors
input_color = '#b3e0ff' # Light blue
hidden_color = '#d9d9d9' # Light gray
output_color = '#b3ffb3' # Light green
# Number of time steps to show
time_steps = 4
# Size parameters
node_radius = 0.06
spacing = 0.2
# Dictionary to store node positions for easier arrow drawing
positions = {}
# Draw time step labels
for t in range(time_steps):
ax.text(t*spacing + 0.1, 0.05, f"t={t}", ha='center', va='center', fontsize=12)
# First, create all positions to ensure they're available for arrows
for t in range(time_steps):
x_pos = t * spacing + 0.1
positions[('x', t)] = (x_pos, 0.3)
positions[('h', t)] = (x_pos, 0.5)
positions[('y', t)] = (x_pos, 0.7)
# Now draw nodes and connections
for t in range(time_steps):
x_pos = t * spacing + 0.1
# Input node
input_pos = positions[('x', t)]
input_node = Circle(input_pos, node_radius, color=input_color, ec='black', zorder=4)
ax.add_patch(input_node)
ax.text(input_pos[0], input_pos[1], f"$x_{{{t}}}$", ha='center', va='center', fontsize=10, zorder=5)
# Hidden node
hidden_pos = positions[('h', t)]
hidden_node = Circle(hidden_pos, node_radius, color=hidden_color, ec='black', zorder=4)
ax.add_patch(hidden_node)
ax.text(hidden_pos[0], hidden_pos[1], f"$h_{{{t}}}$", ha='center', va='center', fontsize=10, zorder=5)
# Output node
output_pos = positions[('y', t)]
output_node = Circle(output_pos, node_radius, color=output_color, ec='black', zorder=4)
ax.add_patch(output_node)
ax.text(output_pos[0], output_pos[1], f"$y_{{{t}}}$", ha='center', va='center', fontsize=10, zorder=5)
# Input to hidden connection
arrow = FancyArrowPatch(
(input_pos[0], input_pos[1] + node_radius),
(hidden_pos[0], hidden_pos[1] - node_radius),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1, color='black'
)
ax.add_patch(arrow)
# Hidden to output connection
arrow = FancyArrowPatch(
(hidden_pos[0], hidden_pos[1] + node_radius),
(output_pos[0], output_pos[1] - node_radius),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1, color='black'
)
ax.add_patch(arrow)
# Recurrent connection (except for the last time step)
if t < time_steps - 1:
arrow = FancyArrowPatch(
(positions[('h', t)][0] + node_radius, positions[('h', t)][1]),
(positions[('h', t+1)][0] - node_radius, positions[('h', t+1)][1]),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1, color='red'
)
ax.add_patch(arrow)
# Add weight labels
ax.text(0.1, 0.4, "$W_{xh}$", ha='center', va='center', fontsize=10, zorder=5, bbox=dict(facecolor='white', alpha=0.8))
ax.text(0.1, 0.6, "$W_{hy}$", ha='center', va='center', fontsize=10, zorder=5, bbox=dict(facecolor='white', alpha=0.8))
arrow_center = ((positions[('h', 0)][0] + positions[('h', 1)][0])/2, positions[('h', 0)][1] + 0.03)
ax.text(arrow_center[0], arrow_center[1], "$W_{hh}$", ha='center', va='center', fontsize=10, zorder=5, bbox=dict(facecolor='white', alpha=0.8))
# Add title
ax.set_title("Unfolded RNN Over Time", fontsize=14)
# Draw both representations
draw_compact_rnn(ax1)
draw_unfolded_rnn(ax2)
# Set limits and remove axes
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 1)
ax1.axis('off')
ax2.set_xlim(0, 0.8)
ax2.set_ylim(0, 0.8)
ax2.axis('off')
# Add a main title
fig.suptitle("Recurrent Neural Network Architecture", fontsize=16, y=0.98)
plt.tight_layout()
plt.savefig('recurrent_neural_network.svg', format='svg')
overfitting_visualization
CLOSED: [2025-04-12 Sat 18:16]
- State "DONE" from [2025-04-12 Sat 18:16]
code
"""
overfitting_visualization - Visualize the problem of overfitting in neural networks
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
# Create synthetic data with noise
np.random.seed(0)
n_samples = 30
X = np.sort(np.random.rand(n_samples))
y = np.sin(2 * np.pi * X) + np.random.normal(0, 0.1, n_samples)
# Split into training and test sets
X_train, y_train = X[:20].reshape(-1, 1), y[:20]
X_test, y_test = X[20:].reshape(-1, 1), y[20:]
# Create and fit models of different complexity
degrees = [1, 4, 15] # Linear, polynomial, and high-degree polynomial
colors = ['blue', 'green', 'red']
names = ['Linear (d=1)', 'Polynomial (d=4)', 'High-degree Polynomial (d=15)']
# Dense X for plotting smooth curves
X_plot = np.linspace(0, 1, 1000).reshape(-1, 1)
# Create plot
fig = plt.figure(figsize=(14, 10))
axs = [fig.add_subplot(2, 2, 1), fig.add_subplot(2, 2, 2)]  # top row: training fit and test fit
# Training performance
axs[0].scatter(X_train, y_train, color='black', s=30, label='Training data')
axs[0].set_title('Model Fit on Training Data')
axs[0].set_xlabel('x')
axs[0].set_ylabel('y')
axs[0].set_ylim(-1.5, 1.5)
# Test performance
axs[1].scatter(X_test, y_test, color='black', s=30, label='Test data')
axs[1].set_title('Model Performance on Test Data')
axs[1].set_xlabel('x')
axs[1].set_ylim(-1.5, 1.5)
# Add ground truth
ground_truth = np.sin(2 * np.pi * X_plot.ravel())
axs[0].plot(X_plot, ground_truth, 'k:', alpha=0.5, label='Ground truth', linewidth=2)
axs[1].plot(X_plot, ground_truth, 'k:', alpha=0.5, label='Ground truth', linewidth=2)
# Train models and plot predictions
training_error = []
test_error = []
for i, degree in enumerate(degrees):
# Create and fit model
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
model.fit(X_train, y_train)
# Predict
y_plot = model.predict(X_plot)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate errors
train_mse = np.mean((y_train - y_train_pred) ** 2)
test_mse = np.mean((y_test - y_test_pred) ** 2)
training_error.append(train_mse)
test_error.append(test_mse)
# Plot model fits
axs[0].plot(X_plot, y_plot, color=colors[i], linewidth=2,
label=f'{names[i]} (MSE: {train_mse:.3f})')
axs[1].plot(X_plot, y_plot, color=colors[i], linewidth=2,
label=f'{names[i]} (MSE: {test_mse:.3f})')
# Add legends
axs[0].legend(loc='upper right', fontsize=9)
axs[1].legend(loc='upper right', fontsize=9)
for ax in axs:
ax.grid(True, alpha=0.3)
fig.suptitle('Overfitting: Good Training Performance ≠ Good Generalization', fontsize=16)
# Add extra panel below showing training vs test error
ax3 = fig.add_subplot(2, 2, (3, 4))  # bottom panel spans both columns
ind = np.arange(len(degrees))
width = 0.35
training_bars = ax3.bar(ind - width/2, training_error, width, color='lightblue', label='Training Error')
test_bars = ax3.bar(ind + width/2, test_error, width, color='salmon', label='Test Error')
ax3.set_xticks(ind)
ax3.set_xticklabels([f'Model {i+1}\nd={d}' for i, d in enumerate(degrees)])
ax3.set_ylabel('Mean Squared Error')
ax3.set_title('Training vs Test Error')
ax3.legend()
ax3.grid(True, axis='y', alpha=0.3)
# Annotate overfitting region
ax3.annotate('Overfitting Region', xy=(2, test_error[2]), xytext=(1.5, test_error[2]+0.05),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5),
fontsize=12)
plt.tight_layout()
plt.savefig('overfitting_visualization.svg', format='svg')
regularization_visualization
CLOSED: [2025-04-12 Sat 18:16]
- State "DONE" from [2025-04-12 Sat 18:16]
code
"""
regularization_visualization - Visualize how L2 regularization prevents overfitting
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
# Create synthetic data with noise (same as overfitting example)
np.random.seed(0)
n_samples = 30
X = np.sort(np.random.rand(n_samples))
y = np.sin(2 * np.pi * X) + np.random.normal(0, 0.1, n_samples)
# Split into training and test sets
X_train, y_train = X[:20].reshape(-1, 1), y[:20]
X_test, y_test = X[20:].reshape(-1, 1), y[20:]
# Create polynomial degree
degree = 15 # High degree polynomial that would normally overfit
# Try different regularization strengths
alphas = [0, 0.001, 0.01, 0.1, 1.0]
colors = ['red', 'orange', 'green', 'blue', 'purple']
# Dense X for plotting smooth curves
X_plot = np.linspace(0, 1, 1000).reshape(-1, 1)
# Ground truth function
true_fun = lambda x: np.sin(2 * np.pi * x)
# Create plot
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.flatten()
# Make additional subplot for the error comparison
train_errors = []
test_errors = []
# Plot each regularization strength
for i, alpha in enumerate(alphas):
# Create and fit model with regularization
model = make_pipeline(
PolynomialFeatures(degree),
Ridge(alpha=alpha)
)
model.fit(X_train, y_train)
# Predict
y_plot = model.predict(X_plot)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate errors
train_mse = np.mean((y_train - y_train_pred) ** 2)
test_mse = np.mean((y_test - y_test_pred) ** 2)
train_errors.append(train_mse)
test_errors.append(test_mse)
# Plot model
if i < 5: # First 5 plots show individual models
ax = axs[i]
ax.scatter(X_train, y_train, color='black', s=30, label='Training data')
ax.scatter(X_test, y_test, color='black', s=30, alpha=0.3, label='Test data')
ax.plot(X_plot, true_fun(X_plot.ravel()), 'k:', alpha=0.5, label='Ground truth', linewidth=2)
ax.plot(X_plot, y_plot, color=colors[i], linewidth=2,
label=f'Model (d={degree}, λ={alpha})')
ax.set_ylim(-1.5, 1.5)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title(f'Regularization: λ={alpha}')
ax.grid(True, alpha=0.3)
ax.legend(fontsize=8, loc='upper right')
# Annotate errors
ax.text(0.05, -1.2, f'Train MSE: {train_mse:.4f}\nTest MSE: {test_mse:.4f}',
bbox=dict(facecolor='white', alpha=0.8))
# Plot learning curves (error vs. regularization strength)
ax = axs[5]
ax.plot(alphas, train_errors, 'o-', color='blue', label='Training error')
ax.plot(alphas, test_errors, 'o-', color='red', label='Test error')
ax.set_xscale('symlog', linthresh=1e-3)  # symlog keeps the λ=0 point visible (plain log would drop it)
ax.set_xlabel('Regularization strength (λ)')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Error vs. Regularization Strength')
ax.grid(True)
ax.legend()
# Find best alpha
best_alpha_idx = np.argmin(test_errors)
ax.annotate('Best λ', xy=(alphas[best_alpha_idx], test_errors[best_alpha_idx]),
xytext=(alphas[best_alpha_idx]*2, test_errors[best_alpha_idx]*0.7),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Highlight what's happening
overfitting_text = """
Without regularization (λ=0):
- The model fits training data very well
- But performs poorly on test data
- Learns noise in the data
"""
optimal_text = """
With optimal regularization:
- Balances model complexity
- Generalizes better to test data
- Prevents overfitting
"""
fig.text(0.02, 0.02, overfitting_text, fontsize=10,
bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))
fig.text(0.7, 0.02, optimal_text, fontsize=10,
bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))
fig.suptitle('Effect of L2 Regularization on Preventing Overfitting', fontsize=16)
plt.tight_layout()
plt.subplots_adjust(top=0.9, bottom=0.15)
plt.savefig('regularization_visualization.svg', format='svg')
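The cost being minimized, in the notation of Nielsen's chapter 3 (Ridge's alpha plays the role of λ here, modulo the 1/2n scaling convention):
\[ C = C_0 + \frac{\lambda}{2n}\sum_w w^2, \]
where C_0 is the unregularized cost; larger λ pulls the weights toward zero and tames the degree-15 polynomial.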