Nielsen's Figures
ntfs (note-to-future-self): these are all SVGs, created in Python using matplotlib.
I could not get my dirty little paws on Nielsen's TikZ code for the neural net diagrams; he compiled them to PNGs on his own site. He also used MathJax to typeset his mathematics.
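The recipe is the same everywhere: build the figure, write it out with savefig(..., format='svg'), close it so nothing pops up. A minimal sketch of that pattern (the filename here is just a placeholder, not one of the figures below):
code
import numpy as np
import matplotlib.pyplot as plt
z = np.linspace(-1, 1, 100)
fig, ax = plt.subplots()
ax.plot(z, z**2)
plt.savefig('example.svg', format='svg')  # write the vector graphic
plt.close(fig)  # close instead of show, so nothing is displayed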
relu
code
import numpy as np
import matplotlib.pyplot as plt
z = np.arange(-2, 2, .1)
zero = np.zeros(len(z))
y = np.max([zero, z], axis=0)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, y)
ax.set_ylim([-2.0, 2.0])
ax.set_xlim([-2.0, 2.0])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('Rectified linear unit')
# Save as SVG instead of displaying
plt.savefig('relu.svg', format='svg')
plt.close(fig) # Close the figure to prevent display
sigmoid
CLOSED: [2025-04-12 Sat 17:22]
- State "DONE" from [2025-04-12 Sat 17:22]
code
import numpy
import matplotlib.pyplot as plt
z = numpy.arange(-5, 5, .1)
sigma_fn = numpy.vectorize(lambda z: 1/(1+numpy.exp(-z)))
sigma = sigma_fn(z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, sigma)
ax.set_ylim([-0.5, 1.5])
ax.set_xlim([-5,5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('sigmoid function')
plt.savefig('sigmoid.svg', format='svg')
step
CLOSED: [2025-04-12 Sat 17:22]
- State "DONE" from [2025-04-12 Sat 17:22]
code
import numpy
import matplotlib.pyplot as plt
z = numpy.arange(-5, 5, .02)
step_fn = numpy.vectorize(lambda z: 1.0 if z >= 0.0 else 0.0)
step = step_fn(z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, step)
ax.set_ylim([-0.5, 1.5])
ax.set_xlim([-5,5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('step function')
plt.savefig('step.svg', format='svg')
tanh
CLOSED: [2025-04-12 Sat 17:22]
- State "DONE" from [2025-04-12 Sat 17:22]
code
import numpy as np
import matplotlib.pyplot as plt
z = np.arange(-5, 5, .1)
t = np.tanh(z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, t)
ax.set_ylim([-1.0, 1.0])
ax.set_xlim([-5,5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('tanh function')
plt.savefig('tanh.svg', format='svg')
diff sigmoid
CLOSED: [2025-04-15 Tue 12:07]
- State "DONE" from [2025-04-15 Tue 12:07]
code
import numpy as np
import matplotlib.pyplot as plt
# Define sigmoid and its derivative
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def sigmoid_prime(z):
s = sigmoid(z)
return s * (1 - s)
# z values
z = np.arange(-5, 5, 0.1)
s_prime = sigmoid_prime(z)
# Plot
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, s_prime)
ax.set_ylim([0.0, 0.3])
ax.set_xlim([-5, 5])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('Derivative of the sigmoid function')
plt.savefig('sigmoid_prime.svg', format='svg')
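For the record, the identity the code relies on (standard calculus, not a quote from Nielsen):
\[ \sigma'(z) = \sigma(z)\bigl(1-\sigma(z)\bigr), \qquad \max_z \sigma'(z) = \sigma'(0) = \tfrac{1}{4}, \]
which is why the y-axis tops out just above 0.25.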
valley
CLOSED: [2025-04-12 Sat 17:50]
- State "DONE" from [2025-04-12 Sat 17:50]
code
"""
valley - Plots a function of two variables to minimize.
The function is a fairly generic valley function.
"""
# Third party libraries
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED: create a 3D axis properly
X = np.arange(-1, 1, 0.1)
Y = np.arange(-1, 1, 0.1)
X, Y = np.meshgrid(X, Y)
Z = X**2 + Y**2
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors,
linewidth=0)
ax.set_xlim3d(-1, 1)
ax.set_ylim3d(-1, 1)
ax.set_zlim3d(0, 2)
ax.xaxis.set_major_locator(LinearLocator(3))
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
ax.text(1.79, 0, 1.62, "$C$", fontsize=20)
ax.text(0.05, -1.8, 0, "$v_1$", fontsize=20)
ax.text(1.5, -0.25, 0, "$v_2$", fontsize=20)
plt.savefig('valley.svg', format='svg')
plt.close(fig)  # close instead of show, so only the SVG is produced
valley2
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
valley2 - Plots a function of two variables to minimize.
This is the second valley function visualization.
"""
# Third party libraries
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED: Use add_subplot for 3D
X = np.arange(-1, 1, 0.1)
Y = np.arange(-1, 1, 0.1)
X, Y = np.meshgrid(X, Y)
Z = X**2 + 10*Y**2
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors,
linewidth=0)
ax.set_xlim3d(-1, 1)
ax.set_ylim3d(-1, 1)
ax.set_zlim3d(0, 10)
ax.xaxis.set_major_locator(LinearLocator(3))
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
ax.text(1.79, 0, 8.4, "$C$", fontsize=20)
ax.text(0.05, -1.8, 0, "$v_1$", fontsize=20)
ax.text(1.5, -0.25, 0, "$v_2$", fontsize=20)
plt.savefig('valley2.svg', format='svg')
plt.close(fig)
false_minima
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
false_minima - Plots a function of two variables with many false minima.
"""
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
X = np.arange(-5, 5, 0.1)
Y = np.arange(-5, 5, 0.1)
X, Y = np.meshgrid(X, Y)
Z = np.sin(X) * np.sin(Y) + 0.2 * X
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors, linewidth=0)
ax.set_xlim3d(-5, 5)
ax.set_ylim3d(-5, 5)
ax.set_zlim3d(-2, 2)
ax.xaxis.set_major_locator(LinearLocator(3)) # FIXED
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
plt.savefig('false_minima.svg', format='svg')
plt.close(fig)
misleading_gradient_contours
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
misleading_gradient_contours - Plots the contours of a function with misleading gradients
"""
# Third party libraries
import matplotlib.pyplot as plt
import numpy as np
X = np.arange(-1, 1, 0.02)
Y = np.arange(-1, 1, 0.02)
X, Y = np.meshgrid(X, Y)
Z = X**2 + 10*Y**2
plt.figure()
CS = plt.contour(X, Y, Z, levels=[0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
plt.xlabel("$w_1$", fontsize=16)
plt.ylabel("$w_2$", fontsize=16)
plt.savefig('misleading_gradient_contours.svg', format='svg')
plt.close() # Close the figure to prevent display
misleading_gradient
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
misleading_gradient - Plots a function which misleads the gradient descent algorithm.
"""
from matplotlib.ticker import LinearLocator
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
X = np.arange(-1, 1, 0.025)
Y = np.arange(-1, 1, 0.025)
X, Y = np.meshgrid(X, Y)
Z = X**2 + 10*Y**2
colortuple = ('w', 'b')
colors = np.empty(X.shape, dtype=str)
for x in range(len(X)):
for y in range(len(Y)):
colors[x, y] = colortuple[(x + y) % 2]
surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, facecolors=colors, linewidth=0)
ax.set_xlim3d(-1, 1)
ax.set_ylim3d(-1, 1)
ax.set_zlim3d(0, 12)
ax.xaxis.set_major_locator(LinearLocator(3)) # FIXED
ax.yaxis.set_major_locator(LinearLocator(3))
ax.zaxis.set_major_locator(LinearLocator(3))
ax.text(0.05, -1.8, 0, "$w_1$", fontsize=20)
ax.text(1.5, -0.25, 0, "$w_2$", fontsize=20)
ax.text(1.79, 0, 9.62, "$C$", fontsize=20)
plt.savefig('misleading_gradient.svg', format='svg')
plt.close(fig)
pca_limitations
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
pca_limitations - Plot graphs to illustrate the limitations of PCA.
"""
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
# Data points only
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
z = np.linspace(-2, 2, 20)
theta = np.linspace(-4 * np.pi, 4 * np.pi, 20)
x = np.sin(theta) + 0.03 * np.random.randn(20)
y = np.cos(theta) + 0.03 * np.random.randn(20)
ax.plot(x, y, z, 'ro')
plt.savefig('pca_limitations_data.svg', format='svg')
plt.close(fig)
# Helix + data
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d') # FIXED
z_helix = np.linspace(-2, 2, 100)
theta_helix = np.linspace(-4 * np.pi, 4 * np.pi, 100)
x_helix = np.sin(theta_helix)
y_helix = np.cos(theta_helix)
ax.plot(x, y, z, 'ro') # replotting noisy data
ax.plot(x_helix, y_helix, z_helix, 'b-')
plt.savefig('pca_limitations_helix.svg', format='svg')
plt.close(fig)
backprop_magnitude_nabla
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
backprop_magnitude_nabla - Plotting the magnitude of gradient terms during backpropagation
"""
# Third-party libraries
import matplotlib.pyplot as plt
# Data from backpropagation in a 784-30-30-30-30-30-10 network
nw1 = [0.129173436407863, 0.4242933114455002,
1.6154682713449411, 7.5451567587160069]
nw2 = [0.12571016850457151, 0.44231149185805047,
1.8435833504677326, 7.61973813981073]
nw3 = [0.15854489503205446, 0.70244235144444678,
2.6294803575724157, 10.427062019753425]
plt.figure()
plt.plot(range(1, 5), nw1, "ro-", range(1, 5), nw2, "go-",
range(1, 5), nw3, "bo-")
plt.xlabel('Layer $l$')
plt.ylabel(r"$\Vert\nabla C^l_w\Vert$")
plt.xticks([1, 2, 3, 4])
plt.savefig('backprop_magnitude_nabla.svg', format='svg')
plt.close() # Close the figure to prevent display
softmax
CLOSED: [2025-04-12 Sat 17:51]
- State "DONE" from [2025-04-12 Sat 17:51]
code
"""
softmax - Plot the softmax activation function for different temperature values
"""
import numpy as np
import matplotlib.pyplot as plt
# Define the softmax function
def softmax(x, temperature=1.0):
"""Compute softmax values for array of logits with temperature scaling."""
# Subtract max for numerical stability (prevents overflow)
x = x / temperature
e_x = np.exp(x - np.max(x))
return e_x / e_x.sum()
# Create input values
x = np.array([0.1, 0.2, 0.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
# Plot softmax with different temperature values
fig, ax = plt.subplots(figsize=(10, 5))
temps = [0.5, 1.0, 2.0]
bar_width = 0.25
index = np.arange(len(x))
for i, temp in enumerate(temps):
y = softmax(x, temperature=temp)
offset = (i - 1) * bar_width
ax.bar(index + offset, y, bar_width, label=f'T={temp}')
ax.set_xlabel('Class')
ax.set_ylabel('Probability')
ax.set_title('Softmax function with varying temperature')
ax.set_xticks(index)
ax.set_ylim(0, 1)
ax.legend()
ax.grid(True, axis='y', alpha=0.3)
plt.savefig('softmax.svg', format='svg')
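What the bars show, written out (my own note, with T the temperature parameter from the code):
\[ \mathrm{softmax}(x;T)_i = \frac{e^{x_i/T}}{\sum_j e^{x_j/T}} \]
Smaller T sharpens the distribution toward the largest logit; larger T flattens it toward uniform.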
leaky_relu
CLOSED: [2025-04-12 Sat 17:52]
- State "DONE" from [2025-04-12 Sat 17:52]
code
"""
leaky_relu - Plot the leaky ReLU activation function
"""
import numpy as np
import matplotlib.pyplot as plt
z = np.arange(-2, 2, .1)
alpha = 0.1
y = np.maximum(alpha * z, z)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(z, y)
ax.set_ylim([-0.5, 2.0])
ax.set_xlim([-2.0, 2.0])
ax.grid(True)
ax.set_xlabel('z')
ax.set_title('Leaky Rectified Linear Unit (alpha=0.1)')
plt.savefig('leaky_relu.svg', format='svg')
gradient_descent
CLOSED: [2025-04-12 Sat 17:52]
- State "DONE" from [2025-04-12 Sat 17:52]
code
"""
gradient_descent - Visualize gradient descent optimization in 2D
"""
import numpy as np
import matplotlib.pyplot as plt
# Create a simple quadratic function
def f(x, y):
return x**2 + 10*y**2
# Create grid of x, y values
x = np.linspace(-2, 2, 100)
y = np.linspace(-0.7, 0.7, 100)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
# Gradient descent path (simulated)
start_x, start_y = -1.8, 0.6
learning_rate = 0.1
points = [(start_x, start_y)]
for _ in range(15):
grad_x = 2 * points[-1][0]
grad_y = 20 * points[-1][1]
new_x = points[-1][0] - learning_rate * grad_x
new_y = points[-1][1] - learning_rate * grad_y
points.append((new_x, new_y))
# Create plot
fig, ax = plt.subplots(figsize=(10, 6))
# Plot contour
CS = plt.contour(X, Y, Z, levels=np.logspace(0, 2, 10))
plt.clabel(CS, inline=True, fontsize=8)
# Plot gradient descent path
path_x, path_y = zip(*points)
ax.plot(path_x, path_y, 'ro-', markersize=6, linewidth=1.5,
label='Gradient Descent Path', alpha=0.7)
# Annotate start and finish points
ax.annotate('Start', xy=(start_x, start_y), xytext=(start_x-0.4, start_y+0.1),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax.annotate('End', xy=(points[-1][0], points[-1][1]),
xytext=(points[-1][0]+0.3, points[-1][1]+0.1),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_title('Gradient Descent Optimization')
ax.grid(True)
ax.legend(loc='upper right')
plt.savefig('gradient_descent.svg', format='svg')
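The update the loop above simulates, stated once for my future self:
\[ w \;\to\; w - \eta\,\nabla C, \qquad \nabla C = (2w_1,\; 20w_2) \text{ for } C = w_1^2 + 10 w_2^2, \]
with learning rate η = 0.1, which is why the path zig-zags along the steep w_2 direction.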
simple_neural_network
CLOSED: [2025-04-12 Sat 17:52]
- State "DONE" from [2025-04-12 Sat 17:52]
code
"""
simple_neural_network - Visualize a simple neural network architecture with clearer structure
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, FancyArrowPatch
# Set up the figure and axis
fig, ax = plt.subplots(figsize=(10, 7))
# Network parameters
layer_sizes = [4, 5, 3] # Number of neurons per layer
n_layers = len(layer_sizes)
layer_names = ['Input\nLayer', 'Hidden\nLayer', 'Output\nLayer']
# Colors and sizes
node_colors = ['#b3e0ff', '#d9d9d9', '#b3ffb3'] # Light blue, light gray, light green
node_size = 0.15
layer_spacing = 2.0
vertical_spacing = 0.7
# Positions for each layer
layer_positions = [i * layer_spacing for i in range(n_layers)]
# Draw the network
for l, layer_size in enumerate(layer_sizes):
# Calculate vertical positions for this layer
y_positions = np.linspace(0, (layer_size-1) * vertical_spacing, layer_size)
# Center the layer vertically
y_positions = y_positions - np.mean(y_positions)
# Draw the nodes
for i, y in enumerate(y_positions):
# Create and draw the neuron circle
circle = Circle((layer_positions[l], y), node_size,
color=node_colors[l], ec='black', zorder=4)
ax.add_patch(circle)
# Label the neurons
if l == 0: # Input layer
ax.text(layer_positions[l] - 0.1, y, f'$x_{i+1}$',
ha='right', va='center', fontsize=12)
elif l == n_layers - 1: # Output layer
ax.text(layer_positions[l] + 0.1, y, f'$y_{i+1}$',
ha='left', va='center', fontsize=12)
# Add layer label
ax.text(layer_positions[l], -layer_sizes[0]*vertical_spacing/1.7,
layer_names[l], ha='center', va='top', fontsize=14,
bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.5'))
# Draw connections to next layer
if l < n_layers - 1:
next_y_positions = np.linspace(0, (layer_sizes[l+1]-1) * vertical_spacing, layer_sizes[l+1])
next_y_positions = next_y_positions - np.mean(next_y_positions)
for i, y_start in enumerate(y_positions):
for j, y_end in enumerate(next_y_positions):
# Draw an arrow from this node to the next
arrow = FancyArrowPatch(
(layer_positions[l] + node_size, y_start),
(layer_positions[l+1] - node_size, y_end),
connectionstyle=f"arc3,rad=0.1",
arrowstyle="-|>", linewidth=0.8, color='gray', alpha=0.6, zorder=1
)
ax.add_patch(arrow)
# Set limits and remove axes
ax.set_xlim(-0.5, layer_positions[-1] + 0.5)
ax.set_ylim(-layer_sizes[0]*vertical_spacing/1.5, layer_sizes[0]*vertical_spacing/1.5)
ax.axis('off')
ax.set_title('Neural Network Architecture', fontsize=16)
plt.tight_layout()
plt.savefig('simple_neural_network.svg', format='svg')
vanishing_gradient
CLOSED: [2025-04-12 Sat 17:53]
- State "DONE" from [2025-04-12 Sat 17:53]
code
"""
vanishing_gradient - Visualize the vanishing gradient problem in deep networks
"""
import numpy as np
import matplotlib.pyplot as plt
# Sigmoid function and its derivative
def sigmoid(z):
return 1.0/(1.0 + np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
# Create input values
z = np.linspace(-10, 10, 1000)
sigmoid_z = sigmoid(z)
derivative = sigmoid_prime(z)
# Plot the sigmoid and its derivative
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Sigmoid function
ax1.plot(z, sigmoid_z, 'b-', linewidth=2)
ax1.set_title('Sigmoid Function')
ax1.set_xlabel('z')
ax1.set_ylabel('σ(z)')
ax1.grid(True)
# Derivative of sigmoid
ax2.plot(z, derivative, 'r-', linewidth=2)
ax2.set_title('Derivative of Sigmoid')
ax2.set_xlabel('z')
ax2.set_ylabel("σ'(z)")
ax2.grid(True)
# Add annotation to show vanishing gradient
ax2.annotate('Vanishing gradient\nregions', xy=(-8, 0.0004), xytext=(-7, 0.05),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax2.annotate('Vanishing gradient\nregions', xy=(8, 0.0004), xytext=(7, 0.05),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
plt.tight_layout()
plt.savefig('vanishing_gradient.svg', format='svg')
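Reminder of why the two panels matter (my paraphrase of the chapter-5 argument for the toy one-neuron-per-layer chain, not a quote): backprop multiplies one factor of w_l σ'(z_l) per layer, e.g.
\[ \frac{\partial C}{\partial b_1} = \sigma'(z_1)\, w_2\, \sigma'(z_2)\, w_3\, \sigma'(z_3)\, \cdots\, \frac{\partial C}{\partial a_L}, \]
and since σ'(z) ≤ 1/4 each factor is small unless the weights are large, so the gradient shrinks roughly geometrically with depth.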
learning_rate_effects
CLOSED: [2025-04-12 Sat 17:53]
- State "DONE" from [2025-04-12 Sat 17:53]
code
"""
learning_rate_effects - Visualize the effect of different learning rates in gradient descent
"""
import numpy as np
import matplotlib.pyplot as plt
# Function to optimize
def f(x):
return 0.1 * x**4 - 0.5 * x**3 - 0.2 * x**2 + 2 * x + 2
# Derivative of the function
def df(x):
return 0.4 * x**3 - 1.5 * x**2 - 0.4 * x + 2
# Create x values
x = np.linspace(-3, 3, 1000)
y = f(x)
# Define different learning rates and starting points
learning_rates = [0.01, 0.05, 0.2]
start_x = 2.5
iterations = 20
# Plot function
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y, 'b-', linewidth=2, label='f(x)')
ax.grid(True)
# Colors for different learning rates
colors = ['green', 'orange', 'red']
markers = ['o', 's', '^']
# Run gradient descent with different learning rates
for i, lr in enumerate(learning_rates):
path_x = [start_x]
path_y = [f(start_x)]
current_x = start_x
for _ in range(iterations):
# Gradient descent update
gradient = df(current_x)
current_x = current_x - lr * gradient
# Store points for plotting
path_x.append(current_x)
path_y.append(f(current_x))
# Plot path
ax.plot(path_x, path_y, color=colors[i], marker=markers[i], markersize=6,
linewidth=1.5, alpha=0.7, label=f'η = {lr}')
# Add annotation for the final point
ax.annotate(f'Final (η={lr})', xy=(path_x[-1], path_y[-1]),
xytext=(path_x[-1] + 0.3, path_y[-1] + 0.5),
arrowprops=dict(facecolor=colors[i], shrink=0.05, width=1.5))
# Annotate starting point
ax.annotate('Start', xy=(start_x, f(start_x)), xytext=(start_x + 0.3, f(start_x) + 1.5),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
ax.set_title('Effect of Learning Rate on Gradient Descent')
ax.legend(loc='upper right')
plt.savefig('learning_rate_effects.svg', format='svg')
dropout_regularization
CLOSED: [2025-04-12 Sat 17:54]
- State "DONE" from [2025-04-12 Sat 17:54]
code
"""
dropout_regularization - Visualize dropout regularization in neural networks with improved clarity
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, FancyArrowPatch
# Function to draw a neural network with optional dropout
def draw_network(ax, title, dropout=False):
# Network parameters
layer_sizes = [3, 8, 8, 2] # Number of neurons per layer
n_layers = len(layer_sizes)
layer_names = ['Input', 'Hidden 1', 'Hidden 2', 'Output']
# Colors and sizes
active_color = '#b3e0ff' # Light blue for active neurons
dropout_color = '#ffcccc' # Light red for dropped out neurons
node_size = 0.15
layer_spacing = 2.0
vertical_spacing = 0.5
# Set random seed for reproducibility
np.random.seed(42)
# Generate dropout masks for hidden layers
dropout_masks = []
for l in range(1, n_layers-1): # Only for hidden layers
# 50% dropout rate
mask = np.random.rand(layer_sizes[l]) > 0.5 if dropout else np.ones(layer_sizes[l])
dropout_masks.append(mask)
# Positions for each layer
layer_positions = [i * layer_spacing for i in range(n_layers)]
# Store node positions for connection drawing
node_positions = {}
# Draw the network
for l in range(n_layers):
# Calculate vertical positions for this layer
y_positions = np.linspace(0, (layer_sizes[l]-1) * vertical_spacing, layer_sizes[l])
# Center the layer vertically
y_positions = y_positions - np.mean(y_positions)
# Draw the nodes
for i, y in enumerate(y_positions):
# Determine if this neuron is dropped out
is_dropout = False
if dropout and l > 0 and l < n_layers-1:
is_dropout = not dropout_masks[l-1][i]
# Store position for connections
node_positions[(l, i)] = (layer_positions[l], y)
# Create and draw the neuron circle
if not is_dropout:
# Active neuron
circle = Circle((layer_positions[l], y), node_size,
color=active_color, ec='black', zorder=4)
ax.add_patch(circle)
else:
# Dropped out neuron - draw with dashed lines
circle = Circle((layer_positions[l], y), node_size,
color=dropout_color, ec='red',
linestyle='dashed', alpha=0.7, zorder=4)
ax.add_patch(circle)
# Add a slash through dropped neurons
ax.plot([layer_positions[l]-node_size, layer_positions[l]+node_size],
[y+node_size, y-node_size], 'r-', linewidth=1.5, zorder=5)
# Add layer label
ax.text(layer_positions[l], -2.5,
layer_names[l], ha='center', va='center', fontsize=12,
bbox=dict(facecolor='white', alpha=0.7, boxstyle='round,pad=0.3'))
# Draw connections between layers
for l in range(n_layers-1):
for i in range(layer_sizes[l]):
# Skip connections from dropped out neurons
if dropout and l > 0 and l < n_layers-1 and not dropout_masks[l-1][i]:
continue
for j in range(layer_sizes[l+1]):
# Skip connections to dropped out neurons
if dropout and l+1 < n_layers-1 and not dropout_masks[l][j]:
continue
# Get node positions
start_pos = node_positions[(l, i)]
end_pos = node_positions[(l+1, j)]
# Draw an arrow from this node to the next
arrow = FancyArrowPatch(
(start_pos[0] + node_size, start_pos[1]),
(end_pos[0] - node_size, end_pos[1]),
connectionstyle=f"arc3,rad=0.1",
arrowstyle="-", linewidth=0.8,
color='gray', alpha=0.6, zorder=1
)
ax.add_patch(arrow)
# Set limits and remove axes
ax.set_xlim(-0.5, layer_positions[-1] + 0.5)
ax.set_ylim(-3, 2)
ax.axis('off')
ax.set_title(title, fontsize=14)
# Create the figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
# Draw the standard network
draw_network(ax1, 'Standard Neural Network')
# Draw the network with dropout
draw_network(ax2, 'Network with Dropout (50%)', dropout=True)
plt.tight_layout()
plt.savefig('dropout_regularization.svg', format='svg')
momentum_optimization
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
momentum_optimization - Visualization of gradient descent with momentum
"""
import numpy as np
import matplotlib.pyplot as plt
# Create a function with a ravine - common challenge for optimization
def f(x, y):
return 0.1 * x**2 + y**2
# Create grid of x, y values
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 1, 100)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)
# Run standard gradient descent
start_x, start_y = -1.8, 0.6
learning_rate = 0.1
std_points = [(start_x, start_y)]
for _ in range(20):
grad_x = 0.2 * std_points[-1][0] # Partial derivative with respect to x
grad_y = 2 * std_points[-1][1] # Partial derivative with respect to y
new_x = std_points[-1][0] - learning_rate * grad_x
new_y = std_points[-1][1] - learning_rate * grad_y
std_points.append((new_x, new_y))
# Run gradient descent with momentum
beta = 0.9 # Momentum parameter
momentum_points = [(start_x, start_y)]
v_x, v_y = 0, 0 # Initialize velocity
for _ in range(20):
grad_x = 0.2 * momentum_points[-1][0]
grad_y = 2 * momentum_points[-1][1]
# Update velocity with momentum
v_x = beta * v_x - learning_rate * grad_x
v_y = beta * v_y - learning_rate * grad_y
# Update position
new_x = momentum_points[-1][0] + v_x
new_y = momentum_points[-1][1] + v_y
momentum_points.append((new_x, new_y))
# Create plot
fig, ax = plt.subplots(figsize=(10, 6))
# Plot contour
CS = plt.contour(X, Y, Z, levels=np.logspace(-1, 1, 10))
plt.clabel(CS, inline=True, fontsize=8)
# Plot paths
std_x, std_y = zip(*std_points)
mom_x, mom_y = zip(*momentum_points)
ax.plot(std_x, std_y, 'r.-', markersize=8, linewidth=1.5,
label='Standard Gradient Descent', alpha=0.7)
ax.plot(mom_x, mom_y, 'b.-', markersize=8, linewidth=1.5,
label='Gradient Descent with Momentum', alpha=0.7)
# Add annotations
ax.annotate('Start', xy=(start_x, start_y), xytext=(start_x-0.4, start_y+0.2),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Highlight oscillations in standard GD
oscillation_idx = 10
ax.annotate('Oscillation', xy=(std_x[oscillation_idx], std_y[oscillation_idx]),
xytext=(std_x[oscillation_idx]-0.7, std_y[oscillation_idx]-0.2),
arrowprops=dict(facecolor='red', shrink=0.05, width=1.5))
# Highlight momentum's smoother path
smooth_idx = 10
ax.annotate('Smoother path', xy=(mom_x[smooth_idx], mom_y[smooth_idx]),
xytext=(mom_x[smooth_idx]+0.5, mom_y[smooth_idx]),
arrowprops=dict(facecolor='blue', shrink=0.05, width=1.5))
ax.set_xlabel('$w_1$')
ax.set_ylabel('$w_2$')
ax.set_title('Gradient Descent With and Without Momentum')
ax.grid(True)
ax.legend(loc='upper right')
plt.savefig('momentum_optimization.svg', format='svg')
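The two update rules being compared (as implemented in the loops above):
\[ \text{plain GD: } w \to w - \eta \nabla C, \qquad \text{momentum: } v \to \beta v - \eta \nabla C,\; w \to w + v, \]
with β = 0.9 and η = 0.1; the velocity term damps the oscillation across the narrow axis of the ravine.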
batch_normalization
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
batch_normalization - Visualization of how batch normalization affects feature distributions
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
# Set random seed for reproducibility
np.random.seed(42)
# Generate original feature distribution (skewed and shifted)
n_samples = 1000
original_data = np.random.randn(n_samples, 2)
# Apply a transformation to make data non-standard
original_data[:, 0] = 3 * original_data[:, 0] + 2 # Mean=2, Std=3
original_data[:, 1] = 0.5 * original_data[:, 1] - 1 # Mean=-1, Std=0.5
# Apply batch normalization
def batch_normalize(data):
# Calculate mean and std along first axis (across samples)
mean = np.mean(data, axis=0)
std = np.std(data, axis=0)
# Normalize
normalized_data = (data - mean) / (std + 1e-8)
return normalized_data, mean, std
normalized_data, mean, std = batch_normalize(original_data)
# Create the plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Plot original data
ax1.scatter(original_data[:, 0], original_data[:, 1], alpha=0.5, color='red')
ax1.set_title('Before Batch Normalization')
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.grid(True)
ax1.set_xlim(-10, 14)
ax1.set_ylim(-4, 4)
# Add an annotation about mean and variance
ax1.text(0.05, 0.95, f'Feature 1: μ={mean[0]:.1f}, σ={std[0]:.1f}\nFeature 2: μ={mean[1]:.1f}, σ={std[1]:.1f}',
transform=ax1.transAxes, va='top', bbox=dict(boxstyle='round,pad=0.5'))
# Add ellipse to show the spread
ellipse = Ellipse(xy=(mean[0], mean[1]), width=2*std[0], height=2*std[1],
angle=0, alpha=0.2, color='red')
ax1.add_patch(ellipse)
# Plot normalized data
ax2.scatter(normalized_data[:, 0], normalized_data[:, 1], alpha=0.5, color='blue')
ax2.set_title('After Batch Normalization')
ax2.set_xlabel('Feature 1')
ax2.set_ylabel('Feature 2')
ax2.grid(True)
ax2.set_xlim(-4, 4)
ax2.set_ylim(-4, 4)
# Add an annotation about mean and variance
norm_mean = np.mean(normalized_data, axis=0)
norm_std = np.std(normalized_data, axis=0)
ax2.text(0.05, 0.95, f'Feature 1: μ={norm_mean[0]:.1f}, σ={norm_std[0]:.1f}\nFeature 2: μ={norm_mean[1]:.1f}, σ={norm_std[1]:.1f}',
transform=ax2.transAxes, va='top', bbox=dict(boxstyle='round,pad=0.5))
# Add ellipse to show the spread
ellipse = Ellipse(xy=(0, 0), width=2, height=2,
angle=0, alpha=0.2, color='blue')
ax2.add_patch(ellipse)
plt.tight_layout()
plt.savefig('batch_normalization.svg', format='svg')
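The per-feature normalization being illustrated (the code adds the small ε to the standard deviation directly, rather than inside the square root as in the textbook form):
\[ \hat{x} = \frac{x - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}} \]
Full batch norm also learns a scale γ and shift β afterwards; this figure only shows the whitening step.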
convolutional_layer
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
convolutional_layer - Visualization of how convolutional filters work
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
# Create a simple 8x8 input image with a pattern
input_image = np.zeros((8, 8))
input_image[2:6, 2:6] = 1 # A small square in the middle
# Define a few different 3x3 convolutional filters
edge_detect_filter = np.array([
[-1, -1, -1],
[-1, 8, -1],
[-1, -1, -1]
])
horizontal_filter = np.array([
[-1, -1, -1],
[ 2, 2, 2],
[-1, -1, -1]
])
vertical_filter = np.array([
[-1, 2, -1],
[-1, 2, -1],
[-1, 2, -1]
])
# Apply convolution
def apply_convolution(image, kernel):
# Get dimensions
image_height, image_width = image.shape
kernel_height, kernel_width = kernel.shape
# Calculate output dimensions
output_height = image_height - kernel_height + 1
output_width = image_width - kernel_width + 1
# Initialize output
output = np.zeros((output_height, output_width))
# Apply convolution
for i in range(output_height):
for j in range(output_width):
output[i, j] = np.sum(image[i:i+kernel_height, j:j+kernel_width] * kernel)
return output
# Apply filters
edge_output = apply_convolution(input_image, edge_detect_filter)
horiz_output = apply_convolution(input_image, horizontal_filter)
vert_output = apply_convolution(input_image, vertical_filter)
# Create a custom colormap for better visualization
custom_cmap = LinearSegmentedColormap.from_list(
'custom_divergent',
['blue', 'white', 'red'],
N=256
)
# Create the visualization
fig, axs = plt.subplots(2, 4, figsize=(16, 8))
# Helper function to plot an image with consistent settings
def plot_image(ax, data, title, is_filter=False):
if is_filter:
im = ax.imshow(data, cmap=custom_cmap, vmin=-2, vmax=8)
else:
im = ax.imshow(data, cmap='viridis')
ax.set_title(title)
ax.set_xticks([])
ax.set_yticks([])
return im
# First row - the process for edge detection
plot_image(axs[0, 0], input_image, 'Input Image')
plot_image(axs[0, 1], edge_detect_filter, 'Edge Detection Filter', True)
axs[0, 2].text(0.5, 0.5, 'Convolution\nOperation', ha='center', va='center', fontsize=12)
axs[0, 2].set_xticks([])
axs[0, 2].set_yticks([])
axs[0, 2].add_patch(plt.Rectangle((0.2, 0.3), 0.6, 0.4, fill=False, edgecolor='black'))
axs[0, 2].arrow(0.35, 0.5, 0.25, 0, head_width=0.1, head_length=0.05, fc='black', ec='black')
plot_image(axs[0, 3], edge_output, 'Edge Detection Output')
# Second row - comparison of different filters
plot_image(axs[1, 0], input_image, 'Input Image')
plot_image(axs[1, 1], horizontal_filter, 'Horizontal Filter', True)
plot_image(axs[1, 2], vertical_filter, 'Vertical Filter', True)
combined = np.stack([edge_output, horiz_output, vert_output], axis=2)
combined = (combined - combined.min()) / (combined.max() - combined.min())  # rescale to [0, 1] so imshow treats it as valid RGB
plot_image(axs[1, 3], combined, 'Combined Output\n(RGB Channels)')
plt.tight_layout()
plt.savefig('convolutional_layer.svg', format='svg')
recurrent_neural_network
CLOSED: [2025-04-12 Sat 18:10]
- State "DONE" from [2025-04-12 Sat 18:10]
code
"""
recurrent_neural_network - Visualization of RNN unfolding over time
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, FancyArrowPatch, Rectangle
# Create the figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# First plot: Compact RNN representation
def draw_compact_rnn(ax):
# Colors
input_color = '#b3e0ff' # Light blue
hidden_color = '#d9d9d9' # Light gray
output_color = '#b3ffb3' # Light green
# Node positions
input_pos = (0.3, 0.5)
hidden_pos = (0.5, 0.5)
output_pos = (0.7, 0.5)
# Draw nodes
input_node = Circle(input_pos, 0.1, color=input_color, ec='black', zorder=4)
hidden_node = Circle(hidden_pos, 0.1, color=hidden_color, ec='black', zorder=4)
output_node = Circle(output_pos, 0.1, color=output_color, ec='black', zorder=4)
ax.add_patch(input_node)
ax.add_patch(hidden_node)
ax.add_patch(output_node)
# Node labels
ax.text(input_pos[0], input_pos[1], "$x$", ha='center', va='center', fontsize=12, zorder=5)
ax.text(hidden_pos[0], hidden_pos[1], "$h$", ha='center', va='center', fontsize=12, zorder=5)
ax.text(output_pos[0], output_pos[1], "$y$", ha='center', va='center', fontsize=12, zorder=5)
# Draw connections
# Input to hidden
arrow = FancyArrowPatch(
(input_pos[0] + 0.1, input_pos[1]),
(hidden_pos[0] - 0.1, hidden_pos[1]),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1.5, color='black'
)
ax.add_patch(arrow)
# Hidden to output
arrow = FancyArrowPatch(
(hidden_pos[0] + 0.1, hidden_pos[1]),
(output_pos[0] - 0.1, output_pos[1]),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1.5, color='black'
)
ax.add_patch(arrow)
# Recurrent connection
arrow = FancyArrowPatch(
(hidden_pos[0] + 0.05, hidden_pos[1] + 0.08),
(hidden_pos[0] - 0.05, hidden_pos[1] + 0.08),
connectionstyle="arc3,rad=-1.4",
arrowstyle="-|>", linewidth=1.5, color='red', zorder=3
)
ax.add_patch(arrow)
# Add layer labels
ax.text(input_pos[0], 0.2, "Input", ha='center', va='center', fontsize=12)
ax.text(hidden_pos[0], 0.2, "Hidden\nState", ha='center', va='center', fontsize=12)
ax.text(output_pos[0], 0.2, "Output", ha='center', va='center', fontsize=12)
# Add title
ax.set_title("Compact RNN Representation", fontsize=14)
# Second plot: Unfolded RNN over time
def draw_unfolded_rnn(ax):
# Colors
input_color = '#b3e0ff' # Light blue
hidden_color = '#d9d9d9' # Light gray
output_color = '#b3ffb3' # Light green
# Number of time steps to show
time_steps = 4
# Size parameters
node_radius = 0.06
spacing = 0.2
# Dictionary to store node positions for easier arrow drawing
positions = {}
# Draw time step labels
for t in range(time_steps):
ax.text(t*spacing + 0.1, 0.05, f"t={t}", ha='center', va='center', fontsize=12)
# First, create all positions to ensure they're available for arrows
for t in range(time_steps):
x_pos = t * spacing + 0.1
positions[('x', t)] = (x_pos, 0.3)
positions[('h', t)] = (x_pos, 0.5)
positions[('y', t)] = (x_pos, 0.7)
# Now draw nodes and connections
for t in range(time_steps):
x_pos = t * spacing + 0.1
# Input node
input_pos = positions[('x', t)]
input_node = Circle(input_pos, node_radius, color=input_color, ec='black', zorder=4)
ax.add_patch(input_node)
ax.text(input_pos[0], input_pos[1], f"$x_{{{t}}}$", ha='center', va='center', fontsize=10, zorder=5)
# Hidden node
hidden_pos = positions[('h', t)]
hidden_node = Circle(hidden_pos, node_radius, color=hidden_color, ec='black', zorder=4)
ax.add_patch(hidden_node)
ax.text(hidden_pos[0], hidden_pos[1], f"$h_{{{t}}}$", ha='center', va='center', fontsize=10, zorder=5)
# Output node
output_pos = positions[('y', t)]
output_node = Circle(output_pos, node_radius, color=output_color, ec='black', zorder=4)
ax.add_patch(output_node)
ax.text(output_pos[0], output_pos[1], f"$y_{{{t}}}$", ha='center', va='center', fontsize=10, zorder=5)
# Input to hidden connection
arrow = FancyArrowPatch(
(input_pos[0], input_pos[1] + node_radius),
(hidden_pos[0], hidden_pos[1] - node_radius),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1, color='black'
)
ax.add_patch(arrow)
# Hidden to output connection
arrow = FancyArrowPatch(
(hidden_pos[0], hidden_pos[1] + node_radius),
(output_pos[0], output_pos[1] - node_radius),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1, color='black'
)
ax.add_patch(arrow)
# Recurrent connection (except for the last time step)
if t < time_steps - 1:
arrow = FancyArrowPatch(
(positions[('h', t)][0] + node_radius, positions[('h', t)][1]),
(positions[('h', t+1)][0] - node_radius, positions[('h', t+1)][1]),
connectionstyle="arc3,rad=0",
arrowstyle="-|>", linewidth=1, color='red'
)
ax.add_patch(arrow)
# Add weight labels
ax.text(0.1, 0.4, "$W_{xh}$", ha='center', va='center', fontsize=10, zorder=5, bbox=dict(facecolor='white', alpha=0.8))
ax.text(0.1, 0.6, "$W_{hy}$", ha='center', va='center', fontsize=10, zorder=5, bbox=dict(facecolor='white', alpha=0.8))
arrow_center = ((positions[('h', 0)][0] + positions[('h', 1)][0])/2, positions[('h', 0)][1] + 0.03)
ax.text(arrow_center[0], arrow_center[1], "$W_{hh}$", ha='center', va='center', fontsize=10, zorder=5, bbox=dict(facecolor='white', alpha=0.8))
# Add title
ax.set_title("Unfolded RNN Over Time", fontsize=14)
# Draw both representations
draw_compact_rnn(ax1)
draw_unfolded_rnn(ax2)
# Set limits and remove axes
ax1.set_xlim(0, 1)
ax1.set_ylim(0, 1)
ax1.axis('off')
ax2.set_xlim(0, 0.8)
ax2.set_ylim(0, 0.8)
ax2.axis('off')
# Add a main title
fig.suptitle("Recurrent Neural Network Architecture", fontsize=16, y=0.98)
plt.tight_layout()
plt.savefig('recurrent_neural_network.svg', format='svg')
overfitting_visualization
CLOSED: [2025-04-12 Sat 18:16]
- State "DONE" from [2025-04-12 Sat 18:16]
code
"""
overfitting_visualization - Visualize the problem of overfitting in neural networks
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
# Create synthetic data with noise
np.random.seed(0)
n_samples = 30
X = np.sort(np.random.rand(n_samples))
y = np.sin(2 * np.pi * X) + np.random.normal(0, 0.1, n_samples)
# Split into training and test sets
X_train, y_train = X[:20].reshape(-1, 1), y[:20]
X_test, y_test = X[20:].reshape(-1, 1), y[20:]
# Create and fit models of different complexity
degrees = [1, 4, 15] # Linear, polynomial, and high-degree polynomial
colors = ['blue', 'green', 'red']
names = ['Linear (d=1)', 'Polynomial (d=4)', 'High-degree Polynomial (d=15)']
# Dense X for plotting smooth curves
X_plot = np.linspace(0, 1, 1000).reshape(-1, 1)
# Create plot
fig = plt.figure(figsize=(14, 10))
axs = [fig.add_subplot(2, 2, 1), fig.add_subplot(2, 2, 2)]  # top row: training fit and test fit
# Training performance
axs[0].scatter(X_train, y_train, color='black', s=30, label='Training data')
axs[0].set_title('Model Fit on Training Data')
axs[0].set_xlabel('x')
axs[0].set_ylabel('y')
axs[0].set_ylim(-1.5, 1.5)
# Test performance
axs[1].scatter(X_test, y_test, color='black', s=30, label='Test data')
axs[1].set_title('Model Performance on Test Data')
axs[1].set_xlabel('x')
axs[1].set_ylim(-1.5, 1.5)
# Add ground truth
ground_truth = np.sin(2 * np.pi * X_plot.ravel())
axs[0].plot(X_plot, ground_truth, 'k:', alpha=0.5, label='Ground truth', linewidth=2)
axs[1].plot(X_plot, ground_truth, 'k:', alpha=0.5, label='Ground truth', linewidth=2)
# Train models and plot predictions
training_error = []
test_error = []
for i, degree in enumerate(degrees):
# Create and fit model
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
model.fit(X_train, y_train)
# Predict
y_plot = model.predict(X_plot)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate errors
train_mse = np.mean((y_train - y_train_pred) ** 2)
test_mse = np.mean((y_test - y_test_pred) ** 2)
training_error.append(train_mse)
test_error.append(test_mse)
# Plot model fits
axs[0].plot(X_plot, y_plot, color=colors[i], linewidth=2,
label=f'{names[i]} (MSE: {train_mse:.3f})')
axs[1].plot(X_plot, y_plot, color=colors[i], linewidth=2,
label=f'{names[i]} (MSE: {test_mse:.3f})')
# Add legends
axs[0].legend(loc='upper right', fontsize=9)
axs[1].legend(loc='upper right', fontsize=9)
for ax in axs:
ax.grid(True, alpha=0.3)
fig.suptitle('Overfitting: Good Training Performance ≠ Good Generalization', fontsize=16)
# Add extra panel below showing training vs test error
ax3 = fig.add_subplot(2, 2, (3, 4))  # bottom panel spans both columns
ind = np.arange(len(degrees))
width = 0.35
training_bars = ax3.bar(ind - width/2, training_error, width, color='lightblue', label='Training Error')
test_bars = ax3.bar(ind + width/2, test_error, width, color='salmon', label='Test Error')
ax3.set_xticks(ind)
ax3.set_xticklabels([f'Model {i+1}\nd={d}' for i, d in enumerate(degrees)])
ax3.set_ylabel('Mean Squared Error')
ax3.set_title('Training vs Test Error')
ax3.legend()
ax3.grid(True, axis='y', alpha=0.3)
# Annotate overfitting region
ax3.annotate('Overfitting Region', xy=(2, test_error[2]), xytext=(1.5, test_error[2]+0.05),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5),
fontsize=12)
plt.tight_layout()
plt.savefig('overfitting_visualization.svg', format='svg')
regularization_visualization
CLOSED: [2025-04-12 Sat 18:16]
- State "DONE" from [2025-04-12 Sat 18:16]
code
"""
regularization_visualization - Visualize how L2 regularization prevents overfitting
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
# Create synthetic data with noise (same as overfitting example)
np.random.seed(0)
n_samples = 30
X = np.sort(np.random.rand(n_samples))
y = np.sin(2 * np.pi * X) + np.random.normal(0, 0.1, n_samples)
# Split into training and test sets
X_train, y_train = X[:20].reshape(-1, 1), y[:20]
X_test, y_test = X[20:].reshape(-1, 1), y[20:]
# Create polynomial degree
degree = 15 # High degree polynomial that would normally overfit
# Try different regularization strengths
alphas = [0, 0.001, 0.01, 0.1, 1.0]
colors = ['red', 'orange', 'green', 'blue', 'purple']
# Dense X for plotting smooth curves
X_plot = np.linspace(0, 1, 1000).reshape(-1, 1)
# Ground truth function
true_fun = lambda x: np.sin(2 * np.pi * x)
# Create plot
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.flatten()
# Make additional subplot for the error comparison
train_errors = []
test_errors = []
# Plot each regularization strength
for i, alpha in enumerate(alphas):
# Create and fit model with regularization
model = make_pipeline(
PolynomialFeatures(degree),
Ridge(alpha=alpha)
)
model.fit(X_train, y_train)
# Predict
y_plot = model.predict(X_plot)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Calculate errors
train_mse = np.mean((y_train - y_train_pred) ** 2)
test_mse = np.mean((y_test - y_test_pred) ** 2)
train_errors.append(train_mse)
test_errors.append(test_mse)
# Plot model
if i < 5: # First 5 plots show individual models
ax = axs[i]
ax.scatter(X_train, y_train, color='black', s=30, label='Training data')
ax.scatter(X_test, y_test, color='black', s=30, alpha=0.3, label='Test data')
ax.plot(X_plot, true_fun(X_plot.ravel()), 'k:', alpha=0.5, label='Ground truth', linewidth=2)
ax.plot(X_plot, y_plot, color=colors[i], linewidth=2,
label=f'Model (d={degree}, λ={alpha})')
ax.set_ylim(-1.5, 1.5)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title(f'Regularization: λ={alpha}')
ax.grid(True, alpha=0.3)
ax.legend(fontsize=8, loc='upper right')
# Annotate errors
ax.text(0.05, -1.2, f'Train MSE: {train_mse:.4f}\nTest MSE: {test_mse:.4f}',
bbox=dict(facecolor='white', alpha=0.8))
# Plot learning curves (error vs. regularization strength)
ax = axs[5]
ax.plot(alphas, train_errors, 'o-', color='blue', label='Training error')
ax.plot(alphas, test_errors, 'o-', color='red', label='Test error')
ax.set_xscale('symlog', linthresh=1e-3)  # symlog keeps the λ=0 point visible (plain log would drop it)
ax.set_xlabel('Regularization strength (λ)')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Error vs. Regularization Strength')
ax.grid(True)
ax.legend()
# Find best alpha
best_alpha_idx = np.argmin(test_errors)
ax.annotate('Best λ', xy=(alphas[best_alpha_idx], test_errors[best_alpha_idx]),
xytext=(alphas[best_alpha_idx]*2, test_errors[best_alpha_idx]*0.7),
arrowprops=dict(facecolor='black', shrink=0.05, width=1.5))
# Highlight what's happening
overfitting_text = """
Without regularization (λ=0):
- The model fits training data very well
- But performs poorly on test data
- Learns noise in the data
"""
optimal_text = """
With optimal regularization:
- Balances model complexity
- Generalizes better to test data
- Prevents overfitting
"""
fig.text(0.02, 0.02, overfitting_text, fontsize=10,
bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))
fig.text(0.7, 0.02, optimal_text, fontsize=10,
bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))
fig.suptitle('Effect of L2 Regularization on Preventing Overfitting', fontsize=16)
plt.tight_layout()
plt.subplots_adjust(top=0.9, bottom=0.15)
plt.savefig('regularization_visualization.svg', format='svg')
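The cost being minimized, in the notation of Nielsen's chapter 3 (Ridge's alpha plays the role of λ here, modulo the 1/2n scaling convention):
\[ C = C_0 + \frac{\lambda}{2n}\sum_w w^2, \]
where C_0 is the unregularized cost; larger λ pulls the weights toward zero and tames the degree-15 polynomial.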