Principal Component Analysis (PCA)

Principal Component Analysis finds the directions of maximum variance in a dataset. It is one of the most widely used techniques for dimensionality reduction: by projecting data onto the top few principal components, we can capture most of the information in fewer dimensions.

The key insight is that the covariance matrix encodes all the variance structure. Its eigenvectors point along the principal directions and its eigenvalues measure the variance along each one.

In this notebook we:

  1. Generate 2D data with a known elongated, rotated structure
  2. Compute the covariance matrix with np_cov
  3. Find principal components via eigendecomposition with np_eig
  4. Project, visualize, and reconstruct the data
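As a cross-reference for the whole pipeline, here is a minimal sketch of the same four steps in plain NumPy (an assumption for illustration; the notebook itself uses Maxima's `np_*` bindings, which mirror the NumPy calls shown here):

```python
import numpy as np

rng = np.random.default_rng(0)

# 1. Rotated 2D cloud: sigma1 = 3 along one axis, sigma2 = 0.5 along the other,
#    rotated by theta = 30 degrees
theta = np.pi / 6
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
X = (rng.standard_normal((200, 2)) * [3.0, 0.5]) @ R.T

# 2. Sample covariance matrix (rowvar=False: columns are the variables)
C = np.cov(X, rowvar=False)

# 3. Eigendecomposition, sorted so PC1 is the dominant component
vals, vecs = np.linalg.eigh(C)           # eigh: symmetric input, ascending order
order = np.argsort(vals)[::-1]
vals, vecs = vals[order], vecs[:, order]

# 4. Project onto the principal components
Z = (X - X.mean(axis=0)) @ vecs

print(vals)   # variances along PC1 and PC2 -- close to 9 and 0.25
```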
In [12]:
load("numerics")$
load("ax-plots")$

Generating 2D Data with Known Structure

We create an elongated cloud of points rotated at 30 degrees. The data is generated along two orthogonal "true" principal axes with very different variances ($\sigma_1 = 3$, $\sigma_2 = 0.5$), then rotated by the angle $\theta = \pi/6$ within the $x$-$y$ plane.

This gives us ground truth to verify that PCA recovers the correct directions and variance magnitudes.
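The ground truth can be written down explicitly: the population covariance of the rotated data is $C = R \, \mathrm{diag}(\sigma_1^2, \sigma_2^2) \, R^T$. A quick NumPy sketch (NumPy here is an illustration, not the notebook's `np_*` API):

```python
import numpy as np

# Population covariance implied by the construction: rotate diag(sigma1^2, sigma2^2)
sigma1, sigma2, theta = 3.0, 0.5, np.pi / 6
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
C_true = R @ np.diag([sigma1**2, sigma2**2]) @ R.T

print(np.round(C_true, 4))
# diagonal: 6.8125 and 2.4375; off-diagonal: (9 - 0.25) * sin(theta) * cos(theta) ~ 3.7889
```

The sample covariance computed below should land close to this matrix, and its eigenvalues close to 9 and 0.25.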

In [13]:
n : 200$
angle : float(%pi/6)$   /* 30 degrees */

/* Generate data along the true principal axes */
pc1 : np_scale(3.0, np_randn([n]))$   /* high variance axis */
pc2 : np_scale(0.5, np_randn([n]))$   /* low variance axis */

/* Rotate into x-y coordinates */
x : np_add(np_scale(cos(angle), pc1), np_scale(-sin(angle), pc2))$
y : np_add(np_scale(sin(angle), pc1), np_scale(cos(angle), pc2))$

print("Generated", n, "points with sigma1=3, sigma2=0.5, angle=30 deg")$
Generated 200 points with sigma1=3, sigma2=0.5, angle=30 deg
In [14]:
/* Plot the raw data */
ax_draw2d(
  marker_size=4, opacity=0.5,
  points(x, y),
  title="Original Data",
  xlabel="x", ylabel="y",
  aspect_ratio=true
)$

Computing the Covariance Matrix

The covariance matrix $C$ summarizes how each pair of variables co-varies. For our 2D data (stacked as an $n \times 2$ matrix), $C$ is $2 \times 2$:

$$C = \frac{1}{n-1} \sum_{i=1}^{n} (\mathbf{x}_i - \bar{\mathbf{x}})(\mathbf{x}_i - \bar{\mathbf{x}})^T$$

The diagonal entries are the variances of $x$ and $y$; the off-diagonal entries are the covariance between them.
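The formula above is easy to verify against a library implementation. A NumPy cross-check (a sketch for illustration; the notebook uses `np_cov`):

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.standard_normal((200, 2)) @ np.array([[2.0, 0.3],
                                              [0.3, 1.0]])   # arbitrary correlated 2D data

# Sample covariance from the definition: mean-center, then (1/(n-1)) * Xc^T Xc
n = X.shape[0]
Xc = X - X.mean(axis=0)
C_manual = (Xc.T @ Xc) / (n - 1)

# Agrees with the built-in (rowvar=False means columns are the variables)
C_np = np.cov(X, rowvar=False)
print(np.allclose(C_manual, C_np))   # True
```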

In [15]:
/* Stack x, y into an n-by-2 matrix */
data : np_hstack(
  np_reshape(x, [n, 1]),
  np_reshape(y, [n, 1])
)$
print("Data shape:", np_shape(data))$

/* Compute the sample covariance matrix */
C : np_cov(data)$
print("Covariance matrix:")$
np_to_matrix(C);
Data shape: [200,2]
Covariance matrix:
Out[15]:

Eigendecomposition

The eigenvalues of the covariance matrix give the variance along each principal component, and the eigenvectors give the directions. The eigenvector with the largest eigenvalue points along the direction of greatest spread in the data.

We sort the eigenvalues in descending order so that PC1 always corresponds to the dominant component.
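The same decompose-and-sort step in NumPy, applied to a covariance matrix close to the one this notebook's construction produces (the matrix entries are taken from the 30-degree ground truth, not from the notebook's run):

```python
import numpy as np

# Covariance implied by the 30-degree construction (rounded)
C = np.array([[6.8125, 3.7889],
              [3.7889, 2.4375]])

# eigh is the right choice for symmetric matrices; it returns ascending eigenvalues
vals, vecs = np.linalg.eigh(C)

# Reorder descending so column 0 of vecs is the dominant component (PC1)
order = np.argsort(vals)[::-1]
vals, vecs = vals[order], vecs[:, order]

print(np.round(vals, 3))          # ~ [9.0, 0.25]
print(np.round(vecs[:, 0], 3))    # PC1 direction ~ +-(cos 30, sin 30) = +-(0.866, 0.5)
```

Note the sign ambiguity: an eigenvector is only defined up to a factor of -1, so PC1 may point in either direction along the elongated axis.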

In [16]:
/* Eigendecomposition of the covariance matrix */
[vals, vecs] : np_eig(C)$

/* Sort by descending eigenvalue so PC1 = dominant component */
if np_ref(vals, 0) < np_ref(vals, 1) then (
  vals : ndarray([np_ref(vals, 1), np_ref(vals, 0)], [2]),
  vecs : np_hstack(np_col(vecs, 1), np_col(vecs, 0))
)$

print("Eigenvalues (descending):")$
print(np_to_list(vals))$

print("Eigenvectors (columns):")$
np_to_matrix(vecs);
Eigenvalues (descending):
[9.883969942230777,0.24901138222797123]
Eigenvectors (columns):
Out[16]:
In [17]:
/* Explained variance ratio: fraction of total variance per component */
total_var : np_sum(vals)$
ratios : np_to_list(np_scale(1.0 / total_var, vals))$

print("Explained variance ratios:")$
for i : 1 thru length(ratios) do
  printf(true, "  PC~d: ~,1f%~%", i, ratios[i] * 100)$
Explained variance ratios:
  PC1: 97.5%
  PC2: 2.5%

Visualizing Principal Components

We overlay the eigenvectors on the scatter plot, scaled by the square root of the corresponding eigenvalue (so the arrow length is proportional to the standard deviation along that direction). The longer arrow should point along the elongated direction of the cloud.

In [18]:
/* Compute the mean of x and y for centering the arrows */
mx : np_mean(x)$
my : np_mean(y)$

/* Extract eigenvectors and scale by sqrt(eigenvalue) */
ev1 : np_to_list(np_col(vecs, 0))$
ev2 : np_to_list(np_col(vecs, 1))$
scale1 : sqrt(np_ref(vals, 0))$
scale2 : sqrt(np_ref(vals, 1))$

ax_draw2d(
  color="steelblue", marker_size=4, opacity=0.4,
  points(x, y),
  /* First eigenvector (larger variance β€” dominant PC) */
  color="darkred", line_width=3,
  lines([mx, mx + scale1 * ev1[1]], [my, my + scale1 * ev1[2]]),
  /* Second eigenvector (smaller variance) */
  color="red", line_width=3,
  lines([mx, mx + scale2 * ev2[1]], [my, my + scale2 * ev2[2]]),
  title="Data with Principal Component Directions",
  xlabel="x", ylabel="y",
  aspect_ratio=true
)$

Projecting onto Principal Components

To project the data onto the principal components, we:

  1. Mean-center the data (subtract the column means)
  2. Multiply by the eigenvector matrix: $Z = (X - \bar{X}) \, V$

In the projected coordinates, the axes are aligned with the principal directions and the covariance matrix becomes diagonal.
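A NumPy sketch of the projection and the diagonality check (for illustration; the notebook does the same with `np_matmul` and `np_cov`):

```python
import numpy as np

rng = np.random.default_rng(2)
theta = np.pi / 6
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
X = (rng.standard_normal((200, 2)) * [3.0, 0.5]) @ R.T

vals, V = np.linalg.eigh(np.cov(X, rowvar=False))
order = np.argsort(vals)[::-1]
vals, V = vals[order], V[:, order]

# Project: Z = (X - mean) V
Z = (X - X.mean(axis=0)) @ V

# In PC coordinates the sample covariance is diagonal,
# with the eigenvalues on the diagonal
C_pc = np.cov(Z, rowvar=False)
print(np.round(C_pc, 6))
```

The off-diagonal entries vanish up to floating-point error because $V$ diagonalizes the sample covariance exactly: $V^T C V = \mathrm{diag}(\lambda_1, \lambda_2)$.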

In [19]:
/* Mean-center each column */
mean_vec : np_mean(data, 0)$
centered : np_sub(data, np_reshape(mean_vec, [1, 2]))$

/* Project onto principal components */
projected : np_matmul(centered, vecs)$
print("Projected data shape:", np_shape(projected))$

/* Verify the projected covariance is (approximately) diagonal */
print("Covariance in PC space (should be nearly diagonal):")$
np_to_matrix(np_cov(projected));
Projected data shape: [200,2]
Covariance in PC space (should be nearly diagonal):
Out[19]:
In [20]:
/* Plot the projected data -- axes should be aligned with variance */
proj_x : np_col(projected, 0)$
proj_y : np_col(projected, 1)$

ax_draw2d(
  color="darkorange", marker_size=4, opacity=0.5,
  points(proj_x, proj_y),
  title="Data in Principal Component Space",
  xlabel="PC1", ylabel="PC2",
  aspect_ratio=true
)$

Explained Variance Plot

A bar chart of the explained variance ratio shows how much information each principal component captures. For our 2D data most of the variance should be in PC1 (the dominant eigenvalue, corresponding to $\sigma_1 = 3$).
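The ratios themselves are just the eigenvalues normalized by their sum. Using the (rounded) eigenvalues printed earlier in this notebook:

```python
import numpy as np

# Eigenvalues copied (rounded) from the notebook output above
vals = np.array([9.884, 0.249])
ratios = vals / vals.sum()
print(np.round(ratios, 3))   # -> [0.975 0.025]
```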

In [21]:
ax_draw2d(
  color="steelblue",
  ax_bar(["PC1", "PC2"], ratios),
  title="Explained Variance Ratio",
  ylabel="Fraction of Total Variance"
)$

Reconstruction from the First Principal Component

Dimensionality reduction means keeping only the top $k$ principal components. Here we reconstruct the data using only PC1 (the dominant component) and discard PC2 (the low-variance direction).

The reconstruction formula is:

$$\hat{X} = Z_k \, V_k^T + \bar{X}$$

where $Z_k$ contains only the scores for the kept components and $V_k$ contains the corresponding eigenvectors.
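The reconstruction formula can be sketched in NumPy, along with a useful sanity check: the mean squared reconstruction error equals (up to the $1/(n-1)$ convention) the eigenvalue of the discarded component. This is a cross-check for illustration, not the notebook's `np_*` code:

```python
import numpy as np

rng = np.random.default_rng(3)
theta = np.pi / 6
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
X = (rng.standard_normal((200, 2)) * [3.0, 0.5]) @ R.T

mean = X.mean(axis=0)
vals, V = np.linalg.eigh(np.cov(X, rowvar=False))
order = np.argsort(vals)[::-1]
vals, V = vals[order], V[:, order]

# Keep k = 1: scores Z_k on PC1, then X_hat = Z_k V_k^T + mean
Vk = V[:, :1]                    # (2, 1) dominant eigenvector
Zk = (X - mean) @ Vk             # (n, 1) scores
X_hat = Zk @ Vk.T + mean

# The squared residuals are exactly the discarded PC2 scores squared
mse = np.mean(np.sum((X - X_hat) ** 2, axis=1))
print(mse)   # close to the smaller eigenvalue, ~ 0.25
```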

In [22]:
/* Keep only the dominant PC (column 0, the largest eigenvalue) */
scores_dominant : np_reshape(np_col(projected, 0), [n, 1])$
vec_dominant : np_reshape(np_col(vecs, 0), [1, 2])$

/* Reconstruct: project back and add the mean */
reconstructed : np_add(
  np_matmul(scores_dominant, vec_dominant),
  np_reshape(mean_vec, [1, 2])
)$

rx : np_col(reconstructed, 0)$
ry : np_col(reconstructed, 1)$

ax_draw2d(
  color="steelblue", marker_size=4, opacity=0.3,
  name="Original", points(x, y),
  color="red", marker_size=4, opacity=0.7,
  name="Reconstructed (1 PC)", points(rx, ry),
  title="Original vs Reconstructed from Dominant PC",
  xlabel="x", ylabel="y",
  aspect_ratio=true
)$

Summary

Principal Component Analysis decomposes the covariance matrix to find the orthogonal directions of maximum variance:

  1. Covariance matrix (np_cov) -- encodes all pairwise variance and covariance between variables.
  2. Eigendecomposition (np_eig) -- eigenvalues give the variance along each principal direction; eigenvectors give the directions.
  3. Projection (np_matmul) -- transforms data into the principal component coordinate system where the covariance is diagonal.
  4. Reconstruction -- keeping only the top $k$ components gives a low-dimensional approximation that preserves the most variance.

PCA is the foundation for many techniques in statistics, machine learning, and signal processing. The covariance matrix is the only input needed -- all the geometric structure of the data is encoded in its eigenstructure.