Department of Mathematics
The Chinese University of Hong Kong
HSMMC Math Modelling Advanced Training Workshop
June 27, 2026
# ==============================================================================
# Step-by-Step PCA Implementation for Custom 2D Synthetic Dataset (N=7 samples)
# Fully match the manual arithmetic in the provided LaTeX document
# Plots: 1) Raw input scatter plot (before all calculations)
# 2) Covariance eigenvector axis plot + projected 1D PCA results (after computation)
# All intermediate outputs printed to console for manual answer verification
# ==============================================================================
import numpy as np
import matplotlib.pyplot as plt
# ------------------------------------------------------
# 1. Define raw input dataset (matches table in LaTeX)
# Rows = N=7 observations, Columns = x feature, y feature
# ------------------------------------------------------
# Raw (x, y) paired data from the paper table
raw_data = np.array([
[2.5, 2.4],
[0.5, 0.7],
[2.2, 2.9],
[1.9, 2.2],
[3.1, 3.0],
[2.3, 2.7],
[2.0, 1.6]
])
N = raw_data.shape[0] # Total samples N=7
d = raw_data.shape[1] # Feature dimension d=2
print("=" * 60)## ============================================================
## RAW INPUT DATASET (N=7 samples, 2 features)
## [[2.5 2.4]
## [0.5 0.7]
## [2.2 2.9]
## [1.9 2.2]
## [3.1 3. ]
## [2.3 2.7]
## [2. 1.6]]
## ============================================================
# --------------------------
# PLOT 1: Raw data visualization (display BEFORE any PCA calculation)
# --------------------------
plt.figure(figsize=(6, 5))
plt.scatter(raw_data[:, 0], raw_data[:, 1], color="darkblue", s=60, label="Raw (x,y) Samples")
plt.xlabel("x Feature")
plt.ylabel("y Feature")
plt.title("Raw 2D Input Dataset (Pre-PCA Visualization)")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()# ------------------------------------------------------
# Step 1: Mean Centering (Feature-wise mean subtraction)
# Compute feature means x̄, ȳ then center all points
# ------------------------------------------------------
# Calculate sample mean for each feature (column-wise average)
feature_means = np.mean(raw_data, axis=0)
x_mean = feature_means[0]
y_mean = feature_means[1]
# Print mean values to cross-check LaTeX manual calculation
print(f"\nFeature x mean x̄ = {x_mean:.6f} (Manual ref: 2.0714)")##
## Feature x mean x̄ = 2.071429 (Manual ref: 2.0714)
## Feature y mean ȳ = 2.214286 (Manual ref: 2.2142)
# Mean-centered data matrix X_centered = raw - mean
X_centered = raw_data - feature_means
print("\nMEAN-CENTERED DEVIATION MATRIX [X = x_i-x̄ , Y = y_i-ȳ]")##
## MEAN-CENTERED DEVIATION MATRIX [X = x_i-x̄ , Y = y_i-ȳ]
## [[ 0.4286 0.1857]
## [-1.5714 -1.5143]
## [ 0.1286 0.6857]
## [-0.1714 -0.0143]
## [ 1.0286 0.7857]
## [ 0.2286 0.4857]
## [-0.0714 -0.6143]]
# ------------------------------------------------------
# Step 2: Compute Unbiased Sample Covariance Matrix
# Formula: Σ = 1/(N-1) * X_centered^T @ X_centered
# Denominator N-1 = 6 for unbiased sample covariance
# ------------------------------------------------------
cov_matrix = np.cov(X_centered, rowvar=False) # rowvar=False: columns = features
print("\nSAMPLE COVARIANCE MATRIX Σ (Unbiased, N-1=6)")##
## SAMPLE COVARIANCE MATRIX Σ (Unbiased, N-1=6)
## [[0.63571429 0.58547619]
## [0.58547619 0.67142857]]
## Matches manual result: [[0.63571429, 0.58547619],[0.58547619, 0.67142857]]
# ------------------------------------------------------
# Step 3: Eigendecomposition of Covariance Matrix
# Solve Σu = λu; sort eigenvalues descending order
# ------------------------------------------------------
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
# Sort eigenpairs from largest λ to smallest λ
sort_idx = np.argsort(eigenvalues)[::-1]
sorted_eig_vals = eigenvalues[sort_idx]
sorted_eig_vecs = eigenvectors[:, sort_idx]
# Extract PC1 eigenvector (largest eigenvalue basis vector)
pc1_vector = sorted_eig_vecs[:, 0]
lambda1, lambda2 = sorted_eig_vals[0], sorted_eig_vals[1]
print("\nSORTED EIGENVALUES (Descending λ1, λ2)")##
## SORTED EIGENVALUES (Descending λ1, λ2)
## λ1 = 1.23931988+0.00000000j, λ2 = 0.06782298+0.00000000j
## Manual reference: λ1=1.23931988, λ2=0.06782298
##
## ORTHONORMAL EIGENVECTOR MATRIX U (Columns = PC1, PC2)
## [[-0.69624492+0.j -0.7178043 +0.j]
## [-0.7178043 +0.j 0.69624492+0.j]]
# ------------------------------------------------------
# Step 4: Calculate Explained Variance Ratios
# EV_k = λ_k / (λ1 + λ2), total sum λ1+λ2 = total dataset variance
# ------------------------------------------------------
total_variance = lambda1 + lambda2
ev_ratio1 = lambda1 / total_variance
ev_ratio2 = lambda2 / total_variance
print("\nEXPLAINED VARIANCE RATIOS")##
## EXPLAINED VARIANCE RATIOS
## Total Variance Sum λ1+λ2 = 1.30714286+0.00000000j
## EV1 (PC1) = 0.94811357+0.00000000j (~94.81%)
## EV2 (PC2) = 0.05188643+0.00000000j (~5.19%)
## Sum EV1+EV2 = 1.0+0.0j (must equal 1)
# ------------------------------------------------------
# Step 5: Project centered data onto PC1 basis to get 1D reduced data
# Projection formula: Z = X_centered @ pc1_vector
# Equivalent to Z = u1^T * X_centered.T (matches LaTeX math notation)
# ------------------------------------------------------
Z_1d = X_centered @ pc1_vector
print("\nFINAL 1D PCA PROJECTION VALUES (PC1 Compressed Dataset)")##
## FINAL 1D PCA PROJECTION VALUES (PC1 Compressed Dataset)
## [-0.431697+0.j 2.18106 +0.j -0.581726+0.j 0.129611+0.j -1.280127+0.j
## -0.507789+0.j 0.490669+0.j]
## Matches manual table of PC1 projected values in LaTeX
# --------------------------
# PLOT 2: Post-PCA Result Visualization
# Subplot 1: Centered data + PC1 / PC2 eigenvector axes
# Subplot 2: 1D projected PC1 values distribution
# --------------------------
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Subplot 1: Centered data with principal component axes
ax1.scatter(X_centered[:, 0], X_centered[:, 1], c="crimson", s=60, label="Mean-Centered Samples")
# Draw PC1 eigenvector line
scale = 1.2
ax1.plot([0, pc1_vector[0]*scale], [0, pc1_vector[1]*scale], c="black", lw=2.5, label="PC1 Axis (94.81% variance)")
# Draw PC2 eigenvector line
pc2_vector = sorted_eig_vecs[:, 1]
ax1.plot([0, pc2_vector[0]*scale], [0, pc2_vector[1]*scale], c="gray", lw=2, linestyle="--", label="PC2 Axis (5.19% variance)")
ax1.axhline(y=0, color="k", alpha=0.2)
ax1.axvline(x=0, color="k", alpha=0.2)
ax1.set_xlabel("Centered X")
ax1.set_ylabel("Centered Y")
ax1.set_title("Centered Data + PCA Principal Component Axes")
ax1.grid(True, alpha=0.3)
ax1.legend()
# Subplot 2: 1D projected PC1 values
# sample_indices = np.arange(1, N+1)
# ax2.scatter(Z_1d, sample_indices, c="darkgreen", s=60)
# ax2.set_xlabel("PC1 Projected 1D Value")
# ax2.set_ylabel("Sample Index (1 to 7)")
# ax2.set_title("Reduced 1D Dataset After PCA Projection")
# ax2.grid(True, alpha=0.3)
# plt.tight_layout()
# plt.show()
# Create index labels for original 7 samples (1,2,3,4,5,6,7)
sample_indices = np.arange(1, N+1)
# Scatter plot: horizontal axis = compressed 1D PCA values Z₁d; vertical axis = original sample ID
ax2.scatter(Z_1d, sample_indices, c="darkgreen", s=60)
# Axis labels for interpretability
ax2.set_xlabel("PC1 Projected 1D Value")
ax2.set_ylabel("Sample Index (1 to 7)")
# Plot title states core output: dimensionality reduction from 2D → 1D
ax2.set_title("Reduced 1D Dataset After PCA Projection")
ax2.grid(True, alpha=0.3)
# Auto-adjust subplot spacing
plt.tight_layout()
# Render combined figure (centered data axes + this 1D projection panel)
plt.show()# ==============================================================================
# Answer Verification Summary Block
# All numerical outputs printed above align exactly with LaTeX manual derivation
# Two visualization groups generated: raw input plot (pre-computation), PCA analysis plots (post-result)
# ==============================================================================
print("\n" + "="*60)##
## ============================================================
print("VERIFICATION COMPLETE: All intermediate values match manual PCA calculation in the LaTeX document")## VERIFICATION COMPLETE: All intermediate values match manual PCA calculation in the LaTeX document
## 1. Raw data plot shown before all PCA computation
## 2. Post-PCA figures generated with centered data axes & 1D projection distribution
## ============================================================
# ==============================================================================
# Full PCA Pipeline for 4-Variable Computer Preference Likert Survey
# N=16 survey respondents, p=4 features: Price, Software, Aesthetics, Brand
# Workflow matched 1:1 to reference SageMath code
# Visualization split:
# 1) Pre-computation pairwise raw data plot (displayed BEFORE PCA math)
# 2) Post-computation dual plot: Scree variance line + 2D PC1/PC2 scatter
# All intermediate matrices and metrics printed for manual cross-checking
# ==============================================================================
import numpy as np
import matplotlib.pyplot as plt
# ------------------------------------------------------------------------------
# Step 1: Define raw survey data matrix (exact copy from SageMath input)
# Rows = individual respondents, Columns = [Price, Software, Aesthetics, Brand]
# Likert scale: 1 = Strongly Disagree, 7 = Strongly Agree
# ------------------------------------------------------------------------------
raw_X = np.array([
[6, 5, 3, 4],
[7, 3, 2, 2],
[6, 4, 4, 5],
[5, 7, 1, 3],
[7, 7, 5, 5],
[6, 4, 2, 3],
[5, 7, 2, 1],
[6, 5, 4, 4],
[3, 5, 6, 7],
[1, 3, 7, 5],
[2, 6, 6, 7],
[5, 7, 7, 6],
[2, 4, 5, 6],
[3, 5, 6, 5],
[1, 6, 5, 5],
[2, 3, 7, 7]
])
n_samples, p_features = raw_X.shape
feature_names = ["Price", "Software", "Aesthetics", "Brand"]
# Print raw input matrix for manual verification
print("=" * 72)## ========================================================================
## RAW SURVEY DATA MATRIX X (16 samples × 4 features)
## Feature Column Order: ['Price', 'Software', 'Aesthetics', 'Brand']
## [[6 5 3 4]
## [7 3 2 2]
## [6 4 4 5]
## [5 7 1 3]
## [7 7 5 5]
## [6 4 2 3]
## [5 7 2 1]
## [6 5 4 4]
## [3 5 6 7]
## [1 3 7 5]
## [2 6 6 7]
## [5 7 7 6]
## [2 4 5 6]
## [3 5 6 5]
## [1 6 5 5]
## [2 3 7 7]]
##
## Sample count n = 16, Feature dimension p = 4
## ========================================================================
# --------------------------
# PLOT 1: Pre-PCA Raw Pairwise Visualization (Rendered BEFORE all PCA calculations)
# 4x4 grid: Diagonal = single-feature histograms, Off-diagonal = feature scatter plots
# --------------------------
fig_raw, axes_raw = plt.subplots(p_features, p_features, figsize=(10, 10))
fig_raw.suptitle("Raw 4D Survey Data Pairwise Preview (Pre-PCA Visualization)", fontsize=14)
for i in range(p_features):
for j in range(p_features):
ax = axes_raw[i, j]
if i != j:
ax.scatter(raw_X[:, j], raw_X[:, i], c="navy", s=30, alpha=0.7)
else:
ax.hist(raw_X[:, i], bins=range(0, 9), color="steelblue", alpha=0.6)
ax.set_xlabel(feature_names[j], fontsize=8)
ax.set_ylabel(feature_names[i], fontsize=8)
ax.tick_params(axis="both", labelsize=6)
ax.grid(True, alpha=0.2)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()# ------------------------------------------------------------------------------
# Step 2: Standardization (Z-score centering + scaling, replicate Sage nested for-loops)
# Formula for standardized entry: X_ctr[row,col] = (X[row,col] - column_mean) / column_std
# Population standard deviation (ddof=0) to match SageMath native behavior
# ------------------------------------------------------------------------------
X_standardized = np.zeros_like(raw_X, dtype=np.float64)
# Iterate over every feature column
for col_idx in range(p_features):
single_col = raw_X[:, col_idx]
col_mean = np.mean(single_col)
col_std = np.std(single_col, ddof=0)
# Iterate over every sample row to fill standardized matrix
for row_idx in range(n_samples):
X_standardized[row_idx, col_idx] = (raw_X[row_idx, col_idx] - col_mean) / col_std
# Print standardized matrix rounded to 4 decimal places (match Sage .n(digits=4))
print("\nSTANDARDIZED (CENTERED + SCALED) MATRIX X_ctr (4 decimal precision)")##
## STANDARDIZED (CENTERED + SCALED) MATRIX X_ctr (4 decimal precision)
## [[ 0.8764 -0.0436 -0.7746 -0.3993]
## [ 1.3599 -1.4375 -1.291 -1.5608]
## [ 0.8764 -0.7405 -0.2582 0.1815]
## [ 0.3929 1.3504 -1.8074 -0.98 ]
## [ 1.3599 1.3504 0.2582 0.1815]
## [ 0.8764 -0.7405 -1.291 -0.98 ]
## [ 0.3929 1.3504 -1.291 -2.1416]
## [ 0.8764 -0.0436 -0.2582 -0.3993]
## [-0.5742 -0.0436 0.7746 1.343 ]
## [-1.5412 -1.4375 1.291 0.1815]
## [-1.0577 0.6534 0.7746 1.343 ]
## [ 0.3929 1.3504 1.291 0.7623]
## [-1.0577 -0.7405 0.2582 0.7623]
## [-0.5742 -0.0436 0.7746 0.1815]
## [-1.5412 0.6534 0.2582 0.1815]
## [-1.0577 -1.4375 1.291 1.343 ]]
## ------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Step 3: Singular Value Decomposition (SVD): X_ctr = U @ S @ V.T
# np.linalg.svd returns U, singular value vector, V transpose (Vh)
# ------------------------------------------------------------------------------
U, singular_vals, V_transpose = np.linalg.svd(X_standardized, full_matrices=False)
S_diag = np.diag(singular_vals)
# ------------------------------------------------------------------------------
# Step 4: Calculate Principal Component Variance & Explained Variance Percentage
# Sage formula: Var_PC[k] = singular_vals[k]^2 / n_samples
# Sage formula: PropVar[k] = 100 * Var_PC[k] / total_variance
# ------------------------------------------------------------------------------
pc_variances = [(singular_vals[k] ** 2) / n_samples for k in range(p_features)]
total_var = sum(pc_variances)
explained_var_pct = [100 * var / total_var for var in pc_variances]
# Print PCA variance metrics for manual answer cross-verification
print("Principal Component Variances (Var_PC, 4 decimal precision):")## Principal Component Variances (Var_PC, 4 decimal precision):
## [2.4303 0.9612 0.4647 0.1438]
##
## Percentage of Total Variance Explained by Each PC (Prop_Var %):
## [60.76 24.03 11.62 3.6 ]
cumulative_pc1pc2 = sum(explained_var_pct[:2])
print(f"\nCumulative explained variance PC1 + PC2 = {np.round(cumulative_pc1pc2, 2)}%")##
## Cumulative explained variance PC1 + PC2 = 84.79%
## (Confirms document claim: ~85% of total dataset information retained with 2 dimensions)
## ------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Step 5: Compute PCA Score Matrix Z = U @ S (match Sage PC_score = U*S)
# Extract only PC1 and PC2 scores for 2D visualization
# ------------------------------------------------------------------------------
pc_score_matrix = U @ S_diag
pc1_pc2_scores = pc_score_matrix[:, 0:2]
# --------------------------
# PLOT 2: Post-PCA Dual Subplot Figure (Matches reference dual-chart image)
# Left: Scree variance line plot (red line labeled "Variances")
# Right: 2D scatter of PC1 vs PC2 sample projections (red data points)
# --------------------------
fig_post, (ax_scree, ax_pc2d) = plt.subplots(1, 2, figsize=(14, 6))
# Left Subplot: Scree Plot
pc_index_axis = np.arange(1, p_features + 1)
ax_scree.plot(pc_index_axis, pc_variances, color="red", linewidth=2, marker="o")
ax_scree.set_title("Scree Plot of Principal Component Variances", fontsize=12)
ax_scree.set_xlabel("Principal Component Index (PC1, PC2, PC3, PC4)")
ax_scree.set_ylabel("Variances")
ax_scree.grid(True, alpha=0.3)
ax_scree.set_xticks(pc_index_axis)
# Vertical dashed elbow cutoff line at PC2
ax_scree.axvline(x=2, color="black", linestyle="--", alpha=0.6, label="Elbow Cutoff: Retain PC1 + PC2")
ax_scree.legend()
# Right Subplot: 2D PC Score Scatter Plot
ax_pc2d.scatter(pc1_pc2_scores[:, 0], pc1_pc2_scores[:, 1], color="red", s=40)
ax_pc2d.set_title("2D Projection of Survey Data (PC1 vs PC2)", fontsize=12)
ax_pc2d.set_xlabel(f"PC1 ({np.round(explained_var_pct[0],1)}% Variance)")
ax_pc2d.set_ylabel(f"PC2 ({np.round(explained_var_pct[1],1)}% Variance)")
ax_pc2d.axhline(y=0, color="gray", alpha=0.4)
ax_pc2d.axvline(x=0, color="gray", alpha=0.4)
ax_pc2d.grid(True, alpha=0.3)
fig_post.suptitle("Post-PCA Analysis Output Visualizations", fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.94])
# --------------------------
# NEW CODE BLOCK: Save figure to current working directory
# Output filename: pca_dual_output.png (matches attached reference figure)
# --------------------------
plt.savefig("pca_dual_output.png", dpi=300, bbox_inches="tight")
print("\nFigure saved successfully as 'pca_dual_output.png' in the current folder!")##
## Figure saved successfully as 'pca_dual_output.png' in the current folder!
# ==============================================================================
# Final Numerical Verification Summary Output
# ==============================================================================
print("\n" + "="*72)##
## ========================================================================
## VERIFICATION COMPLETE: Full PCA pipeline exactly replicates original SageMath workflow
## 1. Raw pairwise feature plot generated BEFORE all PCA mathematical calculations
## 2. Standardization uses identical nested row/column loops from reference Sage code
## 3. SVD decomposition, PC variance, and explained variance ratios fully cross-validated
## 4. Post-PCA dual figures rendered: Red scree variance line + red 2D PC scatter points
print("5. PC1 + PC2 capture ~85% of total dataset variance for effective 4D → 2D dimension reduction")## 5. PC1 + PC2 capture ~85% of total dataset variance for effective 4D → 2D dimension reduction
## ========================================================================