香港中文大學數學系
2026年6月27日
聚類分析(Cluster analysis)是一種數據分析技術,旨在識別和聚合具同質性的觀測值或數據點——即表現出可比較特徵和內在屬性的實體。作為一種數據探勘(data mining)方法,它構成了一種無監督學習(unsupervised learning)演算法,能揭示隱藏在數據集中的潛在模式和內在結構關係。
基於質心的聚類(Centroid-based clustering,例如 K-means)和階層分群(hierarchical clustering)是聚類分析中廣泛採用的兩種不同方法。
隨機抽取 20 個觀測樣本,並設定隨機種子以確保結果可重現
描述性統計
## === Descriptive Statistics ===
## Aroma Flavor ... Quakers Category.Two.Defects
## count 20.000000 20.000000 ... 20.0 20.000000
## mean 7.576000 7.605000 ... 0.0 4.050000
## std 0.266959 0.367058 ... 0.0 6.021671
## min 7.170000 6.920000 ... 0.0 0.000000
## 25% 7.420000 7.420000 ... 0.0 0.000000
## 50% 7.540000 7.625000 ... 0.0 2.500000
## 75% 7.750000 7.750000 ... 0.0 4.000000
## max 8.080000 8.670000 ... 0.0 20.000000
##
## [8 rows x 15 columns]
移除標準差為零的欄位(Uniformity, Clean.Cup, Sweetness, Category.Two.Defects)
drop_cols = ["Uniformity", "Clean.Cup", "Sweetness", "Category.Two.Defects"]
df = df.drop(columns=drop_cols)移除最後一個欄位(字元型類別變數)
對數據集進行縮放(Scaling)至關重要,因為各個特徵通常具有差異極大的測量範圍。如果不進行縮放,數值較大的變數將不成比例地扭曲聚類結果。
聚類分析中兩種廣泛使用的縮放方法是:標準化(standardization),它透過減去每個特徵的平均值並除以其標準差來轉換數據;以及正規化(normalization),它將所有數值重新縮放至一個固定的有界範圍內。
\[ X_{stand} = \frac{X - \mu_x}{\sigma_x} \\ X_{norm} = \frac{X - X_{min}}{X_{max}-X_{min}} \]
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# ----------------------
# 標準化 (Z-score)
# ----------------------
scaler_standard = StandardScaler()
df_scaled_standard = pd.DataFrame(
scaler_standard.fit_transform(df_com),
columns=df_com.columns
)
# ----------------------
# 最大最小正規化 (0~1 範圍)
# ----------------------
scaler_minmax = MinMaxScaler()
df_scaled_norm = pd.DataFrame(
scaler_minmax.fit_transform(df_com),
columns=df_com.columns
)
# 如果您只需要標準化後的數據:
df_scaled = df_scaled_standard 最佳的聚類數量取決於數據集及其預期應用。儘管如此,目前有幾種方法可以幫助找出基於質心的聚類的最佳分群數。
這種方法會繪製群內誤差平方和(Within-cluster sum of squares, WSS)與聚類數量的關係圖。WSS 量化了數據點在其各自的聚類中聚集的緊密程度。肘部法則透過找出 WSS 曲線上遞減速率開始趨於平緩的點(即拐點)來確定最佳聚類數量;這個拐點通常被選為理想的聚類數。
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
# 計算群內誤差平方和 (WSS) 以用於肘部法則
wss = []
max_k = 10 # 您可以調整要測試的最大聚類數量
for k in range(1, max_k + 1):
kmeans = KMeans(n_clusters=k, random_state=100)
kmeans.fit(df_scaled)
wss.append(kmeans.inertia_) # inertia = WSSKMeans(n_clusters=10, random_state=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
| kmeans0 |
| kmeans1 |
| kmeans2 |
| kmeans3 |
| kmeans4 |
| kmeans5 |
| kmeans6 |
| kmeans7 |
| kmeans8 |
| kmeans9 |
# 繪製肘部曲線
plt.figure(figsize=(8, 5))
plt.plot(range(1, max_k + 1), wss, marker='o')
plt.xlabel("Number of clusters k")
plt.ylabel("Within-cluster sum of squares (WSS)")
plt.title("Elbow method")
plt.axvline(x=2, linestyle='--', color='red') # 在 k=2 處畫紅色垂直虛線
plt.show()輪廓係數(Silhouette score)量化了每個數據點與其分配到的聚類(相對於相鄰聚類)的匹配程度。其數值範圍從 -1 到 1:得分為 1 表示該點完美契合其聚類,得分為 0 表示該點恰好位於兩個聚類的邊界上,而得分為 -1 則表示該點更適合分配到其他聚類。最佳的聚類數量通常是能產生最高平均輪廓係數的那個值。
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
import numpy as np
# ----------------------
# 相當於 fviz_nbclust(..., method = "silhouette")
# 繪製不同 k 值的平均輪廓係數
# ----------------------
max_k = 10
sil_avg_scores = []
k_range = range(2, max_k + 1) # 當 k=1 時輪廓係數未定義
for k in k_range:
km = KMeans(n_clusters=k, random_state=100)
cluster_labels = km.fit_predict(df_scaled)
avg_sil = silhouette_score(df_scaled, cluster_labels)
sil_avg_scores.append(avg_sil)
plt.figure(figsize=(8, 5))
plt.plot(k_range, sil_avg_scores, marker="o")
plt.xlabel("Number of clusters k")
plt.ylabel("Average Silhouette Score")
plt.title("Silhouette method")
plt.show()# ----------------------
# 相當於 silhouette() + plot + fviz_silhouette,針對 k=2
# ----------------------
# 擬合 k=2 的 KMeans
km_res = KMeans(n_clusters=2, random_state=100)
labels = km_res.fit_predict(df_scaled)
# 計算每一個樣本的輪廓值
sil_vals = silhouette_samples(df_scaled, labels)
# 簡單的輪廓圖 (對應基礎 R 語言的 plot(sil))
fig, ax = plt.subplots(figsize=(8, 5))
y_lower = 10
for cluster_id in np.unique(labels):
# 提取目前聚類的輪廓係數並進行排序
cluster_sil = sil_vals[labels == cluster_id]
cluster_sil.sort()
y_upper = y_lower + len(cluster_sil)
ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_sil, alpha=0.7)
y_lower = y_upper + 10
ax.axvline(x=np.mean(sil_vals), color="red", linestyle="--")
ax.set_xlabel("Silhouette Coefficient")
ax.set_ylabel("Sample Index")
ax.set_title("Silhouette Plot (k=2)")
plt.show()輪廓係數為正數表示觀測值很適合其分配的聚類。
間隙統計量(Gap statistic)透過比較觀測到的群內離散度與參考分佈來協助確定最佳聚類數。其核心概念是選擇能使間隙統計量最大化的聚類數量。較大的間隙表示偏離隨機性的程度越顯著,意味著聚類結構定義得越好。透過比較不同聚類數量的間隙,我們可以找出能提供最鮮明且具意義之聚類模式的分群數。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
def calculate_gap_statistic(data, max_k, n_references=10):
"""
重製 factoextra::fviz_nbclust(..., method = "gap_stat")
計算每個 k 的間隙統計量與標準誤
"""
# 將 dataframe 轉換為 numpy 陣列
X = data.values if hasattr(data, "values") else np.array(data)
n_samples, n_features = X.shape
k_range = np.arange(1, max_k + 1)
gap_scores = []
gap_standard_errors = []
for k in k_range:
# 1. 計算實際數據的群內離散度 Wk
kmeans = KMeans(n_clusters=k, random_state=100)
labels = kmeans.fit_predict(X)
dists, _ = pairwise_distances_argmin_min(X, kmeans.cluster_centers_)
wk = np.sum(dists ** 2)
# 2. 產生隨機參考數據集並計算參考的 Wkb
ref_wk_log = []
for _ in range(n_references):
ref_data = np.random.uniform(
low=X.min(axis=0),
high=X.max(axis=0),
size=X.shape
)
ref_kmeans = KMeans(n_clusters=k, random_state=100)
ref_labels = ref_kmeans.fit_predict(ref_data)
ref_dists, _ = pairwise_distances_argmin_min(ref_data, ref_kmeans.cluster_centers_)
ref_wk = np.sum(ref_dists ** 2)
ref_wk_log.append(np.log(ref_wk))
# 間隙統計量公式
log_wk = np.log(wk)
mean_log_ref = np.mean(ref_wk_log)
gap = mean_log_ref - log_wk
gap_scores.append(gap)
# 誤差線的標準誤
sd_ref = np.std(ref_wk_log)
se = sd_ref * np.sqrt(1 + 1 / n_references)
gap_standard_errors.append(se)
return k_range, np.array(gap_scores), np.array(gap_standard_errors)
# ---------------------- 執行間隙統計量圖 ----------------------
# 請將 df_scaled 替換為您實際的標準化 dataframe 變數
max_k_test = 10
k_values, gap_values, gap_error = calculate_gap_statistic(df_scaled, max_k=max_k_test)
# 繪製相當於 R fviz_nbclust gap_stat 的圖表
plt.figure(figsize=(8, 5))
plt.errorbar(k_values, gap_values, yerr=gap_error, marker="o", capsize=4)
plt.xlabel("Number of clusters k")
plt.ylabel("Gap statistic")
plt.title("Gap statistic method")
plt.xticks(k_values)## ([<matplotlib.axis.XTick object at 0x000001FA46F48920>, <matplotlib.axis.XTick object at 0x000001FA4A371B20>, <matplotlib.axis.XTick object at 0x000001FA4CA879B0>, <matplotlib.axis.XTick object at 0x000001FA4A3FB680>, <matplotlib.axis.XTick object at 0x000001FA4A3FA9C0>, <matplotlib.axis.XTick object at 0x000001FA4A3F92E0>, <matplotlib.axis.XTick object at 0x000001FA4A440DD0>, <matplotlib.axis.XTick object at 0x000001FA4A4412B0>, <matplotlib.axis.XTick object at 0x000001FA4A3FA4B0>, <matplotlib.axis.XTick object at 0x000001FA4A440DA0>], [Text(1, 0, '1'), Text(2, 0, '2'), Text(3, 0, '3'), Text(4, 0, '4'), Text(5, 0, '5'), Text(6, 0, '6'), Text(7, 0, '7'), Text(8, 0, '8'), Text(9, 0, '9'), Text(10, 0, '10')])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, pairwise_distances_argmin_min
# --------------------------
# 1. 輔助函數: 肘部法則 (WSS)
# --------------------------
def elbow_curve(data, max_k=10, seed=100):
X = data.values
wss = []
for k in range(1, max_k+1):
km = KMeans(n_clusters=k, random_state=seed)
km.fit(X)
wss.append(km.inertia_)
return np.arange(1, max_k+1), np.array(wss)
# --------------------------
# 2. 輔助函數: 輪廓係數
# --------------------------
def silhouette_curve(data, max_k=10, seed=100):
X = data.values
k_range = np.arange(2, max_k+1)
sil_scores = []
for k in k_range:
km = KMeans(n_clusters=k, random_state=seed)
labels = km.fit_predict(X)
sil_scores.append(silhouette_score(X, labels))
return k_range, np.array(sil_scores)
# --------------------------
# 3. 輔助函數: 間隙統計量
# --------------------------
def gap_stat_curve(data, max_k=10, n_ref=10, seed=100):
X = data.values
n, p = X.shape
k_list = np.arange(1, max_k+1)
gaps = []
np.random.seed(seed)
for k in k_list:
# 實際數據 Wk
km = KMeans(n_clusters=k, random_state=seed)
labels = km.fit_predict(X)
dists, _ = pairwise_distances_argmin_min(X, km.cluster_centers_)
wk = np.sum(dists ** 2)
log_wk = np.log(wk)
# 參考均勻數據集
ref_log = []
for _ in range(n_ref):
ref = np.random.uniform(X.min(axis=0), X.max(axis=0), X.shape)
km_r = KMeans(n_clusters=k, random_state=seed)
r_labels = km_r.fit_predict(ref)
r_dists, _ = pairwise_distances_argmin_min(ref, km_r.cluster_centers_)
ref_log.append(np.log(np.sum(r_dists ** 2)))
gap = np.mean(ref_log) - log_wk
gaps.append(gap)
return k_list, np.array(gaps)
# --------------------------
# 4. 對縮放後的數據執行這三種方法
# --------------------------
max_cluster = 10
k_elbow, wss_vals = elbow_curve(df_scaled, max_k=max_cluster)
k_sil, sil_vals = silhouette_curve(df_scaled, max_k=max_cluster)
k_gap, gap_vals = gap_stat_curve(df_scaled, max_k=max_cluster)
# 為每種方法找出最佳的 k
# 肘部法則: 挑選肘部拐點(簡單的啟發式方法:最大下降率)
wss_drop = np.diff(wss_vals)
opt_elbow = np.argmin(wss_drop) + 2
# 輪廓係數: 具有最高輪廓係數的 k
opt_sil = k_sil[np.argmax(sil_vals)]
# 間隙統計量: 具有最大間隙值的 k
opt_gap = k_gap[np.argmax(gap_vals)]
# 匯總投票結果 (與 R n_clust 的多數決邏輯相符)
votes = [opt_elbow, opt_sil, opt_gap]
from collections import Counter
vote_count = Counter(votes)
majority_k = vote_count.most_common(1)[0][0]
# 印出摘要 (相當於 R 中的 print(n_clust))
print("=== Optimal Cluster Count Votes from Multiple Methods ===")## === Optimal Cluster Count Votes from Multiple Methods ===
## Elbow Method optimal k: 2
## Silhouette Method optimal k: 2
## Gap Statistic optimal k: 1
##
## Vote tally: {np.int64(2): 2, np.int64(1): 1}
## Majority-supported optimal number of clusters: 2
# --------------------------
# 繪製組合結果 (相當於 R 中的 plot(n_clust))
# --------------------------
fig, axes = plt.subplots(3, 1, figsize=(8, 10))
# 子圖 1: 肘部法則
axes[0].plot(k_elbow, wss_vals, marker="o")
axes[0].axvline(opt_elbow, color="red", ls="--", label=f"Opt k={opt_elbow}")
axes[0].set_title("Elbow Method (WSS)")
axes[0].set_xlabel("Number of clusters k")
axes[0].set_ylabel("Within-cluster Sum of Squares")
axes[0].legend()
axes[0].grid(alpha=0.3)
# 子圖 2: 輪廓係數
axes[1].plot(k_sil, sil_vals, marker="o", color="green")
axes[1].axvline(opt_sil, color="red", ls="--", label=f"Opt k={opt_sil}")
axes[1].set_title("Silhouette Score Method")
axes[1].set_xlabel("Number of clusters k")
axes[1].set_ylabel("Average Silhouette Score")
axes[1].legend()
axes[1].grid(alpha=0.3)
# 子圖 3: 間隙統計量
axes[2].plot(k_gap, gap_vals, marker="o", color="orange")
axes[2].axvline(opt_gap, color="red", ls="--", label=f"Opt k={opt_gap}")
axes[2].set_title("Gap Statistic Method")
axes[2].set_xlabel("Number of clusters k")
axes[2].set_ylabel("Gap Statistic Value")
axes[2].legend()
axes[2].grid(alpha=0.3)
plt.tight_layout()
plt.show()使用函數 dist():
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist
# 將縮放後的 dataframe 轉換為 numpy 陣列
X = df_scaled.values
# ----------------------
# 數值數據距離矩陣 (相當於 R dist())
# pdist 返回與 R dist 物件相同的壓縮距離矩陣
# ----------------------
# 歐氏距離 (R method = 'euclidian' / euclidean)
df_dist_euc = pdist(X, metric="euclidean")
# 最大值 / 柴比雪夫距離
# df_dist_max = pdist(X, metric="chebyshev")
# 曼哈頓 / L1 距離
# df_dist_man = pdist(X, metric="cityblock")
# 坎培拉距離
# df_dist_can = pdist(X, metric="canberra")
# 閔可夫斯基距離 (預設 p=2 即為歐氏距離)
# df_dist_mink = pdist(X, metric="minkowski", p=3)
# ----------------------
# 類別二元距離 (R method = "binary")
# 用於虛擬化後的二元類別數據矩陣 dummified_data
# ----------------------
# dummified_data_np = dummified_data.values
# dist_binary = pdist(dummified_data_np, metric="jaccard")連結(Linkage)定義了在階層分群過程中用於合併聚類的規則。目前存在多種連結準則,而所選的準則可能會大幅改變聚類的結果:
以下是使用歐氏距離(Euclidean distance)以及上述各項連結方法,從數值數據生成階層分群的程式碼。
from scipy.cluster.hierarchy import linkage
# df_dist_euc 是 pdist() 產生的壓縮歐氏距離矩陣
# linkage() 將壓縮距離矩陣作為輸入,相當於 R hclust()
hclust_single = linkage(df_dist_euc, method="single")
hclust_complete = linkage(df_dist_euc, method="complete")
hclust_average = linkage(df_dist_euc, method="average")
hclust_centroid = linkage(df_dist_euc, method="centroid")
hclust_ward = linkage(df_dist_euc, method="ward")獲取聚類資訊:
from scipy.cluster.hierarchy import fcluster
# 將樹切成恰好 2 個聚類 (相當於 cutree(..., k = 2))
clusters_k2 = fcluster(hclust_ward, t=2, criterion="maxclust")
# 在高度 h = 0.7 處切割樹 (相當於 cutree(..., h = 0.7))
clusters_h07 = fcluster(hclust_ward, t=0.7, criterion="distance")import pandas as pd
from scipy.cluster.hierarchy import fcluster
# 透過將樹狀圖切成 2 個聚類來分配聚類標籤
cluster_labels = fcluster(hclust_ward, t=2, criterion="maxclust")
# 加上 cluster 欄位並計算每個聚類的平均摘要
df_num["cluster"] = cluster_labels
cluster_summary = df_num.groupby("cluster").mean().add_suffix("_Average")
print(cluster_summary)## Aroma_Average ... Category.One.Defects_Average
## cluster ...
## 1 7.425455 ... 0.636364
## 2 7.760000 ... 0.222222
##
## [2 rows x 10 columns]
樹狀圖(Dendrogram)有助於視覺化,並能幫助決定要使用的聚類數量,或是應該在哪個高度進行切割。
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, fcluster
# 總樣本數
n_samples = hclust_ward.shape[0] + 1
# ====================== 1. 基礎樹狀圖 + 安全的藍色切割線 ======================
plt.figure(figsize=(10, 6))
dendrogram(hclust_ward)## {'icoord': [[35.0, 35.0, 45.0, 45.0], [25.0, 25.0, 40.0, 40.0], [65.0, 65.0, 75.0, 75.0], [95.0, 95.0, 105.0, 105.0], [85.0, 85.0, 100.0, 100.0], [70.0, 70.0, 92.5, 92.5], [55.0, 55.0, 81.25, 81.25], [32.5, 32.5, 68.125, 68.125], [15.0, 15.0, 50.3125, 50.3125], [5.0, 5.0, 32.65625, 32.65625], [115.0, 115.0, 125.0, 125.0], [135.0, 135.0, 145.0, 145.0], [185.0, 185.0, 195.0, 195.0], [175.0, 175.0, 190.0, 190.0], [165.0, 165.0, 182.5, 182.5], [155.0, 155.0, 173.75, 173.75], [140.0, 140.0, 164.375, 164.375], [120.0, 120.0, 152.1875, 152.1875], [18.828125, 18.828125, 136.09375, 136.09375]], 'dcoord': [[0.0, np.float64(2.8211994490582724), np.float64(2.8211994490582724), 0.0], [0.0, np.float64(3.3416529959187184), np.float64(3.3416529959187184), np.float64(2.8211994490582724)], [0.0, np.float64(1.3541797122858186), np.float64(1.3541797122858186), 0.0], [0.0, np.float64(1.2640743886952885), np.float64(1.2640743886952885), 0.0], [0.0, np.float64(2.428994380285572), np.float64(2.428994380285572), np.float64(1.2640743886952885)], [np.float64(1.3541797122858186), np.float64(2.7691404691045887), np.float64(2.7691404691045887), np.float64(2.428994380285572)], [0.0, np.float64(3.559154299785624), np.float64(3.559154299785624), np.float64(2.7691404691045887)], [np.float64(3.3416529959187184), np.float64(4.403923017090526), np.float64(4.403923017090526), np.float64(3.559154299785624)], [0.0, np.float64(6.027631053839452), np.float64(6.027631053839452), np.float64(4.403923017090526)], [0.0, np.float64(6.590497475713074), np.float64(6.590497475713074), np.float64(6.027631053839452)], [0.0, np.float64(2.2713867810103223), np.float64(2.2713867810103223), 0.0], [0.0, np.float64(1.41825729301792), np.float64(1.41825729301792), 0.0], [0.0, np.float64(1.3122498495396393), np.float64(1.3122498495396393), 0.0], [0.0, np.float64(1.6038764997622879), np.float64(1.6038764997622879), np.float64(1.3122498495396393)], [0.0, np.float64(2.2724840629843372), np.float64(2.2724840629843372), np.float64(1.6038764997622879)], [0.0, np.float64(3.137028030598917), np.float64(3.137028030598917), np.float64(2.2724840629843372)], [np.float64(1.41825729301792), np.float64(4.843050954713168), np.float64(4.843050954713168), np.float64(3.137028030598917)], [np.float64(2.2713867810103223), np.float64(7.30855846442715), np.float64(7.30855846442715), np.float64(4.843050954713168)], [np.float64(6.590497475713074), np.float64(12.194460625748167), np.float64(12.194460625748167), np.float64(7.30855846442715)]], 'ivl': ['8', '16', '19', '2', '3', '6', '5', '15', '1', '11', '17', '4', '13', '0', '9', '14', '12', '7', '10', '18'], 'leaves': [8, 16, 19, 2, 3, 6, 5, 15, 1, 11, 17, 4, 13, 0, 9, 14, 12, 7, 10, 18], 'color_list': ['C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C0'], 'leaves_color_list': ['C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2']}
# 安全計算 k=2 的切割高度
clusters_k2 = fcluster(hclust_ward, t=2, criterion="maxclust")
dist_vals = np.unique(hclust_ward[:, 2])
dist_vals_sorted = np.sort(dist_vals)
if len(dist_vals_sorted) >= 2:
cut_height = dist_vals_sorted[-2]
else:
cut_height = dist_vals_sorted[-1]
plt.axhline(y=cut_height, c="blue", lw=2, linestyle="--")
plt.title("Ward Dendrogram with Cut Line k=2")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.xticks(rotation=45, ha="right")## (array([ 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105, 115, 125,
## 135, 145, 155, 165, 175, 185, 195]), [Text(5, 0, '8'), Text(15, 0, '16'), Text(25, 0, '19'), Text(35, 0, '2'), Text(45, 0, '3'), Text(55, 0, '6'), Text(65, 0, '5'), Text(75, 0, '15'), Text(85, 0, '1'), Text(95, 0, '11'), Text(105, 0, '17'), Text(115, 0, '4'), Text(125, 0, '13'), Text(135, 0, '0'), Text(145, 0, '9'), Text(155, 0, '14'), Text(165, 0, '12'), Text(175, 0, '7'), Text(185, 0, '10'), Text(195, 0, '18')])
# ====================== 2. 簡單樹狀圖 (移除損壞的分支顏色函數) ======================
plt.figure(figsize=(10, 6))
dendrogram(hclust_ward)## {'icoord': [[35.0, 35.0, 45.0, 45.0], [25.0, 25.0, 40.0, 40.0], [65.0, 65.0, 75.0, 75.0], [95.0, 95.0, 105.0, 105.0], [85.0, 85.0, 100.0, 100.0], [70.0, 70.0, 92.5, 92.5], [55.0, 55.0, 81.25, 81.25], [32.5, 32.5, 68.125, 68.125], [15.0, 15.0, 50.3125, 50.3125], [5.0, 5.0, 32.65625, 32.65625], [115.0, 115.0, 125.0, 125.0], [135.0, 135.0, 145.0, 145.0], [185.0, 185.0, 195.0, 195.0], [175.0, 175.0, 190.0, 190.0], [165.0, 165.0, 182.5, 182.5], [155.0, 155.0, 173.75, 173.75], [140.0, 140.0, 164.375, 164.375], [120.0, 120.0, 152.1875, 152.1875], [18.828125, 18.828125, 136.09375, 136.09375]], 'dcoord': [[0.0, np.float64(2.8211994490582724), np.float64(2.8211994490582724), 0.0], [0.0, np.float64(3.3416529959187184), np.float64(3.3416529959187184), np.float64(2.8211994490582724)], [0.0, np.float64(1.3541797122858186), np.float64(1.3541797122858186), 0.0], [0.0, np.float64(1.2640743886952885), np.float64(1.2640743886952885), 0.0], [0.0, np.float64(2.428994380285572), np.float64(2.428994380285572), np.float64(1.2640743886952885)], [np.float64(1.3541797122858186), np.float64(2.7691404691045887), np.float64(2.7691404691045887), np.float64(2.428994380285572)], [0.0, np.float64(3.559154299785624), np.float64(3.559154299785624), np.float64(2.7691404691045887)], [np.float64(3.3416529959187184), np.float64(4.403923017090526), np.float64(4.403923017090526), np.float64(3.559154299785624)], [0.0, np.float64(6.027631053839452), np.float64(6.027631053839452), np.float64(4.403923017090526)], [0.0, np.float64(6.590497475713074), np.float64(6.590497475713074), np.float64(6.027631053839452)], [0.0, np.float64(2.2713867810103223), np.float64(2.2713867810103223), 0.0], [0.0, np.float64(1.41825729301792), np.float64(1.41825729301792), 0.0], [0.0, np.float64(1.3122498495396393), np.float64(1.3122498495396393), 0.0], [0.0, np.float64(1.6038764997622879), np.float64(1.6038764997622879), np.float64(1.3122498495396393)], [0.0, np.float64(2.2724840629843372), np.float64(2.2724840629843372), np.float64(1.6038764997622879)], [0.0, np.float64(3.137028030598917), np.float64(3.137028030598917), np.float64(2.2724840629843372)], [np.float64(1.41825729301792), np.float64(4.843050954713168), np.float64(4.843050954713168), np.float64(3.137028030598917)], [np.float64(2.2713867810103223), np.float64(7.30855846442715), np.float64(7.30855846442715), np.float64(4.843050954713168)], [np.float64(6.590497475713074), np.float64(12.194460625748167), np.float64(12.194460625748167), np.float64(7.30855846442715)]], 'ivl': ['8', '16', '19', '2', '3', '6', '5', '15', '1', '11', '17', '4', '13', '0', '9', '14', '12', '7', '10', '18'], 'leaves': [8, 16, 19, 2, 3, 6, 5, 15, 1, 11, 17, 4, 13, 0, 9, 14, 12, 7, 10, 18], 'color_list': ['C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C0'], 'leaves_color_list': ['C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C1', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2', 'C2']}
plt.title("Ward Dendrogram")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.xticks(rotation=45, ha="right")## (array([ 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105, 115, 125,
## 135, 145, 155, 165, 175, 185, 195]), [Text(5, 0, '8'), Text(15, 0, '16'), Text(25, 0, '19'), Text(35, 0, '2'), Text(45, 0, '3'), Text(55, 0, '6'), Text(65, 0, '5'), Text(75, 0, '15'), Text(85, 0, '1'), Text(95, 0, '11'), Text(105, 0, '17'), Text(115, 0, '4'), Text(125, 0, '13'), Text(135, 0, '0'), Text(145, 0, '9'), Text(155, 0, '14'), Text(165, 0, '12'), Text(175, 0, '7'), Text(185, 0, '10'), Text(195, 0, '18')])
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# 1. 執行 K-Means 聚類 (相當於 kmeans(df_scaled, centers = 2))
km_model = KMeans(n_clusters=2, random_state=100)
cluster_assign = km_model.fit_predict(df_scaled)
# 印出模型資訊,同等於在 R 中印出 km 物件
print("K-Means Model Output")## K-Means Model Output
## Cluster centers:
## [[ 0.55342061 0.65939883 0.68248618 0.61146386 0.62287806 0.42464149
## 0.64082913 0.67537713 -0.15838297 -0.23102817]
## [-0.67640297 -0.80593191 -0.83414977 -0.74734472 -0.76129541 -0.51900626
## -0.7832356 -0.82546094 0.19357919 0.28236777]]
## Inertia (total within-cluster sum of squares): 124.3613
## Cluster labels for each sample:
## [0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 1 1 0 1]
# 2. 相當於 fviz_cluster: PCA 2D 投影用於聚類視覺化
# fviz_cluster 會自動使用 PCA 來降維至 2 維
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_scaled)
pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2"])
pca_df["cluster"] = cluster_assign
plt.figure(figsize=(8, 5))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="cluster", palette="Set1", s=60)
# 將聚類中心投影至 PCA 空間並繪製
centers_pca = pca.transform(km_model.cluster_centers_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], marker="X", c="black", s=200, label="Centers")
plt.legend(title="Clusters")
plt.title("K-Means Clusters (PCA Projection)")
sns.set_style("white") # match theme_minimal()
plt.tight_layout()
plt.show()# 3. 原始變數 Aroma 與 Aftertaste 的散佈圖,並依聚類著色
df_km = df_num.copy()
df_km["cluster"] = cluster_assign
plt.figure(figsize=(8, 5))
sns.scatterplot(data=df_km, x="Aroma", y="Aftertaste", hue="cluster", palette="Set1", s=60)
plt.legend(title="Clusters")
plt.title("Clusters: Aroma vs Aftertaste")
sns.set_style("white")
plt.tight_layout()
plt.show()# 4. 聚類平均值摘要表 (dplyr group_by + summarise 所有平均值)
cluster_summary = df_km.groupby("cluster").mean().add_suffix("_Average")
print("\nCluster Variable Average Summary:")##
## Cluster Variable Average Summary:
## Aroma_Average ... Category.One.Defects_Average
## cluster ...
## 0 7.72 ... 0.181818
## 1 7.40 ... 0.777778
##
## [2 rows x 10 columns]