Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions ex1/ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
1. 監督式學習(隨機森林)
設定 n_estimators=100(森林中使用 100 棵決策樹)

參數:
max_depth=20:限制樹深度防止過擬合
class_weight='balanced_subsample':自動調整權重

並且透過PR曲線尋找最佳 threshold,來最大化F1分數

2. 非監督式學習(K-Means)
僅使用正常交易數據訓練模型(模擬真實場景)

自動尋找最佳的 cluster count(2-4範圍)
使用 silhouette score 評估 clustering 效果

標籤對齊:將 clusters 結果映射到真實的標籤,並且基於多數決原則分配類別
155 changes: 155 additions & 0 deletions ex1/ex1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import numpy as np # 數值計算庫
import pandas as pd # 數據處理庫

# 機器學習工具
from sklearn.model_selection import train_test_split # 數據分割
from sklearn.preprocessing import StandardScaler # 特徵標準化
from sklearn.metrics import classification_report # 分類報告
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 評估指標
from sklearn.metrics import precision_recall_curve # PR曲線

import kagglehub # Kaggle數據集下載

# 機器學習模型
from sklearn.ensemble import RandomForestClassifier # 監督式學習模型
from sklearn.cluster import KMeans # 非監督式學習模型
from sklearn.metrics import silhouette_score # 聚類效果評估

# ======================
# Global settings
# ======================
RANDOM_SEED = 42 # random seed for reproducible results
TEST_SIZE = 0.3 # test-set fraction (must not be changed)

# ======================
# Evaluation helper
# ======================
def evaluation(y_true, y_pred, model_name="Model"):
    """Print standard binary-classification metrics for one model.

    Args:
        y_true: ground-truth labels (0 = normal, 1 = fraud).
        y_pred: hard predicted labels.
        model_name: heading used in the printed report.
    """
    # zero_division=0 keeps every metric defined (and warning-free) when a
    # class is absent from the predictions. The original guarded only
    # precision, so recall/F1 could raise UndefinedMetricWarning; the
    # returned values (0.0) are unchanged.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Emit the summary followed by sklearn's detailed per-class report
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print(' Accuracy:', accuracy)
    print(' Precision Score:', precision)
    print(' Recall Score:', recall)
    print(' F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# ======================
# Data preparation
# ======================
# Download the credit-card fraud dataset from Kaggle
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
# Load the CSV file
data = pd.read_csv(f"{path}/creditcard.csv")
# Ensure the target column is integer typed
data['Class'] = data['Class'].astype(int)

# Preprocessing
data = data.drop(['Time'], axis=1) # drop the Time feature
# Standardize the transaction-amount feature
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Inspect the class distribution
fraud = data[data['Class'] == 1] # fraudulent transactions
nonfraud = data[data['Class'] == 0] # normal transactions
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
# Report the fraud percentage
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

# Features and labels
X = data.drop(columns=['Class']).values # feature matrix
Y = data['Class'].values # target vector

# ============================================
# Supervised learning section (Random Forest)
# ============================================
# Train/test split (NOTE: deliberately without stratified sampling here,
# unlike the K-Means section below)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# Random-forest classifier
rf_model = RandomForestClassifier(
    n_estimators=100, # number of decision trees
    max_depth=20, # cap tree depth to limit overfitting
    class_weight='balanced_subsample', # rebalance class weights per bootstrap sample (imbalanced data)
    random_state=RANDOM_SEED # reproducibility
)
# Fit the model
rf_model.fit(x_train, y_train)

# Prediction and evaluation
y_prob = rf_model.predict_proba(x_test)[:, 1] # predicted fraud probability
# Precision-recall curve over candidate thresholds
prec, rec, thresh = precision_recall_curve(y_test, y_prob)
# F1 per threshold (small epsilon avoids division by zero)
f1 = 2 * (prec * rec) / (prec + rec + 1e-6)
best_idx = np.argmax(f1) # index of the best F1
best_threshold = thresh[best_idx] # probability cutoff that maximizes F1
y_pred = (y_prob >= best_threshold).astype(int) # apply the tuned threshold

# Evaluate the supervised model
evaluation(y_test, y_pred, model_name="RandomForestClassifier(Supervised)")

# ============================================
# Unsupervised learning section (K-Means)
# ============================================
# Re-split the dataset, this time stratified to preserve the class ratio
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# Feature standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train) # fit on the training set, then transform it
x_test = scaler.transform(x_test) # reuse the training statistics on the test set

# Train only on normal transactions (simulating having no fraud labels)
n_x_train = x_train[y_train == 0] # keep normal rows only
n_x_train = n_x_train[:1000] # subsample 1000 rows

# Search for the best cluster count k in the range [2, 4]
scores = []
for k in range(2, 5):
    # K-Means model for this candidate k
    kmeans = KMeans(
        n_clusters=k, # number of clusters
        init='k-means++', # smarter centroid seeding
        random_state=RANDOM_SEED # reproducibility
    )
    kmeans.fit(n_x_train) # fit on the normal subsample
    # Silhouette score rates cluster cohesion/separation
    score = silhouette_score(n_x_train, kmeans.labels_)
    scores.append(score)

# Choose the k with the highest silhouette score
optimal_k = np.argmax(scores) + 2 # list index -> actual k value
# Refit with the chosen k
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train) # retrain

# Cluster assignments for the test set
y_pred_test = kmeans.predict(x_test)

# Map raw cluster IDs onto the true class labels
def align_labels(y_true, y_pred, n_clusters):
    """Assign each cluster the majority ground-truth class of its members.

    Args:
        y_true: 1-D array of ground-truth labels (non-negative ints).
        y_pred: 1-D array of cluster IDs (e.g. from KMeans.predict).
        n_clusters: number of cluster IDs to consider.

    Returns:
        Array shaped like y_pred holding the aligned class labels.
        Ties go to the lower class index (np.bincount(...).argmax());
        samples of empty clusters keep the initialized default 0.
    """
    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = (y_pred == cluster_id)
        # Guard against empty clusters: np.bincount on an empty slice would
        # fail, and the zeros initialization already encodes the default 0.
        # (The original's `else: labels[mask] = 0` was a no-op on an
        # all-False mask and has been dropped.)
        if np.sum(mask) > 0:
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Align cluster IDs with the true labels (majority vote per cluster)
y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)

# Evaluate the unsupervised model
evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")
Binary file added ex1/ex1_監督式與非監督式.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 19 additions & 0 deletions ex2/ex2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# 非監督式學習階段:Isolation Forest
核心概念:Anomalous data points 比 normal points 更容易被「隔離」
運作方式:通過隨機選擇特徵和分割值構建 decision trees

優勢:能更有效率地處理高維數據,且計算複雜度較低

## 程式內容

contamination 參數:設定預期異常值比例,影響判定 threshold
不使用標籤:完全基於數據分布特性檢測異常
輸出轉換:將預測結果作為新特徵加入原始數據

初步篩選可疑的交易內容,透過「異常程度」的量化指標,增強後續監督模型的 feature space

# 監督式學習階段:XGBoost
核心概念:串聯多個 weak learners 形成 strong predictor

優勢:處理不平衡數據能力強,防止過擬合
XGBoost 的特點:內建正則化,自動處理缺失的數值
97 changes: 97 additions & 0 deletions ex2/ex2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
import kagglehub

# General settings
RANDOM_SEED = 42 # random seed for reproducible results
TEST_SIZE = 0.3 # test-set fraction

# Evaluation helper
def evaluation(y_true, y_pred, model_name="Model"):
    """Print summary metrics and sklearn's classification report.

    Args:
        y_true: ground-truth labels (0 = normal, 1 = fraud).
        y_pred: hard predicted labels.
        model_name: heading used in the printed report.
    """
    # Collect the four headline metrics; insertion order drives print order.
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision Score': precision_score(y_true, y_pred, zero_division=0),
        'Recall Score': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
    }

    print(f'\n{model_name} Evaluation:')
    print('=' * 45)
    for label, value in scores.items():
        print(f' {label}:', value)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# Download the credit-card fraud dataset from Kaggle
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int) # ensure integer-typed target

# Preprocessing
data = data.drop(['Time'], axis=1) # drop the Time feature
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)) # standardize Amount

X = data.drop(columns=['Class']).values # feature matrix
Y = data['Class'].values # target vector

# Stratified train/test split preserves the fraud ratio in both sets
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# Standardize all features: fit on the training set only, transform both
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Unsupervised stage: IsolationForest anomaly detector
iso_model = IsolationForest(
    n_estimators=100, # number of isolation trees
    contamination=0.01, # expected anomaly fraction; sets the decision threshold
    random_state=RANDOM_SEED # reproducibility
)
iso_model.fit(x_train) # fit without any labels

# Predict anomaly flags to use as a new feature (-1 = anomaly, 1 = normal)
iso_train_scores = iso_model.predict(x_train)
iso_test_scores = iso_model.predict(x_test)

# Append the anomaly flag as one extra column
x_train_enhanced = np.concatenate([x_train, iso_train_scores.reshape(-1, 1)], axis=1)
x_test_enhanced = np.concatenate([x_test, iso_test_scores.reshape(-1, 1)], axis=1)

# Supervised stage: XGBoost on the anomaly-augmented features
xgb_model = XGBClassifier(
    n_estimators=400, # boosting rounds
    max_depth=8, # maximum tree depth
    learning_rate=0.03, # shrinkage per round
    subsample=0.8, # row subsampling per tree
    colsample_bytree=0.8, # column subsampling per tree
    # Upweight the positive (fraud) class by ~(#negatives / #positives);
    # max(..., 1) guards against a split containing no positives.
    scale_pos_weight=(len(y_train[y_train == 0]) / max(len(y_train[y_train == 1]), 1)),
    eval_metric='logloss', # evaluation metric during training
    random_state=RANDOM_SEED # reproducibility
)
# NOTE(review): dropped the original `use_label_encoder=False` argument —
# it was deprecated in XGBoost 1.6 and removed in 2.x, where passing it
# only triggers an "unused parameter" warning.
xgb_model.fit(x_train_enhanced, y_train)

# Predicted fraud probabilities
y_prob = xgb_model.predict_proba(x_test_enhanced)[:, 1]

# Find the F1-optimal threshold on the precision-recall curve
prec, rec, thresh = precision_recall_curve(y_test, y_prob)
f1 = 2 * (prec * rec) / (prec + rec + 1e-6) # epsilon avoids division by zero
best_idx = np.argmax(f1)
best_threshold = thresh[best_idx]

# Apply a manual floor on the threshold to push precision higher
manual_threshold = max(best_threshold, 0.6)
y_pred = (y_prob >= manual_threshold).astype(int)

# Evaluate
evaluation(y_test, y_pred, model_name="XGBoostClassifier (Supervised after IsolationForest)")
Binary file added ex2/ex2_img.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.