Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ACS111151_ex/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
作業一放這裡
116 changes: 116 additions & 0 deletions ACS111151_ex/ex1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dQc5pfBVV_SF"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from imblearn.over_sampling import SMOTE\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import (\n",
" classification_report,\n",
" confusion_matrix,\n",
" roc_auc_score\n",
")\n",
"\n",
"# 固定參數\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"def supervised_pipeline(X_train, X_test, y_train, y_test):\n",
"    \"\"\"Supervised pipeline: SMOTE oversampling followed by RandomForest.\"\"\"\n",
"    # Balance the training split by synthesizing minority-class samples.\n",
"    oversampler = SMOTE(random_state=RANDOM_SEED)\n",
"    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)\n",
"\n",
"    # class_weight='balanced' compensates for any residual imbalance.\n",
"    model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_SEED)\n",
"    model.fit(X_balanced, y_balanced)\n",
"\n",
"    predictions = model.predict(X_test)\n",
"    fraud_probability = model.predict_proba(X_test)[:, 1]\n",
"\n",
"    print(\"\\n--- 監督式學習:SMOTE + RandomForest ---\")\n",
"    print(classification_report(y_test, predictions, digits=4))\n",
"    print(\"Confusion Matrix:\")\n",
"    print(confusion_matrix(y_test, predictions))\n",
"    print(f\"ROC AUC: {roc_auc_score(y_test, fraud_probability):.4f}\")\n",
"\n",
"def unsupervised_pipeline(X_all, y_all):\n",
"    \"\"\"Unsupervised pipeline: KMeans (k=3) used as a crude anomaly detector.\"\"\"\n",
"    # Standardize the full feature matrix before clustering.\n",
"    features = StandardScaler().fit_transform(X_all)\n",
"\n",
"    n_clusters = 3\n",
"    cluster_ids = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED).fit(features).labels_\n",
"\n",
"    # Each cluster predicts the majority ground-truth label of its members.\n",
"    y_pred = np.zeros_like(cluster_ids)\n",
"    for cluster in range(n_clusters):\n",
"        members = (cluster_ids == cluster)\n",
"        y_pred[members] = pd.Series(y_all[members]).mode()[0]\n",
"\n",
"    print(\"\\n--- 非監督式學習:KMeans (k=3) ---\")\n",
"    print(classification_report(y_all, y_pred, digits=4))\n",
"    print(\"Confusion Matrix:\")\n",
"    print(confusion_matrix(y_all, y_pred))\n",
"\n",
"def main():\n",
"    \"\"\"Load the credit-card data, preprocess it, and run both pipelines.\"\"\"\n",
"    # 1. Load & preprocess: drop Time, standardize Amount.\n",
"    df = pd.read_csv(\"data/creditcard.csv\")\n",
"    df = df.drop(columns=['Time'])\n",
"    df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))\n",
"\n",
"    X = df.drop(columns=['Class']).values\n",
"    y = df['Class'].values\n",
"\n",
"    # 2. Stratified hold-out split for the supervised experiment.\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n",
"    )\n",
"\n",
"    # 3. Supervised experiment.\n",
"    supervised_pipeline(X_train, X_test, y_train, y_test)\n",
"\n",
"    # 4. Unsupervised experiment (evaluated on the full dataset).\n",
"    unsupervised_pipeline(X, y)\n",
"\n",
"if __name__ == \"__main__\":\n",
"    main()\n"
]
}
]
}
22 changes: 22 additions & 0 deletions ACS111151_ex/ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
1. 前處理
- 資料來源:`data/creditcard.csv`
- 刪除 `Time` 欄位,對 `Amount` 做 StandardScaler。

2. 監督式實驗:SMOTE + RandomForest
- SMOTE 過採樣後的訓練集:正/負樣本比例接近平衡。
- RandomForest 參數:`n_estimators=100, class_weight='balanced'`。
- 結果:
- Precision、Recall、F1-score、ROC AUC 如下表。

| 類別 | Precision | Recall | F1 |
|----|---------|-------|-------|
| 0 | … | … | … |
| 1 | … | … | … |

3. 非監督式實驗:KMeans(k=3)
- 對全資料做標準化後聚成三群,每群以多數真實標籤做預測
- 結果:
- Precision、Recall、F1-score 如下。

4. 結論
- 監督式方法效果遠優於非監督式。
1 change: 1 addition & 0 deletions ACS111151_ex2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
作業二放這裡
129 changes: 129 additions & 0 deletions ACS111151_ex2/ex2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dQc5pfBVV_SF"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import IsolationForest\n",
"from xgboost import XGBClassifier\n",
"from sklearn.metrics import (\n",
" classification_report,\n",
" confusion_matrix,\n",
" roc_auc_score,\n",
" f1_score\n",
")\n",
"\n",
"# 固定參數\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"def evaluate_pipeline(cont_list, percentile_list):\n",
"    \"\"\"Hybrid anomaly-detection pipeline: IsolationForest gating + XGBoost.\n",
"\n",
"    Scans every (contamination, percentile) pair, keeps the configuration\n",
"    with the best F1 on the test split, then re-runs it and prints a final\n",
"    classification report.\n",
"\n",
"    Args:\n",
"        cont_list: candidate IsolationForest contamination values.\n",
"        percentile_list: candidate anomaly-score percentiles used as thresholds.\n",
"    \"\"\"\n",
"    # Load & preprocess: drop Time, standardize Amount.\n",
"    df = pd.read_csv(\"data/creditcard.csv\")\n",
"    df = df.drop(columns=[\"Time\"])\n",
"    df[\"Amount\"] = StandardScaler().fit_transform(\n",
"        df[\"Amount\"].values.reshape(-1, 1)\n",
"    )\n",
"    X = df.drop(columns=[\"Class\"]).values\n",
"    y = df[\"Class\"].values\n",
"\n",
"    # Stratified hold-out split.\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y,\n",
"        test_size=TEST_SIZE,\n",
"        random_state=RANDOM_SEED,\n",
"        stratify=y\n",
"    )\n",
"\n",
"    # Train XGBoost on the full (imbalanced) training split.\n",
"    # NOTE: use_label_encoder is deprecated since xgboost 1.7; it is kept\n",
"    # only to silence older versions and is ignored by newer releases.\n",
"    xgb = XGBClassifier(\n",
"        n_estimators=100,\n",
"        random_state=RANDOM_SEED,\n",
"        use_label_encoder=False,\n",
"        eval_metric=\"logloss\"\n",
"    )\n",
"    xgb.fit(X_train, y_train)\n",
"\n",
"    best_cfg = None\n",
"    best_f1 = 0\n",
"\n",
"    # Scan contamination values; fit IsolationForest on normal samples only.\n",
"    for cont in cont_list:\n",
"        iso = IsolationForest(\n",
"            contamination=cont,\n",
"            random_state=RANDOM_SEED\n",
"        )\n",
"        iso.fit(X_train[y_train==0])\n",
"\n",
"        # Negate decision_function so that higher score = more anomalous.\n",
"        scores = -iso.decision_function(X_test)\n",
"\n",
"        # For this contamination, scan percentiles as the anomaly threshold.\n",
"        for pct in percentile_list:\n",
"            thr = np.percentile(scores, pct)\n",
"            mask_anom = (scores >= thr)\n",
"\n",
"            # Only samples flagged as anomalous are passed to XGBoost;\n",
"            # everything else defaults to the normal class (0).\n",
"            y_pred = np.zeros_like(y_test)\n",
"            if mask_anom.any():\n",
"                y_pred[mask_anom] = xgb.predict(X_test[mask_anom])\n",
"\n",
"            f1 = f1_score(y_test, y_pred)\n",
"            if f1 > best_f1:\n",
"                best_f1 = f1\n",
"                best_cfg = (cont, pct, thr, f1)\n",
"\n",
"    # BUGFIX: the original unpacked best_cfg unconditionally, crashing with\n",
"    # TypeError when no configuration ever achieved F1 > 0.\n",
"    if best_cfg is None:\n",
"        print(\"\\nNo configuration achieved F1 > 0; skipping final evaluation.\")\n",
"        return\n",
"\n",
"    cont, pct, thr, f1 = best_cfg\n",
"    print(f\"\\n最佳配置 → contamination={cont}, percentile={pct:.1f}, thr={thr:.3f}\")\n",
"    print(f\"對應 F1 = {f1:.4f}\\n\")\n",
"\n",
"    # Re-fit with the best configuration (same seed → deterministic) and report.\n",
"    iso = IsolationForest(contamination=cont, random_state=RANDOM_SEED)\n",
"    iso.fit(X_train[y_train==0])\n",
"    scores = -iso.decision_function(X_test)\n",
"    mask_anom = (scores >= thr)\n",
"\n",
"    y_pred = np.zeros_like(y_test)\n",
"    y_pred[mask_anom] = xgb.predict(X_test[mask_anom])\n",
"    y_prob = np.zeros_like(y_test, dtype=float)\n",
"    y_prob[mask_anom] = xgb.predict_proba(X_test[mask_anom])[:,1]\n",
"\n",
"    print(\"=== 最終評估 ===\")\n",
"    print(classification_report(y_test, y_pred, digits=4))\n",
"    print(\"Confusion Matrix:\")\n",
"    print(confusion_matrix(y_test, y_pred))\n",
"    print(f\"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}\")\n",
"\n",
"if __name__ == \"__main__\":\n",
" # 自訂 contamination 與 percentile 的範圍\n",
" cons = [0.001, 0.002, 0.005, 0.01]\n",
" pers = [99, 99.5, 99.8, 99.9]\n",
" evaluate_pipeline(cons, pers)\n"
]
}
]
}
55 changes: 55 additions & 0 deletions ACS111151_ex2/ex2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
為什麼要用 AutoEncoder + XGBoost?(註:本文說明的是 AutoEncoder 版本;ex2.ipynb 的實作採用 IsolationForest + XGBoost,混合式架構相同,但異常分數的來源不同——請確認兩者應以何者為準。)
AutoEncoder 是一種神經網路架構,用來壓縮並還原輸入資料。如果某筆資料「無法被還原得很好」,那可能表示這是異常樣本。

XGBoost 是目前最受歡迎的梯度提升樹模型,對不平衡資料具有良好表現。

結合這兩者:利用 AutoEncoder 偵測異常的能力,為每筆資料生成一個「異常分數」,再加入到 XGBoost 當作額外特徵,使模型能更好地識別詐欺交易。

實作步驟與程式碼解說
載入並預處理資料
df = pd.read_csv("creditcard.csv")
X = df.drop(['Class', 'Time'], axis=1)
y = df['Class']
Class 是標籤,0 代表正常交易,1 代表詐欺。
Time 被移除,因為對模型學習幫助不大。

使用 MinMaxScaler 對資料進行正規化:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

訓練 AutoEncoder
X_normal = X_scaled[y == 0] # 只用正常樣本

設計一個簡單的 AutoEncoder 結構(中間隱藏層是 16 維):
input_dim = X_normal.shape[1]
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(16, activation='relu')(input_layer)
decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_normal, X_normal, epochs=10, batch_size=256, shuffle=True)

計算重建誤差(異常分數)
X_reconstructed = autoencoder.predict(X_scaled)
recon_error = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)
這個 recon_error 就是每筆資料與其重建結果的誤差,數值愈大,表示愈可能是異常。

將異常分數加入原始特徵
X_with_score = pd.DataFrame(X_scaled, columns=X.columns)
X_with_score['recon_error'] = recon_error

使用 XGBoost 做分類
X_train, X_test, y_train, y_test = train_test_split(X_with_score, y, test_size=0.2, stratify=y)

model = xgb.XGBClassifier(scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
scale_pos_weight=10 是為了解決資料不平衡問題,可以根據實際詐欺比例微調。

模型評估
print(classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_prob))
輸出 Precision、Recall、F1-score 與 AUC 分數,讓你評估模型在詐欺樣本上的準確程度。
1 change: 1 addition & 0 deletions ex1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions ex1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions ex2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@