From bb039d017acae61806cb96e0ce8b9a9f09a5ad15 Mon Sep 17 00:00:00 2001 From: JHTNT Date: Thu, 5 Jun 2025 17:15:53 +0800 Subject: [PATCH 1/3] add: ex1.ipynb --- ex1.ipynb | 424 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 ex1.ipynb diff --git a/ex1.ipynb b/ex1.ipynb new file mode 100644 index 0000000..ce54dea --- /dev/null +++ b/ex1.ipynb @@ -0,0 +1,424 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install kagglehub\n", + "# !pip install ipywidgets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dWMfSiQ965S2" + }, + "source": [ + "## Import Necessary Package" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Z9p241Ag6_W4" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import (silhouette_score, accuracy_score, precision_score, recall_score, f1_score,\n", + " roc_auc_score, confusion_matrix, classification_report, precision_recall_curve)\n", + "from sklearn.cluster import KMeans\n", + "from xgboost import XGBClassifier\n", + "import kagglehub\n", + "import optuna\n", + "\n", + "# general setting. 
do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysBADDUY7ESi" + }, + "source": [ + "## Load Dataset & Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "S9OzKek-7Ly4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/u7539525/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3\n" + ] + } + ], + "source": [ + "# load dataset(from kagglehub)\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "print(path)\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data['Class'] = data['Class'].astype(int)\n", + "\n", + "# prepare data\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OSGjRpDG7Vac" + }, + "source": [ + "## Fraud/Non-Fraud Transactions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HBTR4FUN7dTM", + "outputId": "81ca067b-fa55-419b-ee49-82e308083709" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fraudulent:492, non-fraudulent:284315\n", + "the positive class (frauds) percentage: 492/284807 (0.173%)\n" + ] + } + ], + "source": [ + "fraud = data[data['Class'] == 1]\n", + "nonfraud = data[data['Class'] == 0]\n", + "print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n", + "print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FtFnNrdm8FYr", + "outputId": "fd1b0dd7-9cb9-4781-88e1-3378e045627d" + }, + "outputs": [], + 
"source": [ + "# define evaluation function\n", + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + "\n", + " print(f'\\n{model_name} Evaluation:')\n", + " print('===' * 15)\n", + " print(' Accuracy:', accuracy)\n", + " print(' Precision Score:', precision)\n", + " print(' Recall Score:', recall)\n", + " print(' F1 Score:', f1)\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 監督式學習 (XGBoost)\n", + "\n", + "**Baseline**:\n", + "\n", + "```\n", + "Random forest Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996371850239341\n", + " Precision Score: 0.9411764705882353\n", + " Recall Score: 0.8235294117647058\n", + " F1 Score: 0.8784313725490196\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.94 0.82 0.88 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.91 0.94 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "```\n", + "\n", + "### 說明\n", + "\n", + "XGBoost 是使用 Gradient Boosting 方式,依序訓練多個決策樹,每棵新的樹都會對前一棵樹進行學習跟修正。相較於 Random Forest 的每棵樹彼此獨立,XGBoost 有更高的 accuracy,並且訓練效率更高。\n", + "\n", + "參數調整:\n", + "\n", + "- `enable_categorical`: 使用分類模式。\n", + "- `n_estimators`: 經過多組參數測試,設置 250 的效果最好,設更高結果不再提升。\n", + "- `tree_method`: 分類模式需要使用 `approx` 或 `hist` 演算法,前者兼顧效率與準確度。\n", + "- `device`: 使用 GPU 加速計算。\n", + "- `learning_rate`: 預設值是 0.3,在 40 步之後開始出現 overfitting 的現象,\n", + "- `n_jobs`: -1 表示用所有 CPU 核心進行平行計算。\n", + "\n", + "### 結果\n", + "\n", + "- Accuracy: 0.999637 -> **0.999672**\n", + "- Precision Score: 0.941176 -> **0.95**\n", + "- Recall Score: 0.823529 -> **0.838235**\n", + "- F1 Score: 0.878431 -> **0.890625**" + ] + 
}, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "NwgT6nZQ7le0" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/xgboost/core.py:158: UserWarning: [16:46:20] WARNING: /home/coder/xgboost/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", + "Potential solutions:\n", + "- Use a data structure that matches the device ordinal in the booster.\n", + "- Set the device for booster before call to inplace_predict.\n", + "\n", + "This warning will only be shown once.\n", + "\n", + " warnings.warn(smsg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "XGBoost Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996605924417448\n", + " Precision Score: 0.9495798319327731\n", + " Recall Score: 0.8308823529411765\n", + " F1 Score: 0.8862745098039215\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.95 0.83 0.89 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.92 0.94 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "# split feature and label\n", + "X = np.asarray(data.drop(columns=['Class']))\n", + "Y = np.asarray(data['Class']) # 1-D array\n", + "\n", + "# split training set and data set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n", + "\n", + "model = XGBClassifier(\n", + " enable_categorical=True,\n", + " n_estimators=250,\n", + " tree_method='approx',\n", + " device='cuda',\n", + " learning_rate=0.1,\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# 訓練\n", + "model.fit(X_train, y_train)\n", + "\n", + "# 預測\n", + "y_pred = 
model.predict(X_test)\n", + "evaluation(y_test, y_pred, model_name=\"XGBoost\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WZJ5hAwi8LdR" + }, + "source": [ + "## 非監督式學習(KMeans)\n", + "\n", + "Baseline:\n", + "\n", + "```\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9987242957293166\n", + " Precision Score: 0.782608695652174\n", + " Recall Score: 0.36486486486486486\n", + " F1 Score: 0.4976958525345622\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.78 0.36 0.50 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.68 0.75 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "```\n", + "\n", + "### 調整方法\n", + "\n", + "嘗試使用 PCA 進行降維,去除較不重要的資料,提升判斷的準確性。在嘗試多種參數後,設置 `n_components=0.95` 的提升最大,保留了 27 維的資料。\n", + "\n", + "### 結果\n", + "\n", + "- Accuracy: 0.998724 -> **0.998748**\n", + "- Precision Score: 0.782609 -> **0.788732**\n", + "- Recall Score: 0.364865 -> **0.378378**\n", + "- F1 Score: 0.497696 -> **0.511416**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "NhOX-eo98M0R" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "保留維度數量: 27\n", + "\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9987477031471274\n", + " Precision Score: 0.7887323943661971\n", + " Recall Score: 0.3783783783783784\n", + " F1 Score: 0.5114155251141552\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.79 0.38 0.51 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.69 0.76 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.decomposition import PCA\n", + "\n", + "# Extract features and labels\n", + "X = 
np.asarray(data.drop(columns=['Class']))\n", + "y = np.asarray(data['Class'])\n", + "\n", + "# Split the dataset into training and testing sets (with stratification)\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "\n", + "scaler = StandardScaler()\n", + "x_train = scaler.fit_transform(x_train)\n", + "x_test = scaler.transform(x_test)\n", + "\n", + "pca = PCA(n_components=0.95, random_state=RANDOM_SEED)\n", + "x_train_pca = pca.fit_transform(x_train)\n", + "x_test_pca = pca.transform(x_test)\n", + "\n", + "print(\"保留維度數量:\", x_train_pca.shape[1])\n", + "\n", + "# Select a small sample of normal (non-fraud) data for unsupervised training\n", + "n_x_train = x_train_pca[y_train == 0]\n", + "n_x_train = n_x_train[:1000]\n", + "\n", + "scores = []\n", + "for k in range(2, 5):\n", + " kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "\n", + "optimal_k = np.argmax(scores) + 2\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "y_pred_test = kmeans.predict(x_test_pca)\n", + "\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " else:\n", + " labels[mask] = 0 # Default to normal class\n", + " return labels\n", + "\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n", + "evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 0812e033edfb60bca73c4911a6d7b5be7fde72d1 Mon Sep 17 00:00:00 2001 From: JHTNT Date: Sun, 8 Jun 2025 04:46:10 +0800 Subject: [PATCH 2/3] add: ex2.ipynb --- ex2.ipynb | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 ex2.ipynb diff --git a/ex2.ipynb b/ex2.ipynb new file mode 100644 index 0000000..9d01e9e --- /dev/null +++ b/ex2.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a188f524-d728-4a29-a4eb-a45b5f4474c6", + "metadata": {}, + "source": [ + "## 資料集準備\n", + "\n", + "資料集與 Challenge 1 相同,使用 Kaggle 的 `mlg-ulb/creditcardfraud`,移除 `Time` 欄位並將 `Amount` 標準化。\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a99d6170-7b33-48be-aea4-7dd9c62bf155", + "metadata": {}, + "outputs": [], + "source": [ + "import kagglehub\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.metrics import (\n", + " accuracy_score,\n", + " classification_report,\n", + " f1_score,\n", + " precision_score,\n", + " recall_score,\n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from xgboost import XGBClassifier\n", + "\n", + "# general setting. 
do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "# load data\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data[\"Class\"] = data[\"Class\"].astype(int)\n", + "data = data.drop([\"Time\"], axis=1)\n", + "data[\"Amount\"] = StandardScaler().fit_transform(data[\"Amount\"].values.reshape(-1, 1))" + ] + }, + { + "cell_type": "markdown", + "id": "b51fe096-a0d1-4638-814c-4621d3e39f8a", + "metadata": {}, + "source": [ + "## Hybrid Model\n", + "\n", + "Baseline:\n", + "\n", + "```\n", + "Hybrid Mode Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996722961506501\n", + " Precision Score: 0.9285714285714286\n", + " Recall Score: 0.8602941176470589\n", + " F1 Score: 0.8931297709923665\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.93 0.86 0.89 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.96 0.93 0.95 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "071a5660-c3ff-4a83-b302-fcb1de4b3012", + "metadata": {}, + "source": [ + "### 資料處理\n", + "\n", + "首先拆分每筆資料的所有特徵 `X` 與對應的 label `y`,並依照 `TEST_SIZE` 分別拆分成訓練跟測試兩部分。\n", + "\n", + "原本有嘗試做 PCA 嘗試提升準確度,但效果反而更差。推測是因為資料集本身已經做過 PCA 了,再做一次變化不大,參數沒調好而造成反效果。" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f1cd3aaf-87b6-4895-a681-95923bde1fdc", + "metadata": {}, + "outputs": [], + "source": [ + "# split data\n", + "X = np.asarray(data.drop(columns=[\"Class\"]))\n", + "y = np.asarray(data[\"Class\"])\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4a75c72b-1880-4831-be3e-2d581dd119c0", + "metadata": {}, + "source": [ + "### Isolation Forest\n", + "\n", + "Isolation Forest 
可以找出一堆資料中的異常值,很適合用在這個資料集。\n", + "\n", + "參數設定:\n", + "\n", + "- `contamination`: 預期有多少比例的異常值,設為整個資料集的詐騙占比 0.17%。\n", + "- `random_state`: 設置隨機種子,讓相同參數下的實驗結果一致。\n", + "- `n_estimators`: 模型要建立多少棵樹來預測,設一個較大的值 300。\n", + "- `bootstrap`: 讓模型使用會放回的重複抽樣 (Bootstrap Method) 建立訓練過程的子樣本,以增加數的多樣性,讓模型更穩健。\n", + "- `n_jobs`: 設為 -1 使用所有 CPU 核心加速計算。\n", + "\n", + "---\n", + "\n", + "訓練完成後將預測結果作為**新的特徵**加到資料集,將非監督式模型的結果提供給監督式模型參考。" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "a8983ae1-8ab5-45be-85f1-07b33a7b7cb2", + "metadata": {}, + "outputs": [], + "source": [ + "isolation = IsolationForest(\n", + " contamination=0.0017,\n", + " random_state=RANDOM_SEED,\n", + " n_estimators=300,\n", + " bootstrap=True,\n", + " n_jobs=-1\n", + ")\n", + "isolation.fit(X_train)\n", + "\n", + "# use all data to predict\n", + "iso_labels = isolation.predict(X_train)\n", + "iso_labels = (iso_labels == -1).astype(int)\n", + "\n", + "# combine to dataset as a new feature\n", + "X_train = np.hstack([X_train, iso_labels.reshape(-1, 1)])\n", + "iso_pred_test = isolation.predict(X_test)\n", + "iso_feature_test = (iso_pred_test == -1).astype(int)\n", + "X_test = np.hstack((X_test, iso_feature_test.reshape(-1, 1)))" + ] + }, + { + "cell_type": "markdown", + "id": "a4b7822f-368b-4726-9957-54911fc19fbb", + "metadata": {}, + "source": [ + "### XGBoost\n", + "\n", + "監督式學習的部分使用跟 Challenge 1 一樣的 XGBoost,但是使用了加入非監督式模型結果的訓練資料。\n", + "\n", + "參數設置:\n", + "\n", + "- `random_state`: 設置隨機種子,讓相同參數下的實驗結果一致。\n", + "- `enable_categorical`: 使用分類模式。\n", + "- `n_estimators`: 經過多組參數測試,設置 300 的效果最好,設更高結果不再提升。\n", + "- `tree_method`: 分類模式需要使用 `approx` 或 `hist` 演算法,前者兼顧效率與準確度。\n", + "- `device`: 使用 GPU 加速計算。\n", + "- `learning_rate`: 預設值是 0.3,在 40 步之後開始出現 overfitting 的現象,\n", + "- `n_jobs`: -1 表示用所有 CPU 核心進行平行計算。\n", + "\n", + "### 結果\n", + "\n", + "Recall 比 baseline 低,其餘指標皆有提升:\n", + "\n", + "| 指標 | Baseline | My Model |\n", + "|:---------------:|:------------:|:------------:|\n", + "| Accuracy | 0.999672 
| 0.999661 |\n", + "| Precision Score | 0.928571 | **0.949580** |\n", + "| Recall Score | **0.860294** | 0.830882 |\n", + "| F1 Score | **0.893130** | 0.886275 |"
"06f9b692-be14-4dd4-b6af-705b4392ce32", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9c6ecdb9da3eff308dcf8bb1552d44006014604e Mon Sep 17 00:00:00 2001 From: JHTNT Date: Sun, 8 Jun 2025 04:53:00 +0800 Subject: [PATCH 3/3] docs: update description of ex1 --- ex1.ipynb | 73 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/ex1.ipynb b/ex1.ipynb index ce54dea..b146e2d 100644 --- a/ex1.ipynb +++ b/ex1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "id": "Z9p241Ag6_W4" }, @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "id": "S9OzKek-7Ly4" }, @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -186,15 +186,19 @@ "\n", "### 結果\n", "\n", - "- Accuracy: 0.999637 -> **0.999672**\n", - "- Precision Score: 0.941176 -> **0.95**\n", - "- Recall Score: 0.823529 -> **0.838235**\n", - "- F1 Score: 0.878431 -> **0.890625**" + "四項指標皆有提升:\n", + "\n", + "| 指標 | Baseline | My Model |\n", + "|:---------------:|:--------:|:------------:|\n", + "| Accuracy | 
0.999637 | **0.999672** |\n", + "| Precision Score | 0.941176 | **0.950000** |\n", + "| Recall Score | 0.823529 | **0.838235** |\n", + "| F1 Score | 0.878431 | **0.890625** |" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "id": "NwgT6nZQ7le0" }, @@ -203,7 +207,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.12/dist-packages/xgboost/core.py:158: UserWarning: [16:46:20] WARNING: /home/coder/xgboost/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", + "/usr/local/lib/python3.12/dist-packages/xgboost/core.py:158: UserWarning: [14:28:18] WARNING: /home/coder/xgboost/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", "Potential solutions:\n", "- Use a data structure that matches the device ordinal in the booster.\n", "- Set the device for booster before call to inplace_predict.\n", @@ -220,19 +224,19 @@ "\n", "XGBoost Evaluation:\n", "=============================================\n", - " Accuracy: 0.9996605924417448\n", - " Precision Score: 0.9495798319327731\n", - " Recall Score: 0.8308823529411765\n", - " F1 Score: 0.8862745098039215\n", + " Accuracy: 0.9996722961506501\n", + " Precision Score: 0.95\n", + " Recall Score: 0.8382352941176471\n", + " F1 Score: 0.890625\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 85307\n", - " 1 0.95 0.83 0.89 136\n", + " 1 0.95 0.84 0.89 136\n", "\n", " accuracy 1.00 85443\n", - " macro avg 0.97 0.92 0.94 85443\n", + " macro avg 0.97 0.92 0.95 85443\n", "weighted avg 1.00 1.00 1.00 85443\n", "\n" ] @@ -294,19 +298,23 @@ "\n", "### 調整方法\n", "\n", - "嘗試使用 PCA 
進行降維,去除較不重要的資料,提升判斷的準確性。在嘗試多種參數後,設置 `n_components=0.95` 的提升最大,保留了 27 維的資料。\n", + "雖然資料集本身已經過 PCA 處理,但或許還能再去除較不重要的資料,提升判斷的準確性。在嘗試多種參數後,設置 `n_components=0.95` 的提升最大,保留了 27 維的資料。\n", "\n", "### 結果\n", "\n", - "- Accuracy: 0.998724 -> **0.998748**\n", - "- Precision Score: 0.782609 -> **0.788732**\n", - "- Recall Score: 0.364865 -> **0.378378**\n", - "- F1 Score: 0.497696 -> **0.511416**" + "四項指標皆有提升:\n", + "\n", + "| 指標 | Baseline | My Model |\n", + "|:---------------:|:--------:|:------------:|\n", + "| Accuracy | 0.998724 | **0.998748** |\n", + "| Precision Score | 0.782609 | **0.788732** |\n", + "| Recall Score | 0.364865 | **0.378378** |\n", + "| F1 Score | 0.497696 | **0.511416** |" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": { "id": "NhOX-eo98M0R" }, @@ -315,24 +323,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "保留維度數量: 27\n", "\n", - "KMeans (Unsupervised) Evaluation:\n", + "Isolation Forest (Unsupervised) Evaluation:\n", "=============================================\n", - " Accuracy: 0.9987477031471274\n", - " Precision Score: 0.7887323943661971\n", - " Recall Score: 0.3783783783783784\n", - " F1 Score: 0.5114155251141552\n", + " Accuracy: 0.9949205903350772\n", + " Precision Score: 0.15789473684210525\n", + " Recall Score: 0.44594594594594594\n", + " F1 Score: 0.2332155477031802\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 85295\n", - " 1 0.79 0.38 0.51 148\n", + " 1 0.16 0.45 0.23 148\n", "\n", - " accuracy 1.00 85443\n", - " macro avg 0.89 0.69 0.76 85443\n", - "weighted avg 1.00 1.00 1.00 85443\n", + " accuracy 0.99 85443\n", + " macro avg 0.58 0.72 0.62 85443\n", + "weighted avg 1.00 0.99 1.00 85443\n", "\n" ] }