From bb039d017acae61806cb96e0ce8b9a9f09a5ad15 Mon Sep 17 00:00:00 2001 From: JHTNT Date: Thu, 5 Jun 2025 17:15:53 +0800 Subject: [PATCH 1/3] add: ex1.ipynb --- ex1.ipynb | 424 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 ex1.ipynb diff --git a/ex1.ipynb b/ex1.ipynb new file mode 100644 index 0000000..ce54dea --- /dev/null +++ b/ex1.ipynb @@ -0,0 +1,424 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install kagglehub\n", + "# !pip install ipywidgets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dWMfSiQ965S2" + }, + "source": [ + "## Import Necessary Package" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "Z9p241Ag6_W4" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import (silhouette_score, accuracy_score, precision_score, recall_score, f1_score,\n", + " roc_auc_score, confusion_matrix, classification_report, precision_recall_curve)\n", + "from sklearn.cluster import KMeans\n", + "from xgboost import XGBClassifier\n", + "import kagglehub\n", + "import optuna\n", + "\n", + "# general setting. 
do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ysBADDUY7ESi" + }, + "source": [ + "## Load Dataset & Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "S9OzKek-7Ly4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/u7539525/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3\n" + ] + } + ], + "source": [ + "# load dataset(from kagglehub)\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "print(path)\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data['Class'] = data['Class'].astype(int)\n", + "\n", + "# prepare data\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OSGjRpDG7Vac" + }, + "source": [ + "## Fraud/Non-Fraud Transactions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HBTR4FUN7dTM", + "outputId": "81ca067b-fa55-419b-ee49-82e308083709" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fraudulent:492, non-fraudulent:284315\n", + "the positive class (frauds) percentage: 492/284807 (0.173%)\n" + ] + } + ], + "source": [ + "fraud = data[data['Class'] == 1]\n", + "nonfraud = data[data['Class'] == 0]\n", + "print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n", + "print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FtFnNrdm8FYr", + "outputId": "fd1b0dd7-9cb9-4781-88e1-3378e045627d" + }, + "outputs": [], + 
"source": [ + "# define evaluation function\n", + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + "\n", + " print(f'\\n{model_name} Evaluation:')\n", + " print('===' * 15)\n", + " print(' Accuracy:', accuracy)\n", + " print(' Precision Score:', precision)\n", + " print(' Recall Score:', recall)\n", + " print(' F1 Score:', f1)\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 監督式學習 (XGBoost)\n", + "\n", + "**Baseline**:\n", + "\n", + "```\n", + "Random forest Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996371850239341\n", + " Precision Score: 0.9411764705882353\n", + " Recall Score: 0.8235294117647058\n", + " F1 Score: 0.8784313725490196\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.94 0.82 0.88 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.91 0.94 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "```\n", + "\n", + "### 說明\n", + "\n", + "XGBoost 是使用 Gradient Boosting 方式,依序訓練多個決策樹,每棵新的樹都會對前一棵樹進行學習跟修正。相較於 Random Forest 的每棵樹彼此獨立,XGBoost 有更高的 accuracy,並且訓練效率更高。\n", + "\n", + "參數調整:\n", + "\n", + "- `enable_categorical`: 使用分類模式。\n", + "- `n_estimators`: 經過多組參數測試,設置 250 的效果最好,設更高結果不再提升。\n", + "- `tree_method`: 分類模式需要使用 `approx` 或 `hist` 演算法,前者兼顧效率與準確度。\n", + "- `device`: 使用 GPU 加速計算。\n", + "- `learning_rate`: 預設值是 0.3,在 40 步之後開始出現 overfitting 的現象,\n", + "- `n_jobs`: -1 表示用所有 CPU 核心進行平行計算。\n", + "\n", + "### 結果\n", + "\n", + "- Accuracy: 0.999637 -> **0.999672**\n", + "- Precision Score: 0.941176 -> **0.95**\n", + "- Recall Score: 0.823529 -> **0.838235**\n", + "- F1 Score: 0.878431 -> **0.890625**" + ] + 
}, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "NwgT6nZQ7le0" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/xgboost/core.py:158: UserWarning: [16:46:20] WARNING: /home/coder/xgboost/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", + "Potential solutions:\n", + "- Use a data structure that matches the device ordinal in the booster.\n", + "- Set the device for booster before call to inplace_predict.\n", + "\n", + "This warning will only be shown once.\n", + "\n", + " warnings.warn(smsg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "XGBoost Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996605924417448\n", + " Precision Score: 0.9495798319327731\n", + " Recall Score: 0.8308823529411765\n", + " F1 Score: 0.8862745098039215\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.95 0.83 0.89 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.92 0.94 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "# split feature and label\n", + "X = np.asarray(data.drop(columns=['Class']))\n", + "Y = np.asarray(data['Class']) # 1-D array\n", + "\n", + "# split training set and data set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n", + "\n", + "model = XGBClassifier(\n", + " enable_categorical=True,\n", + " n_estimators=250,\n", + " tree_method='approx',\n", + " device='cuda',\n", + " learning_rate=0.1,\n", + " n_jobs=-1\n", + ")\n", + "\n", + "# 訓練\n", + "model.fit(X_train, y_train)\n", + "\n", + "# 預測\n", + "y_pred = 
model.predict(X_test)\n", + "evaluation(y_test, y_pred, model_name=\"XGBoost\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WZJ5hAwi8LdR" + }, + "source": [ + "## 非監督式學習(KMeans)\n", + "\n", + "Baseline:\n", + "\n", + "```\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9987242957293166\n", + " Precision Score: 0.782608695652174\n", + " Recall Score: 0.36486486486486486\n", + " F1 Score: 0.4976958525345622\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.78 0.36 0.50 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.68 0.75 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "```\n", + "\n", + "### 調整方法\n", + "\n", + "嘗試使用 PCA 進行降維,去除較不重要的資料,提升判斷的準確性。在嘗試多種參數後,設置 `n_components=0.95` 的提升最大,保留了 27 維的資料。\n", + "\n", + "### 結果\n", + "\n", + "- Accuracy: 0.998724 -> **0.998748**\n", + "- Precision Score: 0.782609 -> **0.788732**\n", + "- Recall Score: 0.364865 -> **0.378378**\n", + "- F1 Score: 0.497696 -> **0.511416**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "NhOX-eo98M0R" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "保留維度數量: 27\n", + "\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9987477031471274\n", + " Precision Score: 0.7887323943661971\n", + " Recall Score: 0.3783783783783784\n", + " F1 Score: 0.5114155251141552\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.79 0.38 0.51 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.69 0.76 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.decomposition import PCA\n", + "\n", + "# Extract features and labels\n", + "X = 
np.asarray(data.drop(columns=['Class']))\n", + "y = np.asarray(data['Class'])\n", + "\n", + "# Split the dataset into training and testing sets (with stratification)\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "\n", + "scaler = StandardScaler()\n", + "x_train = scaler.fit_transform(x_train)\n", + "x_test = scaler.transform(x_test)\n", + "\n", + "pca = PCA(n_components=0.95, random_state=RANDOM_SEED)\n", + "x_train_pca = pca.fit_transform(x_train)\n", + "x_test_pca = pca.transform(x_test)\n", + "\n", + "print(\"保留維度數量:\", x_train_pca.shape[1])\n", + "\n", + "# Select a small sample of normal (non-fraud) data for unsupervised training\n", + "n_x_train = x_train_pca[y_train == 0]\n", + "n_x_train = n_x_train[:1000]\n", + "\n", + "scores = []\n", + "for k in range(2, 5):\n", + " kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "\n", + "optimal_k = np.argmax(scores) + 2\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "y_pred_test = kmeans.predict(x_test_pca)\n", + "\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " else:\n", + " labels[mask] = 0 # Default to normal class\n", + " return labels\n", + "\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n", + "evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 0812e033edfb60bca73c4911a6d7b5be7fde72d1 Mon Sep 17 00:00:00 2001 From: JHTNT Date: Sun, 8 Jun 2025 04:46:10 +0800 Subject: [PATCH 2/3] add: ex2.ipynb --- ex2.ipynb | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 ex2.ipynb diff --git a/ex2.ipynb b/ex2.ipynb new file mode 100644 index 0000000..9d01e9e --- /dev/null +++ b/ex2.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a188f524-d728-4a29-a4eb-a45b5f4474c6", + "metadata": {}, + "source": [ + "## 資料集準備\n", + "\n", + "資料集與 Challenge 1 相同,使用 Kaggle 的 `mlg-ulb/creditcardfraud`,移除 `Time` 欄位並將 `Amount` 標準化。\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a99d6170-7b33-48be-aea4-7dd9c62bf155", + "metadata": {}, + "outputs": [], + "source": [ + "import kagglehub\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.metrics import (\n", + " accuracy_score,\n", + " classification_report,\n", + " f1_score,\n", + " precision_score,\n", + " recall_score,\n", + ")\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from xgboost import XGBClassifier\n", + "\n", + "# general setting. 
do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "# load data\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data[\"Class\"] = data[\"Class\"].astype(int)\n", + "data = data.drop([\"Time\"], axis=1)\n", + "data[\"Amount\"] = StandardScaler().fit_transform(data[\"Amount\"].values.reshape(-1, 1))" + ] + }, + { + "cell_type": "markdown", + "id": "b51fe096-a0d1-4638-814c-4621d3e39f8a", + "metadata": {}, + "source": [ + "## Hybrid Model\n", + "\n", + "Baseline:\n", + "\n", + "```\n", + "Hybrid Mode Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996722961506501\n", + " Precision Score: 0.9285714285714286\n", + " Recall Score: 0.8602941176470589\n", + " F1 Score: 0.8931297709923665\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.93 0.86 0.89 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.96 0.93 0.95 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "071a5660-c3ff-4a83-b302-fcb1de4b3012", + "metadata": {}, + "source": [ + "### 資料處理\n", + "\n", + "首先拆分每筆資料的所有特徵 `X` 與對應的 label `y`,並依照 `TEST_SIZE` 分別拆分成訓練跟測試兩部分。\n", + "\n", + "原本有嘗試做 PCA 嘗試提升準確度,但效果反而更差。推測是因為資料集本身已經做過 PCA 了,再做一次變化不大,參數沒調好而造成反效果。" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f1cd3aaf-87b6-4895-a681-95923bde1fdc", + "metadata": {}, + "outputs": [], + "source": [ + "# split data\n", + "X = np.asarray(data.drop(columns=[\"Class\"]))\n", + "y = np.asarray(data[\"Class\"])\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4a75c72b-1880-4831-be3e-2d581dd119c0", + "metadata": {}, + "source": [ + "### Isolation Forest\n", + "\n", + "Isolation Forest 
可以找出一堆資料中的異常值,很適合用在這個資料集。\n", + "\n", + "參數設定:\n", + "\n", + "- `contamination`: 預期有多少比例的異常值,設為整個資料集的詐騙占比 0.17%。\n", + "- `random_state`: 設置隨機種子,讓相同參數下的實驗結果一致。\n", + "- `n_estimators`: 模型要建立多少棵樹來預測,設一個較大的值 300。\n", + "- `bootstrap`: 讓模型使用會放回的重複抽樣 (Bootstrap Method) 建立訓練過程的子樣本,以增加數的多樣性,讓模型更穩健。\n", + "- `n_jobs`: 設為 -1 使用所有 CPU 核心加速計算。\n", + "\n", + "---\n", + "\n", + "訓練完成後將預測結果作為**新的特徵**加到資料集,將非監督式模型的結果提供給監督式模型參考。" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "a8983ae1-8ab5-45be-85f1-07b33a7b7cb2", + "metadata": {}, + "outputs": [], + "source": [ + "isolation = IsolationForest(\n", + " contamination=0.0017,\n", + " random_state=RANDOM_SEED,\n", + " n_estimators=300,\n", + " bootstrap=True,\n", + " n_jobs=-1\n", + ")\n", + "isolation.fit(X_train)\n", + "\n", + "# use all data to predict\n", + "iso_labels = isolation.predict(X_train)\n", + "iso_labels = (iso_labels == -1).astype(int)\n", + "\n", + "# combine to dataset as a new feature\n", + "X_train = np.hstack([X_train, iso_labels.reshape(-1, 1)])\n", + "iso_pred_test = isolation.predict(X_test)\n", + "iso_feature_test = (iso_pred_test == -1).astype(int)\n", + "X_test = np.hstack((X_test, iso_feature_test.reshape(-1, 1)))" + ] + }, + { + "cell_type": "markdown", + "id": "a4b7822f-368b-4726-9957-54911fc19fbb", + "metadata": {}, + "source": [ + "### XGBoost\n", + "\n", + "監督式學習的部分使用跟 Challenge 1 一樣的 XGBoost,但是使用了加入非監督式模型結果的訓練資料。\n", + "\n", + "參數設置:\n", + "\n", + "- `random_state`: 設置隨機種子,讓相同參數下的實驗結果一致。\n", + "- `enable_categorical`: 使用分類模式。\n", + "- `n_estimators`: 經過多組參數測試,設置 300 的效果最好,設更高結果不再提升。\n", + "- `tree_method`: 分類模式需要使用 `approx` 或 `hist` 演算法,前者兼顧效率與準確度。\n", + "- `device`: 使用 GPU 加速計算。\n", + "- `learning_rate`: 預設值是 0.3,在 40 步之後開始出現 overfitting 的現象,\n", + "- `n_jobs`: -1 表示用所有 CPU 核心進行平行計算。\n", + "\n", + "### 結果\n", + "\n", + "Recall 比 baseline 低,其餘指標皆有提升:\n", + "\n", + "| 指標 | Baseline | My Model |\n", + "|:---------------:|:------------:|:------------:|\n", + "| Accuracy | 0.999672 
| 0.999661 |\n", + "| Precision Score | 0.928571 | **0.949580** |\n", + "| Recall Score | **0.860294** | 0.830882 |\n", + "| F1 Score | **0.893130** | 0.886275 |"
"06f9b692-be14-4dd4-b6af-705b4392ce32", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 9c6ecdb9da3eff308dcf8bb1552d44006014604e Mon Sep 17 00:00:00 2001 From: JHTNT Date: Sun, 8 Jun 2025 04:53:00 +0800 Subject: [PATCH 3/3] docs: update description of ex1 --- ex1.ipynb | 73 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/ex1.ipynb b/ex1.ipynb index ce54dea..b146e2d 100644 --- a/ex1.ipynb +++ b/ex1.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "id": "Z9p241Ag6_W4" }, @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "id": "S9OzKek-7Ly4" }, @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -186,15 +186,19 @@ "\n", "### 結果\n", "\n", - "- Accuracy: 0.999637 -> **0.999672**\n", - "- Precision Score: 0.941176 -> **0.95**\n", - "- Recall Score: 0.823529 -> **0.838235**\n", - "- F1 Score: 0.878431 -> **0.890625**" + "四項指標皆有提升:\n", + "\n", + "| 指標 | Baseline | My Model |\n", + "|:---------------:|:--------:|:------------:|\n", + "| Accuracy | 
0.999637 | **0.999672** |\n", + "| Precision Score | 0.941176 | **0.950000** |\n", + "| Recall Score | 0.823529 | **0.838235** |\n", + "| F1 Score | 0.878431 | **0.890625** |" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "id": "NwgT6nZQ7le0" }, @@ -203,7 +207,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.12/dist-packages/xgboost/core.py:158: UserWarning: [16:46:20] WARNING: /home/coder/xgboost/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", + "/usr/local/lib/python3.12/dist-packages/xgboost/core.py:158: UserWarning: [14:28:18] WARNING: /home/coder/xgboost/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.\n", "Potential solutions:\n", "- Use a data structure that matches the device ordinal in the booster.\n", "- Set the device for booster before call to inplace_predict.\n", @@ -220,19 +224,19 @@ "\n", "XGBoost Evaluation:\n", "=============================================\n", - " Accuracy: 0.9996605924417448\n", - " Precision Score: 0.9495798319327731\n", - " Recall Score: 0.8308823529411765\n", - " F1 Score: 0.8862745098039215\n", + " Accuracy: 0.9996722961506501\n", + " Precision Score: 0.95\n", + " Recall Score: 0.8382352941176471\n", + " F1 Score: 0.890625\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 85307\n", - " 1 0.95 0.83 0.89 136\n", + " 1 0.95 0.84 0.89 136\n", "\n", " accuracy 1.00 85443\n", - " macro avg 0.97 0.92 0.94 85443\n", + " macro avg 0.97 0.92 0.95 85443\n", "weighted avg 1.00 1.00 1.00 85443\n", "\n" ] @@ -294,19 +298,23 @@ "\n", "### 調整方法\n", "\n", - "嘗試使用 PCA 
進行降維,去除較不重要的資料,提升判斷的準確性。在嘗試多種參數後,設置 `n_components=0.95` 的提升最大,保留了 27 維的資料。\n", + "雖然資料集本身已經過 PCA 處理,但或許還能再去除較不重要的資料,提升判斷的準確性。在嘗試多種參數後,設置 `n_components=0.95` 的提升最大,保留了 27 維的資料。\n", "\n", "### 結果\n", "\n", - "- Accuracy: 0.998724 -> **0.998748**\n", - "- Precision Score: 0.782609 -> **0.788732**\n", - "- Recall Score: 0.364865 -> **0.378378**\n", - "- F1 Score: 0.497696 -> **0.511416**" + "四項指標皆有提升:\n", + "\n", + "| 指標 | Baseline | My Model |\n", + "|:---------------:|:--------:|:------------:|\n", + "| Accuracy | 0.998724 | **0.998748** |\n", + "| Precision Score | 0.782609 | **0.788732** |\n", + "| Recall Score | 0.364865 | **0.378378** |\n", + "| F1 Score | 0.497696 | **0.511416** |" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": { "id": "NhOX-eo98M0R" }, @@ -315,24 +323,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "保留維度數量: 27\n", "\n", - "KMeans (Unsupervised) Evaluation:\n", + "Isolation Forest (Unsupervised) Evaluation:\n", "=============================================\n", - " Accuracy: 0.9987477031471274\n", - " Precision Score: 0.7887323943661971\n", - " Recall Score: 0.3783783783783784\n", - " F1 Score: 0.5114155251141552\n", + " Accuracy: 0.9949205903350772\n", + " Precision Score: 0.15789473684210525\n", + " Recall Score: 0.44594594594594594\n", + " F1 Score: 0.2332155477031802\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 85295\n", - " 1 0.79 0.38 0.51 148\n", + " 1 0.16 0.45 0.23 148\n", "\n", - " accuracy 1.00 85443\n", - " macro avg 0.89 0.69 0.76 85443\n", - "weighted avg 1.00 1.00 1.00 85443\n", + " accuracy 0.99 85443\n", + " macro avg 0.58 0.72 0.62 85443\n", + "weighted avg 1.00 0.99 1.00 85443\n", "\n" ] }