diff --git a/ACS111150_ex1/ex1.ipynb b/ACS111150_ex1/ex1.ipynb new file mode 100644 index 0000000..e4e5d87 --- /dev/null +++ b/ACS111150_ex1/ex1.ipynb @@ -0,0 +1,466 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "srrdv_JjhrLP" + }, + "outputs": [], + "source": [ + "#import\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.ensemble import IsolationForest\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report\n", + "import kagglehub\n", + "\n", + "from sklearn.metrics import silhouette_score\n", + "from sklearn.metrics import classification_report\n", + "#general set\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "#download\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "\n", + "#prepare\n", + "data['Class'] = data['Class'].astype(int)\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "w0B8EoH7BjaF" + }, + "outputs": [], + "source": [ + "#output\n", + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " print(f'\\n{model_name} Evaluation:')\n", + " print('=' * 30)\n", + " print(f' Accuracy : {accuracy:.8f}')\n", + " print(f' Precision 
Score: {precision:.8f}')\n", + " print(f' Recall Score : {recall:.8f}')\n", + " print(f' F1 Score : {f1:.8f}')\n", + " print('\\nClassification Report:')\n", + " print(classification_report(y_true, y_pred))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RFNW-rTlh05T", + "outputId": "07004d18-dbee-4d20-8464-577d01a5eec3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Random Forest Evaluation:\n", + "==============================\n", + " Accuracy : 0.99963719\n", + " Precision Score: 0.94117647\n", + " Recall Score : 0.82352941\n", + " F1 Score : 0.87843137\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.94 0.82 0.88 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.91 0.94 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "#basic\n", + "\n", + "X = data.drop(columns=['Class']).values\n", + "y = data['Class'].values\n", + "\n", + "#splite\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n", + "\n", + "#rf model\n", + "rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n", + "rf.fit(X_train, y_train)\n", + "\n", + "#output\n", + "y_pred_rf = rf.predict(X_test)\n", + "evaluation(y_test, y_pred_rf, model_name=\"Random Forest\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Und5nd7p6GX7", + "outputId": "fcac0b74-22ea-4b0b-8f5b-f821b573c54f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: xgboost in /usr/local/lib/python3.11/dist-packages (2.1.4)\n", + "Requirement already satisfied: numpy in 
/usr/local/lib/python3.11/dist-packages (from xgboost) (2.0.2)\n", + "Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.11/dist-packages (from xgboost) (2.21.5)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from xgboost) (1.15.3)\n" + ] + } + ], + "source": [ + "!pip install xgboost" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rRgM6dX713kB", + "outputId": "1c85834e-8ab1-4c0b-d5f1-b8373f18d10b" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "XGBoost Evaluation:\n", + "==============================\n", + " Accuracy : 0.99969570\n", + " Precision Score: 0.94354839\n", + " Recall Score : 0.86029412\n", + " F1 Score : 0.90000000\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.94 0.86 0.90 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.93 0.95 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "#XGBoost\n", + "from xgboost import XGBClassifier\n", + "\n", + "xgb = XGBClassifier(\n", + " n_estimators=600,\n", + " max_depth=8,\n", + " learning_rate=0.1,\n", + " scale_pos_weight=8,\n", + " random_state=RANDOM_SEED,\n", + " #use_label_encoder=False,\n", + " eval_metric='logloss'\n", + ")\n", + "xgb.fit(X_train, y_train.ravel())\n", + "y_pred_xgb = xgb.predict(X_test)\n", + "evaluation(y_test, y_pred_xgb, model_name=\"XGBoost\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8EPc27Reh4u7", + "outputId": "92123952-47f2-470f-af00-1f15e4f40579" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "KMeans (Unsupervised) Evaluation:\n", + "==============================\n", + " Accuracy : 
0.99872430\n", + " Precision Score: 0.78260870\n", + " Recall Score : 0.36486486\n", + " F1 Score : 0.49769585\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.78 0.36 0.50 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.68 0.75 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ], + "source": [ + "#Kmeans\n", + "X = np.asarray(data.drop(columns=['Class']))\n", + "y = np.asarray(data['Class'])\n", + "\n", + "#Split\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "\n", + "#ragular\n", + "scaler = StandardScaler()\n", + "x_train = scaler.fit_transform(x_train)\n", + "x_test = scaler.transform(x_test)\n", + "\n", + "#train KMeans\n", + "n_x_train = x_train[y_train == 0]\n", + "n_x_train = n_x_train[:1000]\n", + "\n", + "#find best k\n", + "scores = []\n", + "for k in range(2, 5):\n", + " kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "optimal_k = np.argmax(scores) + 2\n", + "\n", + "#train best k\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "y_pred_test = kmeans.predict(x_test)\n", + "\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " else:\n", + " labels[mask] = 0 # Default to normal class\n", + " return labels\n", + "\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n", + "\n", + "\n", + "evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n" + ] + }, + { + "cell_type": "code", + "source": 
[ + "!pip install --upgrade --force-reinstall --no-cache-dir jax jaxlib\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FLSMZ82PaQDh", + "outputId": "e801ab4e-9b6f-4c6e-e5ec-4a613733f45b", + "collapsed": true + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting jax\n", + " Downloading jax-0.6.1-py3-none-any.whl.metadata (13 kB)\n", + "Collecting jaxlib\n", + " Downloading jaxlib-0.6.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)\n", + "Collecting ml_dtypes>=0.5.0 (from jax)\n", + " Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)\n", + "Collecting numpy>=1.25 (from jax)\n", + " Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.0/62.0 kB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting opt_einsum (from jax)\n", + " Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)\n", + "Collecting scipy>=1.11.1 (from jax)\n", + " Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.0/62.0 kB\u001b[0m \u001b[31m141.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading jax-0.6.1-py3-none-any.whl (2.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m50.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading jaxlib-0.6.1-cp311-cp311-manylinux2014_x86_64.whl (89.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.1/89.1 MB\u001b[0m \u001b[31m138.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading 
ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.7/4.7 MB\u001b[0m \u001b[31m170.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.8/16.8 MB\u001b[0m \u001b[31m137.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m37.7/37.7 MB\u001b[0m \u001b[31m60.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading opt_einsum-3.4.0-py3-none-any.whl (71 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.9/71.9 kB\u001b[0m \u001b[31m126.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: opt_einsum, numpy, scipy, ml_dtypes, jaxlib, jax\n", + " Attempting uninstall: opt_einsum\n", + " Found existing installation: opt_einsum 3.4.0\n", + " Uninstalling opt_einsum-3.4.0:\n", + " Successfully uninstalled opt_einsum-3.4.0\n", + " Attempting uninstall: numpy\n", + " Found existing installation: numpy 2.2.6\n", + " Uninstalling numpy-2.2.6:\n", + " Successfully uninstalled numpy-2.2.6\n", + " Attempting uninstall: scipy\n", + " Found existing installation: scipy 1.15.3\n", + " Uninstalling scipy-1.15.3:\n", + " Successfully uninstalled scipy-1.15.3\n", + " Attempting uninstall: ml_dtypes\n", + " Found existing installation: ml_dtypes 0.5.1\n", + " Uninstalling ml_dtypes-0.5.1:\n", + " Successfully uninstalled ml_dtypes-0.5.1\n", + " Attempting uninstall: jaxlib\n", + " Found existing installation: jaxlib 0.6.1\n", + " Uninstalling jaxlib-0.6.1:\n", + " Successfully 
uninstalled jaxlib-0.6.1\n", + " Attempting uninstall: jax\n", + " Found existing installation: jax 0.6.1\n", + " Uninstalling jax-0.6.1:\n", + " Successfully uninstalled jax-0.6.1\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.18.0 requires ml-dtypes<0.5.0,>=0.4.0, but you have ml-dtypes 0.5.1 which is incompatible.\n", + "tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.\n", + "numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed jax-0.6.1 jaxlib-0.6.1 ml_dtypes-0.5.1 numpy-2.2.6 opt_einsum-3.4.0 scipy-1.15.3\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#MY_KMeans\n", + "\n", + "normal = x_train[y_train == 0][:800]\n", + "fraud = x_train[y_train == 1][:200]\n", + "n_x_train = np.vstack([normal, fraud])\n", + "\n", + "# find k\n", + "scores = []\n", + "for k in range(2, 5):\n", + " kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "optimal_k = np.argmax(scores) + 2\n", + "\n", + "#train with k\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "\n", + "\n", + "y_pred_test = kmeans.predict(x_test)\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " else:\n", + " labels[mask] = 0\n", + " return labels\n", + "\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n", + "\n", + "\n", + "evaluation(y_test, y_pred_aligned, 
model_name=\"MY_KMeans\")\n", + "\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Va-AkmnhUWO3", + "outputId": "b63f7573-ddd4-4194-fd2d-73b4133a7579" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "MY_KMeans Evaluation:\n", + "==============================\n", + " Accuracy : 0.99897007\n", + " Precision Score: 0.83333333\n", + " Recall Score : 0.50675676\n", + " F1 Score : 0.63025210\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.83 0.51 0.63 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.92 0.75 0.81 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyNxw6KMHh2yrVUGkJGepcz1", + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ACS111150_ex1/ex1.md b/ACS111150_ex1/ex1.md new file mode 100644 index 0000000..ab4121d --- /dev/null +++ b/ACS111150_ex1/ex1.md @@ -0,0 +1,107 @@ +# 信用卡詐欺檢測報告 + +## 1. 
監督式學習:XGBoost + +**參數設定** +- `n_estimators=600` +- `max_depth=8` +- `learning_rate=0.1` +- `scale_pos_weight=8` +- `random_state=42` +- `eval_metric='logloss'` + +> 調高 `n_estimators` 與 `max_depth`,並加大 `scale_pos_weight` 以處理嚴重不平衡問題。 + +### 範例結果 + +``` +Random Forest Evaluation: +-------------------------------------------- +Accuracy : 0.99963719 +Precision Score: 0.94117647 +Recall Score : 0.82352941 +F1 Score : 0.87843137 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85307 + 1 0.94 0.82 0.88 136 + + accuracy 1.00 85443 + macro avg 0.97 0.91 0.94 85443 +weighted avg 1.00 1.00 1.00 85443 +-------------------------------------------- +``` + +### 實作結果 + +``` +XGBoost Evaluation: +-------------------------------------------- +Accuracy : 0.99969570 +Precision Score: 0.94354839 +Recall Score : 0.86029412 +F1 Score : 0.90000000 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85307 + 1 0.94 0.86 0.90 136 + + accuracy 1.00 85443 + macro avg 0.97 0.93 0.95 85443 +weighted avg 1.00 1.00 1.00 85443 +-------------------------------------------- +``` + +--- + +## 2. 
非監督式學習:KMeans + +> **改動**:原本只用正常樣本訓練,改成取 800 筆正常樣本與 200 筆詐騙樣本共同訓練。 + +### 範例結果 + +``` +KMeans (Unsupervised) Evaluation: +-------------------------------------------- +Accuracy : 0.99872430 +Precision Score: 0.78260870 +Recall Score : 0.36486486 +F1 Score : 0.49769585 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.78 0.36 0.50 148 + + accuracy 1.00 85443 + macro avg 0.89 0.68 0.75 85443 +weighted avg 1.00 1.00 1.00 85443 +-------------------------------------------- +``` + +### 實作結果 + +``` +MY KMeans Evaluation: +-------------------------------------------- +Accuracy : 0.99897007 +Precision Score: 0.83333333 +Recall Score : 0.50675676 +F1 Score : 0.63025210 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.83 0.51 0.63 148 + + accuracy 1.00 85443 + macro avg 0.92 0.75 0.81 85443 +weighted avg 1.00 1.00 1.00 85443 +-------------------------------------------- +``` diff --git a/ACS111150_ex2/ex2.ipynb b/ACS111150_ex2/ex2.ipynb new file mode 100644 index 0000000..2b2b676 --- /dev/null +++ b/ACS111150_ex2/ex2.ipynb @@ -0,0 +1,280 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyM2eyDjzA73AcHvxuaEcDvZ", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oMCrMKZxURfg", + "outputId": "27147b6f-96f3-4ead-f122-a18e533af342" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: xgboost in /usr/local/lib/python3.11/dist-packages (2.1.4)\n", + "Collecting xgboost\n", + " 
Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)\n", + "Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.11/dist-packages (0.13.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from xgboost) (2.0.2)\n", + "Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.11/dist-packages (from xgboost) (2.21.5)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from xgboost) (1.15.3)\n", + "Requirement already satisfied: scikit-learn<2,>=1.3.2 in /usr/local/lib/python3.11/dist-packages (from imbalanced-learn) (1.6.1)\n", + "Requirement already satisfied: sklearn-compat<1,>=0.1 in /usr/local/lib/python3.11/dist-packages (from imbalanced-learn) (0.1.3)\n", + "Requirement already satisfied: joblib<2,>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from imbalanced-learn) (1.5.0)\n", + "Requirement already satisfied: threadpoolctl<4,>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from imbalanced-learn) (3.6.0)\n", + "Downloading xgboost-3.0.2-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m253.9/253.9 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: xgboost\n", + " Attempting uninstall: xgboost\n", + " Found existing installation: xgboost 2.1.4\n", + " Uninstalling xgboost-2.1.4:\n", + " Successfully uninstalled xgboost-2.1.4\n", + "Successfully installed xgboost-3.0.2\n" + ] + } + ], + "source": [ + "\n", + "!pip install --upgrade xgboost imbalanced-learn\n" + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import IsolationForest\n", + "from xgboost import 
XGBClassifier\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report\n", + "import kagglehub" + ], + "metadata": { + "id": "m67jikTkUVDu" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " acc = accuracy_score(y_true, y_pred)\n", + " prec = precision_score(y_true, y_pred, zero_division=0)\n", + " rec = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " print(f\"\\n{model_name} Evaluation:\")\n", + " print(\"=\" * 40)\n", + " print(f\" Accuracy : {acc:.8f}\")\n", + " print(f\" Precision Score: {prec:.8f}\")\n", + " print(f\" Recall Score : {rec:.8f}\")\n", + " print(f\" F1 Score : {f1:.8f}\\n\")\n", + " print(\"Classification Report:\")\n", + " print(classification_report(y_true, y_pred))" + ], + "metadata": { + "id": "6mY9Dv0nUdUP" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def load_data():\n", + " path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + " data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + " data['Class'] = data['Class'].astype(int)\n", + " data.drop(['Time'], axis=1, inplace=True)\n", + " data['Amount'] = StandardScaler().fit_transform(\n", + " data['Amount'].values.reshape(-1, 1))\n", + " return data\n", + "\n", + "# Load and split\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "data = load_data()\n", + "X = data.drop(columns=['Class']).values\n", + "y = data['Class'].values\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y)" + ], + "metadata": { + "id": "QggrMsSNUdbf" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "iso = IsolationForest(contamination=0.002, random_state=RANDOM_SEED)\n", + "iso.fit(x_train[y_train == 0])\n", + "score_train = 
iso.decision_function(x_train)\n", + "score_test = iso.decision_function(x_test)\n", + "\n", + "x_train_feat = np.hstack([x_train, score_train.reshape(-1,1)])\n", + "x_test_feat = np.hstack([x_test, score_test.reshape(-1,1)])" + ], + "metadata": { + "id": "rmGHK9f8UdeX" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "param_grid = {\n", + " 'n_estimators': [200, 400],\n", + " 'max_depth': [5, 8],\n", + " 'learning_rate': [0.05, 0.1],\n", + " 'scale_pos_weight': [10, 20]\n", + "}\n", + "\n", + "\n", + "xgb = XGBClassifier(\n", + " tree_method='hist', # <-- changed here: use CPU 'hist' instead of 'gpu_hist'\n", + " eval_metric='logloss',\n", + " scale_pos_weight=0.172,\n", + " random_state=RANDOM_SEED\n", + ")\n", + "grid = GridSearchCV(\n", + " xgb, param_grid, scoring='f1', cv=3, n_jobs=-1, verbose=1\n", + ")\n", + "\n", + "grid.fit(x_train_feat, y_train)\n", + "best_model = grid.best_estimator_\n", + "print(\"Best parameters:\", grid.best_params_)\n", + "\n", + "# Default threshold (0.5)\n", + "y_pred_default = best_model.predict(x_test_feat)\n", + "evaluation(y_test, y_pred_default, model_name=\"DefaultThreshold\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "GS6juA2iUmMH", + "outputId": "052864db-5851-416e-ce5e-58a503729210" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Fitting 3 folds for each of 16 candidates, totalling 48 fits\n", + "Best parameters: {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 400, 'scale_pos_weight': 10}\n", + "\n", + "DefaultThreshold Evaluation:\n", + "========================================\n", + " Accuracy : 0.99952015\n", + " Precision Score: 0.92125984\n", + " Recall Score : 0.79054054\n", + " F1 Score : 0.85090909\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.92 0.79 
0.85 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.96 0.90 0.93 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "y_proba = best_model.predict_proba(x_test_feat)[:,1]\n", + "\n", + "# 門檻掃描\n", + "best_f1, best_thresh = 0, 0.5\n", + "for t in np.arange(0.1, 0.9, 0.01):\n", + " preds = (y_proba > t).astype(int)\n", + " f1 = f1_score(y_test, preds)\n", + " if f1 > best_f1:\n", + " best_f1, best_thresh = f1, t\n", + "\n", + "print(f\"Best F1: {best_f1:.5f} at threshold: {best_thresh:.2f}\")\n", + "\n", + "# 使用最佳門檻\n", + "y_pred_tuned = (y_proba > best_thresh).astype(int)\n", + "evaluation(y_test, y_pred_tuned, model_name=f\"Threshold {best_thresh:.2f}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FxsNZgP_UmPP", + "outputId": "dc613e99-4cc1-472c-f21b-7985c15e3bc1" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Best F1: 0.85507 at threshold: 0.42\n", + "\n", + "Threshold 0.42 Evaluation:\n", + "========================================\n", + " Accuracy : 0.99953185\n", + " Precision Score: 0.92187500\n", + " Recall Score : 0.79729730\n", + " F1 Score : 0.85507246\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.92 0.80 0.86 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.96 0.90 0.93 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + } + ] +} diff --git a/ACS111150_ex2/ex2.md b/ACS111150_ex2/ex2.md new file mode 100644 index 0000000..fbf9c03 --- /dev/null +++ b/ACS111150_ex2/ex2.md @@ -0,0 +1,97 @@ +# 挑戰二:融合異常檢測與監督式學習於信用卡詐欺偵測 + +--- + +## Isolation Forest + +1. 建立 Isolation Forest,僅以正常樣本訓練: + ```python + iso = IsolationForest(contamination=0.002, random_state=42) + iso.fit(x_train[y_train==0]) + ``` +2. 
計算 anomaly score(連續值): + ```python + anomaly_train = iso.decision_function(x_train).reshape(-1,1) + anomaly_test = iso.decision_function(x_test).reshape(-1,1) + ``` +3. 合併至特徵矩陣: + ```python + x_train_feat = np.hstack([x_train, anomaly_train]) + x_test_feat = np.hstack([x_test, anomaly_test]) + ``` + +--- + +## XGBoost + +- **模型**:`XGBClassifier(tree_method='hist', eval_metric='logloss', random_state=42)` +- **參數網格**: + ```yaml + n_estimators: [200, 400] + max_depth: [5, 8] + learning_rate:[0.05, 0.1] + scale_pos_weight: [10, 20] + ``` +- **交叉驗證**:3 折 (cv=3),以 F1 score 作為搜尋目標 + +```python +grid = GridSearchCV( + estimator=xgb, + param_grid=param_grid, + scoring='f1', + cv=3, + n_jobs=-1, + verbose=1 +) +grid.fit(x_train_feat, y_train) +best_model = grid.best_estimator_ +print("Best parameters:", grid.best_params_) +``` + +### 4.1 預設閾值 (0.5) 評估 + +```python +y_pred_default = best_model.predict(x_test_feat) +evaluation(y_test, y_pred_default, model_name="DefaultThreshold") +``` + +``` +DefaultThreshold Evaluation: +======================================== +Accuracy : 0.99952015 +Precision Score: 0.92125984 +Recall Score : 0.79054054 +F1 Score : 0.85090909 +... +``` + +--- + +## 5. 門檻調整 (Threshold Tuning) + +- 掃描閾值範圍 [0.1, 0.9),每 0.01 為一步,選出最佳 F1 分數對應之閾值: + +```python +y_proba = best_model.predict_proba(x_test_feat)[:,1] +best_f1, best_thresh = 0, 0.5 +for t in np.arange(0.1, 0.9, 0.01): + preds = (y_proba > t).astype(int) + f1 = f1_score(y_test, preds) + if f1 > best_f1: + best_f1, best_thresh = f1, t +print(f"Best F1={best_f1:.5f} at threshold={best_thresh:.2f}") +y_pred_tuned = (y_proba > best_thresh).astype(int) +evaluation(y_test, y_pred_tuned, model_name=f"Threshold {best_thresh:.2f}") +``` + +``` +Threshold 0.42 Evaluation: +======================================== +Accuracy : 0.99953185 +Precision Score: 0.92187500 +Recall Score : 0.79729730 +F1 Score : 0.85507246 +... +``` + +---