0lai0 · Radcliffe0731 · May 22, 2025 · May 24, 2025 · May 26, 2025 · Jun 13, 2025
diff --git a/acs111125-ex1/ex1.ipynb b/acs111125-ex1/ex1.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0a8b1f63",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 66.0M/66.0M [00:25<00:00, 2.76MB/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Extracting files...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ 資料載入完成，總筆數： 284807\n",
+      "\n",
+      "📊 Random Forest 評估結果\n",
+      "========================================\n",
+      "Accuracy : 0.9994499256814484\n",
+      "Precision: 0.9719626168224299\n",
+      "Recall   : 0.7027027027027027\n",
+      "F1 Score : 0.8156862745098039\n",
+      "\n",
+      "Classification Report:\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "           0       1.00      1.00      1.00     85295\n",
+      "           1       0.97      0.70      0.82       148\n",
+      "\n",
+      "    accuracy                           1.00     85443\n",
+      "   macro avg       0.99      0.85      0.91     85443\n",
+      "weighted avg       1.00      1.00      1.00     85443\n",
+      "\n",
+      "🌀 KMeans 最佳群數 k = 2\n",
+      "\n",
+      "📊 KMeans (Unsupervised) 評估結果\n",
+      "========================================\n",
+      "Accuracy : 0.9982678510820079\n",
+      "Precision: 0.0\n",
+      "Recall   : 0.0\n",
+      "F1 Score : 0.0\n",
+      "\n",
+      "Classification Report:\n",
+      "               precision    recall  f1-score   support\n",
+      "\n",
+      "           0       1.00      1.00      1.00     85295\n",
+      "           1       0.00      0.00      0.00       148\n",
+      "\n",
+      "    accuracy                           1.00     85443\n",
+      "   macro avg       0.50      0.50      0.50     85443\n",
+      "weighted avg       1.00      1.00      1.00     85443\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ex1.ipynb - 挑戰一 練習作業框架\n",
+    "# Author: 周庭嫻 (ACS111125)\n",
+    "\n",
+    "# ====== 🔹 套件匯入區 ======\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.metrics import silhouette_score\n",
+    "\n",
+    "# ====== 🔹 資料載入與前處理 ======\n",
+    "# Prefer the kagglehub download; fall back to a local creditcard.csv when\n",
+    "# kagglehub is not installed or the download / read fails.\n",
+    "try:\n",
+    "    import kagglehub\n",
+    "    path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n",
+    "    data = pd.read_csv(f\"{path}/creditcard.csv\")\n",
+    "except Exception as load_error:\n",
+    "    # Catch Exception instead of using a bare except so KeyboardInterrupt and\n",
+    "    # SystemExit still stop the cell, and report why the fallback was taken.\n",
+    "    print(f\"kagglehub 載入失敗（{type(load_error).__name__}: {load_error}）\")\n",
+    "    print(\"使用本地資料路徑\")\n",
+    "    data = pd.read_csv(\"creditcard.csv\")  # <-- place the downloaded file here\n",
+    "\n",
+    "# Cast the label to int, discard the Time column, then standardise Amount.\n",
+    "data = data.astype({'Class': int}).drop(columns=['Time'])\n",
+    "amount_column = data['Amount'].to_numpy().reshape(-1, 1)\n",
+    "data['Amount'] = StandardScaler().fit_transform(amount_column)\n",
+    "\n",
+    "print(\"✅ 資料載入完成，總筆數：\", len(data))\n",
+    "\n",
+    "# ====== 🔹 評估函式 ======\n",
+    "def evaluation(y_true, y_pred, model_name=\"Model\"):\n",
+    "    \"\"\"Print accuracy, precision, recall, F1 and a classification report.\n",
+    "\n",
+    "    Args:\n",
+    "        y_true: Ground-truth class labels.\n",
+    "        y_pred: Predicted class labels for the same samples.\n",
+    "        model_name: Name shown in the printed header.\n",
+    "\n",
+    "    zero_division=0 keeps the reported value at 0 when a metric is\n",
+    "    undefined (for example precision when nothing is predicted positive)\n",
+    "    without scikit-learn emitting UndefinedMetricWarning.\n",
+    "    \"\"\"\n",
+    "    print(f\"\\n📊 {model_name} 評估結果\")\n",
+    "    print(\"=\"*40)\n",
+    "    print(\"Accuracy :\", accuracy_score(y_true, y_pred))\n",
+    "    print(\"Precision:\", precision_score(y_true, y_pred, zero_division=0))\n",
+    "    print(\"Recall   :\", recall_score(y_true, y_pred, zero_division=0))\n",
+    "    print(\"F1 Score :\", f1_score(y_true, y_pred, zero_division=0))\n",
+    "    print(\"\\nClassification Report:\\n\", classification_report(y_true, y_pred, zero_division=0))\n",
+    "\n",
+    "# ====== 🔹 Random Forest：有監督式學習 ======\n",
+    "# Feature matrix (every column except Class) and label vector as NumPy arrays.\n",
+    "X = data.drop(columns=['Class']).to_numpy()\n",
+    "Y = data['Class'].to_numpy()\n",
+    "\n",
+    "# Stratified 70/30 split so both parts keep the same class ratio.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X,\n",
+    "    Y,\n",
+    "    test_size=0.3,\n",
+    "    random_state=42,\n",
+    "    stratify=Y,\n",
+    ")\n",
+    "\n",
+    "# class_weight='balanced' is used because fraud (Class 1) is rare in this data.\n",
+    "rf_model = RandomForestClassifier(\n",
+    "    n_estimators=100,\n",
+    "    random_state=42,\n",
+    "    class_weight='balanced',\n",
+    ").fit(X_train, y_train)\n",
+    "y_pred_rf = rf_model.predict(X_test)\n",
+    "\n",
+    "evaluation(y_test, y_pred_rf, model_name=\"Random Forest\")\n",
+    "\n",
+    "# ====== 🔹 KMeans：非監督式學習 ======\n",
+    "# Fit the scaler on the training split only; fitting it on the full X would\n",
+    "# let statistics of the held-out test rows leak into preprocessing.\n",
+    "scaler = StandardScaler()\n",
+    "X_train_scaled = scaler.fit_transform(X_train)\n",
+    "X_test_scaled = scaler.transform(X_test)\n",
+    "X_scaled = scaler.transform(X)  # full data, scaled with training-split statistics\n",
+    "\n",
+    "# Cluster only non-fraud rows, drawn from the training split so that no\n",
+    "# test row is used to fit KMeans.\n",
+    "X_train_unsupervised = X_train_scaled[y_train == 0][:1000]  # first 1000 non-fraud training samples\n",
+    "\n",
+    "# Pick k by silhouette score.\n",
+    "candidate_ks = list(range(2, 5))\n",
+    "scores = []\n",
+    "for k in candidate_ks:\n",
+    "    kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "    kmeans.fit(X_train_unsupervised)\n",
+    "    score = silhouette_score(X_train_unsupervised, kmeans.labels_)\n",
+    "    scores.append(score)\n",
+    "\n",
+    "# Look k up in the candidate list instead of hard-coding the range offset.\n",
+    "optimal_k = candidate_ks[int(np.argmax(scores))]\n",
+    "print(\"🌀 KMeans 最佳群數 k =\", optimal_k)\n",
+    "\n",
+    "# Refit with the selected k and predict clusters for the test split.\n",
+    "kmeans = KMeans(n_clusters=optimal_k, random_state=42)\n",
+    "kmeans.fit(X_train_unsupervised)\n",
+    "y_pred_kmeans = kmeans.predict(X_test_scaled)\n",
+    "\n",
+    "# 對齊群集標籤\n",
+    "def align_labels(y_true, y_pred, n_clusters):\n",
+    "    \"\"\"Relabel each cluster with the majority true class of its members.\n",
+    "\n",
+    "    Args:\n",
+    "        y_true: Non-negative integer class labels (array-like).\n",
+    "        y_pred: Cluster ids for the same samples (array-like).\n",
+    "        n_clusters: Cluster ids 0 .. n_clusters - 1 are relabelled.\n",
+    "\n",
+    "    Returns:\n",
+    "        Array shaped like y_pred in which every sample carries the most\n",
+    "        frequent y_true value of its cluster. Samples whose cluster id is\n",
+    "        outside 0 .. n_clusters - 1 keep the label 0.\n",
+    "\n",
+    "    Note:\n",
+    "        The mapping is derived from y_true, so metrics computed on the\n",
+    "        result rely on ground-truth labels.\n",
+    "    \"\"\"\n",
+    "    # Coerce to arrays so list / Series inputs produce element-wise masks.\n",
+    "    y_true = np.asarray(y_true)\n",
+    "    y_pred = np.asarray(y_pred)\n",
+    "    labels = np.zeros_like(y_pred)\n",
+    "    for cluster_id in range(n_clusters):\n",
+    "        mask = (y_pred == cluster_id)\n",
+    "        # Empty clusters need no handling: labels is already 0 there.\n",
+    "        if mask.any():\n",
+    "            labels[mask] = np.bincount(y_true[mask]).argmax()\n",
+    "    return labels\n",
+    "\n",
+    "# Map cluster ids to class labels, then score them like the supervised model.\n",
+    "y_pred_aligned = align_labels(y_true=y_test, y_pred=y_pred_kmeans, n_clusters=optimal_k)\n",
+    "evaluation(y_true=y_test, y_pred=y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n",
+    "\n",
+    "# ====== 🔹 TODO：你可以在這裡進行改進 ======\n",
+    "# 例如：\n",
+    "# - 改用其他分類器（如 XGBoost、SVM）\n",
+    "# - 嘗試調整 Random Forest 參數\n",
+    "# - SMOTE 資料平衡處理\n",
+    "# - 比較更多非監督模型\n",
+    "# - 把結果寫入 ex1.md 檔案\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/acs111125-ex1/ex1.md b/acs111125-ex1/ex1.md
@@ -0,0 +1,116 @@
+
+# 💼 ex1.md - 挑戰一 練習作業報告
+> 📌 課程練習目標：使用監督與非監督式方法進行詐欺交易偵測
+
+---
+
+## 👤 基本資訊
+- 作者：[資三甲周庭嫻ACS111125]
+- 作業名稱：挑戰一 - 信用卡詐欺資料分析
+
+---
+
+## 🎯 目標說明
+
+本次練習目的為透過機器學習方法，對信用卡詐欺偵測資料集進行處理與模型建立，並嘗試比較 **監督式學習（Random Forest）** 以及 **非監督式學習（KMeans）** 的偵測效果。
+
+---
+
+## 📦 使用資料集
+
+- 資料來源：[Kaggle - Credit Card Fraud Detection](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)
+- 資料筆數：284,807 筆
+- 欄位數量：31 個欄位（30 個特徵 `Time`、`V1`–`V28`、`Amount`，加上標籤 `Class`；其中 `V1`–`V28` 經 PCA 處理）
+- 資料不平衡比例：
+  - 正常交易（Class=0）：99.8%
+  - 詐欺交易（Class=1）：0.2%
+
+---
+
+## 🧹 資料前處理
+
+1. 移除 `Time` 欄位
+2. 使用 `StandardScaler` 對 `Amount` 欄位進行標準化
+3. 分離特徵 X 與標籤 Y
+4. 使用 `train_test_split` 切分資料為 70% 訓練 / 30% 測試（保留類別比例）
+
+---
+
+## ✅ 模型一：Random Forest（有監督式學習）
+
+- 模型參數：
+  - `n_estimators=100`
+  - `class_weight='balanced'`
+- 訓練資料：70%
+- 測試資料：30%
+
+### 📊 評估結果
+
+```
+Accuracy : 0.9994
+Precision: 0.9720
+Recall   : 0.7027
+F1 Score : 0.8157
+```
+
+| 類別 | precision | recall | f1-score | support |
+|------|-----------|--------|----------|---------|
+| 0    | 1.00      | 1.00   | 1.00     | 85295   |
+| 1    | 0.97      | 0.70   | 0.82     | 148     |
+
+> ✅ 模型在極度不平衡資料下仍能達到高 precision 與合理 recall，透過 `class_weight='balanced'` 可提升少數類別偵測效果。
+
+---
+
+## ✅ 模型二：KMeans（非監督式學習）
+
+- 使用非詐欺樣本（Class = 0）前 1000 筆進行群聚學習
+- 探索最佳群數 k ∈ {2, 3, 4}
+- 使用 silhouette score 選出最佳群數
+- 最佳 k = **2**
+
+### 📊 評估結果
+
+```
+Accuracy : 0.9983
+Precision: 0.00
+Recall   : 0.00
+F1 Score : 0.00
+```
+
+| 指標類型     | 值   |
+|--------------|------|
+| macro avg    | 0.50 |
+| weighted avg | 1.00 |
+
+> ⚠️ 雖然整體 accuracy 看似很高，但實際未能有效辨識詐欺樣本，precision 與 recall 為 0，表示所有詐欺樣本皆被錯判。
+
+---
+
+## 📊 模型比較與總結
+
+| 模型        | Accuracy | Precision | Recall | F1 Score |
+|-------------|----------|-----------|--------|----------|
+| RandomForest| 0.9994   | 0.9720    | 0.7027 | 0.8157   |
+| KMeans      | 0.9983   | 0.0000    | 0.0000 | 0.0000   |
+
+> ✅ **Random Forest** 在處理不平衡資料中表現穩定，能有效抓出詐欺樣本。  
+> ⚠️ **KMeans** 雖然是無監督方法，accuracy 偏高主要是因為資料極度不平衡，實際上未成功辨識任何詐欺交易。
+
+---
+
+## 🔁 建議改進方向（TODO）
+
+- 嘗試 **XGBoost**，進一步提升 Recall 與 F1 分數
+- 使用 **SMOTE** 進行少數類別過採樣
+- 嘗試調整分類門檻值（例如將 `predict_proba` 的判定門檻由預設的 0.5 調降至 0.3～0.5 之間，以提高 Recall）
+- 加入更多非監督模型：如 Isolation Forest、DBSCAN
+- 將模型評估結果可視化（ROC 曲線、PR 曲線、混淆矩陣）
+
+---
+
+## ✅ 結論
+
+本次實驗展示監督與非監督學習方法在信用卡詐欺偵測中的應用，Random Forest 能有效從極度不平衡的資料中辨識詐欺行為，而 KMeans 雖為無監督方法，在未經調整的情況下效果有限。透過後續進階技術（如 SMOTE 或 Boosting），可望進一步提升偵測表現。
+
+---
diff --git a/acs111125-ex2/ex2.ipynb b/acs111125-ex2/ex2.ipynb