0lai0 · MH-huang000 · Jun 25, 2025
diff --git a/ex1.ipynb b/ex1.ipynb
@@ -0,0 +1,161 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8137f516",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# ex1 - 信用卡詐欺偵測實驗\n",
+    "\n",
+    "本 Notebook 包含：\n",
+    "- 監督式學習模型：Random Forest（含優化版本）\n",
+    "- 非監督式學習模型：KMeans（聚類 + 標籤對齊）\n",
+    "- 評估指標：Precision、Recall、F1-score、Classification Report\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "99131f52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.metrics import (\n",
+    "    classification_report, accuracy_score, precision_score,\n",
+    "    recall_score, f1_score, silhouette_score\n",
+    ")\n",
+    "import kagglehub\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "293c016d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def evaluation(y_true, y_pred, model_name=\"Model\"):\n",
+    "    accuracy = accuracy_score(y_true, y_pred)\n",
+    "    precision = precision_score(y_true, y_pred)\n",
+    "    recall = recall_score(y_true, y_pred)\n",
+    "    f1 = f1_score(y_true, y_pred)\n",
+    "\n",
+    "    print(f'\\n{model_name} Evaluation:')\n",
+    "    print('===' * 15)\n",
+    "    print('         Accuracy:', accuracy)\n",
+    "    print('  Precision Score:', precision)\n",
+    "    print('     Recall Score:', recall)\n",
+    "    print('         F1 Score:', f1)\n",
+    "    print(\"\\nClassification Report:\")\n",
+    "    print(classification_report(y_true, y_pred))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27cb047d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "RANDOM_SEED = 42\n",
+    "TEST_SIZE = 0.3\n",
+    "\n",
+    "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n",
+    "data = pd.read_csv(f\"{path}/creditcard.csv\")\n",
+    "data['Class'] = data['Class'].astype(int)\n",
+    "\n",
+    "data = data.drop(['Time'], axis=1)\n",
+    "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60301bc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "X = data.drop(columns=['Class']).values\n",
+    "Y = data['Class'].values\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)\n",
+    "\n",
+    "rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n",
+    "rf_model.fit(X_train, y_train)\n",
+    "y_pred = rf_model.predict(X_test)\n",
+    "evaluation(y_test, y_pred, model_name=\"Random Forest (Original)\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e13e9317",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "rf_model = RandomForestClassifier(\n",
+    "    n_estimators=200, class_weight='balanced', random_state=RANDOM_SEED)\n",
+    "rf_model.fit(X_train, y_train.ravel())\n",
+    "y_pred = rf_model.predict(X_test)\n",
+    "evaluation(y_test, y_pred, model_name=\"Random Forest (Balanced)\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54ceb23e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "x_train, x_test, y_train_k, y_test_k = train_test_split(\n",
+    "    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)\n",
+    "\n",
+    "scaler = StandardScaler()\n",
+    "x_train = scaler.fit_transform(x_train)\n",
+    "x_test = scaler.transform(x_test)\n",
+    "\n",
+    "n_x_train = x_train[y_train_k == 0][:1000]\n",
+    "scores = []\n",
+    "for k in range(2, 5):\n",
+    "    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n",
+    "    kmeans.fit(n_x_train)\n",
+    "    scores.append(silhouette_score(n_x_train, kmeans.labels_))\n",
+    "\n",
+    "optimal_k = np.argmax(scores) + 2\n",
+    "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n",
+    "kmeans.fit(n_x_train)\n",
+    "y_pred_test = kmeans.predict(x_test)\n",
+    "\n",
+    "def align_labels(y_true, y_pred, n_clusters):\n",
+    "    labels = np.zeros_like(y_pred)\n",
+    "    for i in range(n_clusters):\n",
+    "        mask = (y_pred == i)\n",
+    "        if np.sum(mask) > 0:\n",
+    "            labels[mask] = np.bincount(y_true[mask]).argmax()\n",
+    "        else:\n",
+    "            labels[mask] = 0\n",
+    "    return labels\n",
+    "\n",
+    "y_pred_aligned = align_labels(y_test_k, y_pred_test, optimal_k)\n",
+    "evaluation(y_test_k, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n"
+   ]
+  }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/ex1.md b/ex1.md
@@ -0,0 +1,35 @@
+
+# ex1 - 信用卡詐欺偵測練習
+
+## 資料集
+- 來自 Kaggle: `mlg-ulb/creditcardfraud`
+- 含 284,807 筆交易資料，492 筆為詐欺（約 0.172%）
+
+## 任務目標
+1. 實作 **監督式學習**模型（Random Forest）
+2. 實作 **非監督式學習**模型（KMeans）
+3. 嘗試透過優化提升模型效能
+
+## 模型與結果
+
+### 🎯 監督式學習：Random Forest
+
+| 模型版本           | Precision | Recall | F1 Score |
+|--------------------|-----------|--------|----------|
+| 原始 RF            | 0.94      | 0.82   | 0.88     |
+| RF（Balanced）     | 0.97      | 0.77   | 0.86     |
+
+> 使用 `class_weight='balanced'` 可提升對少數類別的關注，有效提升精確率。
+
+### 🔍 非監督式學習：KMeans
+
+| Precision | Recall | F1 Score |
+|-----------|--------|----------|
+| 0.78      | 0.36   | 0.50     |
+
+> 表現雖不如 RF，但在無標籤情況下仍有不錯的 precision，可作為輔助工具。
+
+## 結論
+- Random Forest 經適當調參後能有效偵測詐欺交易。
+- KMeans 可作為無監督的異常預警機制。
+- 建議未來結合兩種方法（如 IsolationForest + RF）進行模型融合，可能進一步提升 recall。
diff --git a/ex1.py b/ex1.py
@@ -0,0 +1,106 @@
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
+import kagglehub
+
+
+# define evaluation function
+def evaluation(y_true, y_pred, model_name="Model"):
+    accuracy = accuracy_score(y_true, y_pred)
+    precision = precision_score(y_true, y_pred)
+    recall = recall_score(y_true, y_pred)
+    f1 = f1_score(y_true, y_pred)
+
+    print(f'\n{model_name} Evaluation:')
+    print('===' * 15)
+    print('         Accuracy:', accuracy)
+    print('  Precision Score:', precision)
+    print('     Recall Score:', recall)
+    print('         F1 Score:', f1)
+    print("\nClassification Report:")
+    print(classification_report(y_true, y_pred))
+
+# general setting. do not change TEST_SIZE
+RANDOM_SEED = 42
+TEST_SIZE = 0.3
+
+# load dataset（from kagglehub）
+path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
+data = pd.read_csv(f"{path}/creditcard.csv")
+data['Class'] = data['Class'].astype(int)
+
+# prepare data
+data = data.drop(['Time'], axis=1)
+data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
+
+X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
+Y = np.asarray(data.iloc[:, data.columns == 'Class'])
+
+# split training set and data set
+X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)
+
+# build Random Forest model
+rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
+rf_model.fit(X_train, y_train)
+
+# predict and print result
+y_pred = rf_model.predict(X_test)
+print(classification_report(y_test, y_pred))
+
+rf_model = RandomForestClassifier(
+    n_estimators=200,
+    class_weight='balanced',
+    random_state=RANDOM_SEED
+)
+rf_model.fit(X_train, y_train.ravel())
+y_pred = rf_model.predict(X_test)
+evaluation(y_test.ravel(), y_pred, model_name="Random Forest (Balanced)")
+# KMeans
+
+# Extract features and labels
+X = np.asarray(data.drop(columns=['Class']))
+y = np.asarray(data['Class'])
+
+# Split the dataset into training and testing sets (with stratification)
+x_train, x_test, y_train, y_test = train_test_split(
+   X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
+)
+
+scaler = StandardScaler()
+x_train = scaler.fit_transform(x_train)
+x_test = scaler.transform(x_test)
+
+# Select a small sample of normal (non-fraud) data for unsupervised training
+n_x_train = x_train[y_train == 0]
+n_x_train = n_x_train[:1000]
+
+scores = []
+for k in range(2, 5):
+   kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
+   kmeans.fit(n_x_train)
+   score = silhouette_score(n_x_train, kmeans.labels_)
+   scores.append(score)
+
+optimal_k = np.argmax(scores) + 2
+kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
+kmeans.fit(n_x_train)
+y_pred_test = kmeans.predict(x_test)
+def align_labels(y_true, y_pred, n_clusters):
+   labels = np.zeros_like(y_pred)
+   for i in range(n_clusters):
+       mask = (y_pred == i)
+       if np.sum(mask) > 0:
+           labels[mask] = np.bincount(y_true[mask]).argmax()
+       else:
+           labels[mask] = 0  # Default to normal class
+   return labels
+
+y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)
+
+evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")
+