Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions acs111125-ex1/ex1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "0a8b1f63",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 66.0M/66.0M [00:25<00:00, 2.76MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting files...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ 資料載入完成,總筆數: 284807\n",
"\n",
"📊 Random Forest 評估結果\n",
"========================================\n",
"Accuracy : 0.9994499256814484\n",
"Precision: 0.9719626168224299\n",
"Recall : 0.7027027027027027\n",
"F1 Score : 0.8156862745098039\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 85295\n",
" 1 0.97 0.70 0.82 148\n",
"\n",
" accuracy 1.00 85443\n",
" macro avg 0.99 0.85 0.91 85443\n",
"weighted avg 1.00 1.00 1.00 85443\n",
"\n",
"🌀 KMeans 最佳群數 k = 2\n",
"\n",
"📊 KMeans (Unsupervised) 評估結果\n",
"========================================\n",
"Accuracy : 0.9982678510820079\n",
"Precision: 0.0\n",
"Recall : 0.0\n",
"F1 Score : 0.0\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 85295\n",
" 1 0.00 0.00 0.00 148\n",
"\n",
" accuracy 1.00 85443\n",
" macro avg 0.50 0.50 0.50 85443\n",
"weighted avg 1.00 1.00 1.00 85443\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"c:\\Users\\yianm\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
}
],
"source": [
"# ex1.ipynb - 挑戰一 練習作業框架\n",
"# Author: [請寫上你的名字或學號]\n",
"\n",
"# ====== 🔹 套件匯入區 ======\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"\n",
"# ====== 🔹 資料載入與前處理 ======\n",
"# 如果 kagglehub 有裝就用這個;如果你下載 csv,改成讀本地檔\n",
"try:\n",
" import kagglehub\n",
" path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n",
" data = pd.read_csv(f\"{path}/creditcard.csv\")\n",
"except:\n",
" print(\"使用本地資料路徑\")\n",
" data = pd.read_csv(\"creditcard.csv\") # <-- 下載檔案後請放這裡\n",
"\n",
"data['Class'] = data['Class'].astype(int)\n",
"data = data.drop(['Time'], axis=1)\n",
"data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n",
"\n",
"print(\"✅ 資料載入完成,總筆數:\", len(data))\n",
"\n",
"# ====== 🔹 評估函式 ======\n",
"def evaluation(y_true, y_pred, model_name=\"Model\"):\n",
" print(f\"\\n📊 {model_name} 評估結果\")\n",
" print(\"=\"*40)\n",
" print(\"Accuracy :\", accuracy_score(y_true, y_pred))\n",
" print(\"Precision:\", precision_score(y_true, y_pred))\n",
" print(\"Recall :\", recall_score(y_true, y_pred))\n",
" print(\"F1 Score :\", f1_score(y_true, y_pred))\n",
" print(\"\\nClassification Report:\\n\", classification_report(y_true, y_pred))\n",
"\n",
"# ====== 🔹 Random Forest:有監督式學習 ======\n",
"X = np.asarray(data.drop(columns=['Class']))\n",
"Y = np.asarray(data['Class'])\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, Y, test_size=0.3, random_state=42, stratify=Y\n",
")\n",
"\n",
"rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')\n",
"rf_model.fit(X_train, y_train)\n",
"y_pred_rf = rf_model.predict(X_test)\n",
"\n",
"evaluation(y_test, y_pred_rf, \"Random Forest\")\n",
"\n",
"# ====== 🔹 KMeans:非監督式學習 ======\n",
"scaler = StandardScaler()\n",
"X_scaled = scaler.fit_transform(X)\n",
"\n",
"X_train_unsupervised = X_scaled[Y == 0][:1000] # 取前 1000 筆非詐欺樣本\n",
"\n",
"# 選擇最佳 k 值\n",
"scores = []\n",
"for k in range(2, 5):\n",
" kmeans = KMeans(n_clusters=k, random_state=42)\n",
" kmeans.fit(X_train_unsupervised)\n",
" score = silhouette_score(X_train_unsupervised, kmeans.labels_)\n",
" scores.append(score)\n",
"\n",
"optimal_k = np.argmax(scores) + 2\n",
"print(\"🌀 KMeans 最佳群數 k =\", optimal_k)\n",
"\n",
"# 用最佳 k 訓練模型\n",
"kmeans = KMeans(n_clusters=optimal_k, random_state=42)\n",
"kmeans.fit(X_train_unsupervised)\n",
"X_test_scaled = scaler.transform(X_test)\n",
"y_pred_kmeans = kmeans.predict(X_test_scaled)\n",
"\n",
"# 對齊群集標籤\n",
"def align_labels(y_true, y_pred, n_clusters):\n",
" labels = np.zeros_like(y_pred)\n",
" for i in range(n_clusters):\n",
" mask = (y_pred == i)\n",
" if np.sum(mask) > 0:\n",
" labels[mask] = np.bincount(y_true[mask]).argmax()\n",
" else:\n",
" labels[mask] = 0\n",
" return labels\n",
"\n",
"y_pred_aligned = align_labels(y_test, y_pred_kmeans, optimal_k)\n",
"evaluation(y_test, y_pred_aligned, \"KMeans (Unsupervised)\")\n",
"\n",
"# ====== 🔹 TODO:你可以在這裡進行改進 ======\n",
"# 例如:\n",
"# - 改用其他分類器(如 XGBoost、SVM)\n",
"# - 嘗試調整 Random Forest 參數\n",
"# - SMOTE 資料平衡處理\n",
"# - 比較更多非監督模型\n",
"# - 把結果寫入 ex1.md 檔案\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
116 changes: 116 additions & 0 deletions acs111125-ex1/ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@

# 💼 ex1.md - 挑戰一 練習作業報告
> 📌 課程練習目標:使用監督與非監督式方法進行詐欺交易偵測

---

## 👤 基本資訊
- 作者:[資三甲周庭嫻ACS111125]
- 作業名稱:挑戰一 - 信用卡詐欺資料分析

---

## 🎯 目標說明

本次練習目的為透過機器學習方法,對信用卡詐欺偵測資料集進行處理與模型建立,並嘗試比較 **監督式學習(Random Forest)** 以及 **非監督式學習(KMeans)** 的偵測效果。

---

## 📦 使用資料集

- 資料來源:[Kaggle - Credit Card Fraud Detection](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)
- 資料筆數:284,807 筆
- 欄位數量:30 個特徵欄位(`Time`、`Amount`,以及經 PCA 轉換的 `V1`–`V28`),另有 1 個標籤欄位 `Class`
- 資料不平衡比例:
- 正常交易(Class=0):99.8%
- 詐欺交易(Class=1):0.2%

---

## 🧹 資料前處理

1. 移除 `Time` 欄位
2. 使用 `StandardScaler` 對 `Amount` 欄位進行標準化
3. 分離特徵 X 與標籤 Y
4. 使用 `train_test_split` 切分資料為 70% 訓練 / 30% 測試(保留類別比例)

---

## ✅ 模型一:Random Forest(有監督式學習)

- 模型參數:
- `n_estimators=100`
- `class_weight='balanced'`
- 訓練資料:70%
- 測試資料:30%

### 📊 評估結果

```
Accuracy : 0.9994
Precision: 0.9720
Recall : 0.7027
F1 Score : 0.8157
```

| 類別 | precision | recall | f1-score | support |
|------|-----------|--------|----------|---------|
| 0 | 1.00 | 1.00 | 1.00 | 85295 |
| 1 | 0.97 | 0.70 | 0.82 | 148 |

> ✅ 模型在極度不平衡資料下仍能達到高 precision 與合理 recall,透過 `class_weight='balanced'` 可提升少數類別偵測效果。

---

## ✅ 模型二:KMeans(非監督式學習)

- 使用非詐欺樣本(Class = 0)前 1000 筆進行群聚學習
- 探索最佳群數 k ∈ {2, 3, 4}
- 使用 silhouette score 選出最佳群數
- 最佳 k = **2**

### 📊 評估結果

```
Accuracy : 0.9983
Precision: 0.00
Recall : 0.00
F1 Score : 0.00
```

| 指標類型 | 值 |
|--------------|------|
| macro avg | 0.50 |
| weighted avg | 1.00 |

> ⚠️ 雖然整體 accuracy 看似很高,但模型實際上沒有把任何一筆測試樣本預測為詐欺(Class=1),因此 precision、recall 與 F1 皆為 0,測試集中的 148 筆詐欺樣本全數被判為正常交易。原因在於群集只用正常樣本訓練,且每個群集以「多數類別」對齊標籤,在極度不平衡的資料下所有群集都被對應到 Class=0。

---

## 📊 模型比較與總結

| 模型 | Accuracy | Precision | Recall | F1 Score |
|-------------|----------|-----------|--------|----------|
| RandomForest| 0.9994 | 0.9720 | 0.7027 | 0.8157 |
| KMeans | 0.9983 | 0.0000 | 0.0000 | 0.0000 |

> ✅ **Random Forest** 在處理不平衡資料中表現穩定,能有效抓出詐欺樣本。
> ⚠️ **KMeans** 的 accuracy 偏高只是因為資料極度不平衡:將所有測試樣本都預測為正常交易即可得到 85295/85443 ≈ 0.9983,實際上未成功辨識任何詐欺交易。

---

## 🔁 建議改進方向(TODO)

- 嘗試 **XGBoost**,進一步提升 Recall 與 F1 分數
- 使用 **SMOTE** 進行少數類別過採樣
- 嘗試調整分類門檻值(例如以 `predict_proba` 取得詐欺機率,將判定門檻由預設的 0.5 調降至 0.3~0.5 之間,以提高 Recall)
- 加入更多非監督模型:如 Isolation Forest、DBSCAN
- 將模型評估結果可視化(ROC 曲線、PR 曲線、混淆矩陣)

---

## ✅ 結論

本次實驗展示監督與非監督學習方法在信用卡詐欺偵測中的應用,Random Forest 能有效從極度不平衡的資料中辨識詐欺行為,而 KMeans 雖為無監督方法,在未經調整的情況下效果有限。透過後續進階技術(如 SMOTE 或 Boosting),可望進一步提升偵測表現。

---
332 changes: 332 additions & 0 deletions acs111125-ex2/ex2.ipynb

Large diffs are not rendered by default.

Loading