Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 161 additions & 0 deletions ex1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8137f516",
"metadata": {},
"source": [
"\n",
"# ex1 - 信用卡詐欺偵測實驗\n",
"\n",
"本 Notebook 包含:\n",
"- 監督式學習模型:Random Forest(含優化版本)\n",
"- 非監督式學習模型:KMeans(聚類 + 標籤對齊)\n",
"- 評估指標:Precision、Recall、F1-score、Classification Report\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99131f52",
"metadata": {},
"outputs": [],
"source": [
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import (\n",
" classification_report, accuracy_score, precision_score,\n",
" recall_score, f1_score, silhouette_score\n",
")\n",
"import kagglehub\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "293c016d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def evaluation(y_true, y_pred, model_name=\"Model\"):\n",
" accuracy = accuracy_score(y_true, y_pred)\n",
" precision = precision_score(y_true, y_pred)\n",
" recall = recall_score(y_true, y_pred)\n",
" f1 = f1_score(y_true, y_pred)\n",
"\n",
" print(f'\\n{model_name} Evaluation:')\n",
" print('===' * 15)\n",
" print(' Accuracy:', accuracy)\n",
" print(' Precision Score:', precision)\n",
" print(' Recall Score:', recall)\n",
" print(' F1 Score:', f1)\n",
" print(\"\\nClassification Report:\")\n",
" print(classification_report(y_true, y_pred))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27cb047d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n",
"data = pd.read_csv(f\"{path}/creditcard.csv\")\n",
"data['Class'] = data['Class'].astype(int)\n",
"\n",
"data = data.drop(['Time'], axis=1)\n",
"data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60301bc6",
"metadata": {},
"outputs": [],
"source": [
"\n",
"X = data.drop(columns=['Class']).values\n",
"Y = data['Class'].values\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)\n",
"\n",
"rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n",
"rf_model.fit(X_train, y_train)\n",
"y_pred = rf_model.predict(X_test)\n",
"evaluation(y_test, y_pred, model_name=\"Random Forest (Original)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e13e9317",
"metadata": {},
"outputs": [],
"source": [
"\n",
"rf_model = RandomForestClassifier(\n",
" n_estimators=200, class_weight='balanced', random_state=RANDOM_SEED)\n",
"rf_model.fit(X_train, y_train.ravel())\n",
"y_pred = rf_model.predict(X_test)\n",
"evaluation(y_test, y_pred, model_name=\"Random Forest (Balanced)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54ceb23e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"x_train, x_test, y_train_k, y_test_k = train_test_split(\n",
" X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y)\n",
"\n",
"scaler = StandardScaler()\n",
"x_train = scaler.fit_transform(x_train)\n",
"x_test = scaler.transform(x_test)\n",
"\n",
"n_x_train = x_train[y_train_k == 0][:1000]\n",
"scores = []\n",
"for k in range(2, 5):\n",
" kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n",
" kmeans.fit(n_x_train)\n",
" scores.append(silhouette_score(n_x_train, kmeans.labels_))\n",
"\n",
"optimal_k = np.argmax(scores) + 2\n",
"kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n",
"kmeans.fit(n_x_train)\n",
"y_pred_test = kmeans.predict(x_test)\n",
"\n",
"def align_labels(y_true, y_pred, n_clusters):\n",
" labels = np.zeros_like(y_pred)\n",
" for i in range(n_clusters):\n",
" mask = (y_pred == i)\n",
" if np.sum(mask) > 0:\n",
" labels[mask] = np.bincount(y_true[mask]).argmax()\n",
" else:\n",
" labels[mask] = 0\n",
" return labels\n",
"\n",
"y_pred_aligned = align_labels(y_test_k, y_pred_test, optimal_k)\n",
"evaluation(y_test_k, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
35 changes: 35 additions & 0 deletions ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

# ex1 - 信用卡詐欺偵測練習

## 資料集
- 來自 Kaggle: `mlg-ulb/creditcardfraud`
- 含 284,807 筆交易資料,492 筆為詐欺(約 0.172%)

## 任務目標
1. 實作 **監督式學習**模型(Random Forest)
2. 實作 **非監督式學習**模型(KMeans)
3. 嘗試透過優化提升模型效能

## 模型與結果

### 🎯 監督式學習:Random Forest

| 模型版本 | Precision | Recall | F1 Score |
|--------------------|-----------|--------|----------|
| 原始 RF | 0.94 | 0.82 | 0.88 |
| RF(Balanced) | 0.97 | 0.77 | 0.86 |

> 使用 `class_weight='balanced'` 可提升對少數類別的關注,有效提升精確率。

### 🔍 非監督式學習:KMeans

| Precision | Recall | F1 Score |
|-----------|--------|----------|
| 0.78 | 0.36 | 0.50 |

> 表現雖不如 RF,但在無標籤情況下仍有不錯的 precision,可作為輔助工具。

## 結論
- Random Forest 經適當調參後能有效偵測詐欺交易。
- KMeans 可作為無監督的異常預警機制。
- 建議未來結合兩種方法(如 IsolationForest + RF)進行模型融合,可能進一步提升 recall。
106 changes: 106 additions & 0 deletions ex1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import kagglehub


# define evaluation function
def evaluation(y_true, y_pred, model_name="Model"):
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f'\n{model_name} Evaluation:')
print('===' * 15)
print(' Accuracy:', accuracy)
print(' Precision Score:', precision)
print(' Recall Score:', recall)
print(' F1 Score:', f1)
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

# general setting. do not change TEST_SIZE
RANDOM_SEED = 42
TEST_SIZE = 0.3

# load dataset(from kagglehub)
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# prepare data
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])
Y = np.asarray(data.iloc[:, data.columns == 'Class'])

# split training set and data set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# build Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_model.fit(X_train, y_train)

# predict and print result
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

rf_model = RandomForestClassifier(
n_estimators=200,
class_weight='balanced',
random_state=RANDOM_SEED
)
rf_model.fit(X_train, y_train.ravel())
y_pred = rf_model.predict(X_test)
evaluation(y_test.ravel(), y_pred, model_name="Random Forest (Balanced)")
# KMeans

# Extract features and labels
X = np.asarray(data.drop(columns=['Class']))
y = np.asarray(data['Class'])

# Split the dataset into training and testing sets (with stratification)
x_train, x_test, y_train, y_test = train_test_split(
X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Select a small sample of normal (non-fraud) data for unsupervised training
n_x_train = x_train[y_train == 0]
n_x_train = n_x_train[:1000]

scores = []
for k in range(2, 5):
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train)
score = silhouette_score(n_x_train, kmeans.labels_)
scores.append(score)

optimal_k = np.argmax(scores) + 2
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train)
y_pred_test = kmeans.predict(x_test)
def align_labels(y_true, y_pred, n_clusters):
labels = np.zeros_like(y_pred)
for i in range(n_clusters):
mask = (y_pred == i)
if np.sum(mask) > 0:
labels[mask] = np.bincount(y_true[mask]).argmax()
else:
labels[mask] = 0 # Default to normal class
return labels

y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)

evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")

Loading