Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ACS111151_ex/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
作業一放這裡
116 changes: 116 additions & 0 deletions ACS111151_ex/ex1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dQc5pfBVV_SF"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from imblearn.over_sampling import SMOTE\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import (\n",
" classification_report,\n",
" confusion_matrix,\n",
" roc_auc_score\n",
")\n",
"\n",
"# 固定參數\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"def supervised_pipeline(X_train, X_test, y_train, y_test):\n",
"    \"\"\"Supervised pipeline: SMOTE oversampling followed by RandomForest.\"\"\"\n",
"    # Balance the training split by synthesizing minority-class samples.\n",
"    oversampler = SMOTE(random_state=RANDOM_SEED)\n",
"    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)\n",
"\n",
"    # class_weight='balanced' compensates for any residual imbalance.\n",
"    model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=RANDOM_SEED)\n",
"    model.fit(X_balanced, y_balanced)\n",
"\n",
"    predictions = model.predict(X_test)\n",
"    fraud_probability = model.predict_proba(X_test)[:, 1]\n",
"\n",
"    print(\"\\n--- 監督式學習:SMOTE + RandomForest ---\")\n",
"    print(classification_report(y_test, predictions, digits=4))\n",
"    print(\"Confusion Matrix:\")\n",
"    print(confusion_matrix(y_test, predictions))\n",
"    print(f\"ROC AUC: {roc_auc_score(y_test, fraud_probability):.4f}\")\n",
"\n",
"def unsupervised_pipeline(X_all, y_all):\n",
"    \"\"\"Unsupervised pipeline: KMeans (k=3) used as a crude anomaly detector.\"\"\"\n",
"    # Standardize the full feature matrix before clustering.\n",
"    features = StandardScaler().fit_transform(X_all)\n",
"\n",
"    n_clusters = 3\n",
"    cluster_ids = KMeans(n_clusters=n_clusters, random_state=RANDOM_SEED).fit(features).labels_\n",
"\n",
"    # Each cluster predicts the majority ground-truth label of its members.\n",
"    y_pred = np.zeros_like(cluster_ids)\n",
"    for cluster in range(n_clusters):\n",
"        members = (cluster_ids == cluster)\n",
"        y_pred[members] = pd.Series(y_all[members]).mode()[0]\n",
"\n",
"    print(\"\\n--- 非監督式學習:KMeans (k=3) ---\")\n",
"    print(classification_report(y_all, y_pred, digits=4))\n",
"    print(\"Confusion Matrix:\")\n",
"    print(confusion_matrix(y_all, y_pred))\n",
"\n",
"def main():\n",
"    \"\"\"Load the credit-card data, preprocess it, and run both pipelines.\"\"\"\n",
"    # 1. Load & preprocess: drop Time, standardize Amount.\n",
"    df = pd.read_csv(\"data/creditcard.csv\")\n",
"    df = df.drop(columns=['Time'])\n",
"    df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))\n",
"\n",
"    X = df.drop(columns=['Class']).values\n",
"    y = df['Class'].values\n",
"\n",
"    # 2. Stratified hold-out split for the supervised experiment.\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n",
"    )\n",
"\n",
"    # 3. Supervised experiment.\n",
"    supervised_pipeline(X_train, X_test, y_train, y_test)\n",
"\n",
"    # 4. Unsupervised experiment (evaluated on the full dataset).\n",
"    unsupervised_pipeline(X, y)\n",
"\n",
"if __name__ == \"__main__\":\n",
"    main()\n"
]
}
]
}
22 changes: 22 additions & 0 deletions ACS111151_ex/ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
1. 前處理
- 資料來源:`data/creditcard.csv`
- 刪除 `Time` 欄位,對 `Amount` 做 StandardScaler。

2. 監督式實驗:SMOTE + RandomForest
- SMOTE 過採樣後的訓練集:正/負樣本比例接近平衡。
- RandomForest 參數:`n_estimators=100, class_weight='balanced'`。
- 結果:
- Precision、Recall、F1-score、ROC AUC 如下表。

| 類別 | Precision | Recall | F1 |
|----|---------|-------|-------|
| 0 | … | … | … |
| 1 | … | … | … |

3. 非監督式實驗:KMeans(k=3)
- 對全資料做標準化後聚成三群,每群以多數真實標籤做預測
- 結果:
- Precision、Recall、F1-score 如下。

4. 結論
- 監督式方法效果遠優於非監督式。
1 change: 1 addition & 0 deletions ACS111151_ex2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
作業二放這裡
129 changes: 129 additions & 0 deletions ACS111151_ex2/ex2.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dQc5pfBVV_SF"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import IsolationForest\n",
"from xgboost import XGBClassifier\n",
"from sklearn.metrics import (\n",
" classification_report,\n",
" confusion_matrix,\n",
" roc_auc_score,\n",
" f1_score\n",
")\n",
"\n",
"# 固定參數\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"def evaluate_pipeline(cont_list, percentile_list):\n",
"    \"\"\"Hybrid anomaly-detection pipeline: IsolationForest gating + XGBoost.\n",
"\n",
"    Scans every (contamination, percentile) pair, keeps the configuration\n",
"    with the best F1 on the test split, then re-runs it and prints a final\n",
"    classification report.\n",
"\n",
"    Args:\n",
"        cont_list: candidate IsolationForest contamination values.\n",
"        percentile_list: candidate anomaly-score percentiles used as thresholds.\n",
"    \"\"\"\n",
"    # Load & preprocess: drop Time, standardize Amount.\n",
"    df = pd.read_csv(\"data/creditcard.csv\")\n",
"    df = df.drop(columns=[\"Time\"])\n",
"    df[\"Amount\"] = StandardScaler().fit_transform(\n",
"        df[\"Amount\"].values.reshape(-1, 1)\n",
"    )\n",
"    X = df.drop(columns=[\"Class\"]).values\n",
"    y = df[\"Class\"].values\n",
"\n",
"    # Stratified hold-out split.\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y,\n",
"        test_size=TEST_SIZE,\n",
"        random_state=RANDOM_SEED,\n",
"        stratify=y\n",
"    )\n",
"\n",
"    # Train XGBoost on the full (imbalanced) training split.\n",
"    # NOTE: use_label_encoder is deprecated since xgboost 1.7; it is kept\n",
"    # only to silence older versions and is ignored by newer releases.\n",
"    xgb = XGBClassifier(\n",
"        n_estimators=100,\n",
"        random_state=RANDOM_SEED,\n",
"        use_label_encoder=False,\n",
"        eval_metric=\"logloss\"\n",
"    )\n",
"    xgb.fit(X_train, y_train)\n",
"\n",
"    best_cfg = None\n",
"    best_f1 = 0\n",
"\n",
"    # Scan contamination values; fit IsolationForest on normal samples only.\n",
"    for cont in cont_list:\n",
"        iso = IsolationForest(\n",
"            contamination=cont,\n",
"            random_state=RANDOM_SEED\n",
"        )\n",
"        iso.fit(X_train[y_train==0])\n",
"\n",
"        # Negate decision_function so that higher score = more anomalous.\n",
"        scores = -iso.decision_function(X_test)\n",
"\n",
"        # For this contamination, scan percentiles as the anomaly threshold.\n",
"        for pct in percentile_list:\n",
"            thr = np.percentile(scores, pct)\n",
"            mask_anom = (scores >= thr)\n",
"\n",
"            # Only samples flagged as anomalous are passed to XGBoost;\n",
"            # everything else defaults to the normal class (0).\n",
"            y_pred = np.zeros_like(y_test)\n",
"            if mask_anom.any():\n",
"                y_pred[mask_anom] = xgb.predict(X_test[mask_anom])\n",
"\n",
"            f1 = f1_score(y_test, y_pred)\n",
"            if f1 > best_f1:\n",
"                best_f1 = f1\n",
"                best_cfg = (cont, pct, thr, f1)\n",
"\n",
"    # BUGFIX: the original unpacked best_cfg unconditionally, crashing with\n",
"    # TypeError when no configuration ever achieved F1 > 0.\n",
"    if best_cfg is None:\n",
"        print(\"\\nNo configuration achieved F1 > 0; skipping final evaluation.\")\n",
"        return\n",
"\n",
"    cont, pct, thr, f1 = best_cfg\n",
"    print(f\"\\n最佳配置 → contamination={cont}, percentile={pct:.1f}, thr={thr:.3f}\")\n",
"    print(f\"對應 F1 = {f1:.4f}\\n\")\n",
"\n",
"    # Re-fit with the best configuration (same seed → deterministic) and report.\n",
"    iso = IsolationForest(contamination=cont, random_state=RANDOM_SEED)\n",
"    iso.fit(X_train[y_train==0])\n",
"    scores = -iso.decision_function(X_test)\n",
"    mask_anom = (scores >= thr)\n",
"\n",
"    y_pred = np.zeros_like(y_test)\n",
"    y_pred[mask_anom] = xgb.predict(X_test[mask_anom])\n",
"    y_prob = np.zeros_like(y_test, dtype=float)\n",
"    y_prob[mask_anom] = xgb.predict_proba(X_test[mask_anom])[:,1]\n",
"\n",
"    print(\"=== 最終評估 ===\")\n",
"    print(classification_report(y_test, y_pred, digits=4))\n",
"    print(\"Confusion Matrix:\")\n",
"    print(confusion_matrix(y_test, y_pred))\n",
"    print(f\"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}\")\n",
"\n",
"if __name__ == \"__main__\":\n",
" # 自訂 contamination 與 percentile 的範圍\n",
" cons = [0.001, 0.002, 0.005, 0.01]\n",
" pers = [99, 99.5, 99.8, 99.9]\n",
" evaluate_pipeline(cons, pers)\n"
]
}
]
}
55 changes: 55 additions & 0 deletions ACS111151_ex2/ex2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
為什麼要用 AutoEncoder + XGBoost?(註:本文說明的是 AutoEncoder 版本;ex2.ipynb 的實作採用 IsolationForest + XGBoost,混合式架構相同,但異常分數的來源不同——請確認兩者應以何者為準。)
AutoEncoder 是一種神經網路架構,用來壓縮並還原輸入資料。如果某筆資料「無法被還原得很好」,那可能表示這是異常樣本。

XGBoost 是目前最受歡迎的梯度提升樹模型,對不平衡資料具有良好表現。

結合這兩者:利用 AutoEncoder 偵測異常的能力,為每筆資料生成一個「異常分數」,再加入到 XGBoost 當作額外特徵,使模型能更好地識別詐欺交易。

實作步驟與程式碼解說
載入並預處理資料
df = pd.read_csv("creditcard.csv")
X = df.drop(['Class', 'Time'], axis=1)
y = df['Class']
Class 是標籤,0 代表正常交易,1 代表詐欺。
Time 被移除,因為對模型學習幫助不大。

使用 MinMaxScaler 對資料進行正規化:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

訓練 AutoEncoder
X_normal = X_scaled[y == 0] # 只用正常樣本

設計一個簡單的 AutoEncoder 結構(中間隱藏層是 16 維):
input_dim = X_normal.shape[1]
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(16, activation='relu')(input_layer)
decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_normal, X_normal, epochs=10, batch_size=256, shuffle=True)

計算重建誤差(異常分數)
X_reconstructed = autoencoder.predict(X_scaled)
recon_error = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)
這個 recon_error 就是每筆資料與其重建結果的誤差,數值愈大,表示愈可能是異常。

將異常分數加入原始特徵
X_with_score = pd.DataFrame(X_scaled, columns=X.columns)
X_with_score['recon_error'] = recon_error

使用 XGBoost 做分類
X_train, X_test, y_train, y_test = train_test_split(X_with_score, y, test_size=0.2, stratify=y)

model = xgb.XGBClassifier(scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
scale_pos_weight=10 是為了解決資料不平衡問題,可以根據實際詐欺比例微調。

模型評估
print(classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_prob))
輸出 Precision、Recall、F1-score 與 AUC 分數,讓你評估模型在詐欺樣本上的準確程度。
1 change: 1 addition & 0 deletions ex1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions ex1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions ex2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@