From 91617ec10b8601c607364a130c52855115bda014 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Thu, 29 May 2025 17:22:13 +0800 Subject: [PATCH 1/9] ex1 --- ex1 | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 ex1 diff --git a/ex1 b/ex1 new file mode 100644 index 0000000..8662d37 --- /dev/null +++ b/ex1 @@ -0,0 +1,22 @@ +1. 前處理 +- 資料來源:`data/creditcard.csv` +- 刪除 `Time` 欄位,對 `Amount` 做 StandardScaler。 + +2. 監督式實驗:SMOTE + RandomForest +- SMOTE 過採樣後的訓練集:正/負樣本比例接近平衡。 +- RandomForest 參數:`n_estimators=100, class_weight='balanced'`。 +- 結果: + - Precision、Recall、F1-score、ROC AUC 如下表。 + +| 類別 | Precision | Recall | F1 | +|----|---------|-------|-------| +| 0 | … | … | … | +| 1 | … | … | … | + +3. 非監督式實驗:KMeans(k=3) +- 對全資料做標準化後聚成三群,每群以多數真實標籤做預測 +- 結果: + - Precision、Recall、F1-score 如下。 + +4. 結論 +- 監督式方法效果遠優於非監督式。 From 14015c2e0fbf209dbc508ea984c8ebfd02415441 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Thu, 29 May 2025 17:40:24 +0800 Subject: [PATCH 2/9] Add files via upload --- ex1.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++ ex2.py | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 ex1.py create mode 100644 ex2.py diff --git a/ex1.py b/ex1.py new file mode 100644 index 0000000..2aba3ad --- /dev/null +++ b/ex1.py @@ -0,0 +1,88 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from imblearn.over_sampling import SMOTE +from sklearn.ensemble import RandomForestClassifier +from sklearn.cluster import KMeans +from sklearn.metrics import ( + classification_report, + confusion_matrix, + roc_auc_score +) + +# 固定參數 +RANDOM_SEED = 42 +TEST_SIZE = 0.3 + +def supervised_pipeline(X_train, X_test, y_train, y_test): + """監督式:SMOTE + RandomForest""" + sm = SMOTE(random_state=RANDOM_SEED) + X_res, y_res = sm.fit_resample(X_train, y_train) + + clf = RandomForestClassifier( + n_estimators=100, + class_weight='balanced', + random_state=RANDOM_SEED + ) + clf.fit(X_res, y_res) + + y_pred = clf.predict(X_test) + y_prob = clf.predict_proba(X_test)[:,1] + + print("\n--- 監督式學習:SMOTE + RandomForest ---") + print(classification_report(y_test, y_pred, digits=4)) + print("Confusion Matrix:") + print(confusion_matrix(y_test, y_pred)) + print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}") + +def unsupervised_pipeline(X_all, y_all): + """非監督式:KMeans(k=3) 異常偵測""" + # 全資料標準化 + X_scaled = StandardScaler().fit_transform(X_all) + + k = 3 + km = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(X_scaled) + labels = km.labels_ + + # 群內多數標籤當預測 + y_pred = np.zeros_like(labels) + for c in range(k): + mask = (labels == c) + majority = pd.Series(y_all[mask]).mode()[0] + y_pred[mask] = majority + + print("\n--- 非監督式學習:KMeans (k=3) ---") + print(classification_report(y_all, y_pred, digits=4)) + print("Confusion Matrix:") + print(confusion_matrix(y_all, y_pred)) + +def main(): + # 1. 讀檔 & 前處理 + data = pd.read_csv("data/creditcard.csv") + data = data.drop(columns=['Time']) + data['Amount'] = StandardScaler().fit_transform( + data['Amount'].values.reshape(-1,1) + ) + + X = data.drop(columns=['Class']).values + y = data['Class'].values + + # 2. 切 supervised 的 train/test + X_train, X_test, y_train, y_test = train_test_split( + X, y, + test_size=TEST_SIZE, + random_state=RANDOM_SEED, + stratify=y + ) + + # 3. 執行監督式流程 + supervised_pipeline(X_train, X_test, y_train, y_test) + + # 4. 
執行非監督式流程(用全部資料評估) + unsupervised_pipeline(X, y) + +if __name__ == "__main__": + main() diff --git a/ex2.py b/ex2.py new file mode 100644 index 0000000..fb0cc6c --- /dev/null +++ b/ex2.py @@ -0,0 +1,101 @@ +import numpy as np +import pandas as pd + +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.ensemble import IsolationForest +from xgboost import XGBClassifier +from sklearn.metrics import ( + classification_report, + confusion_matrix, + roc_auc_score, + f1_score +) + +# 固定參數 +RANDOM_SEED = 42 +TEST_SIZE = 0.3 + +def evaluate_pipeline(cont_list, percentile_list): + # 讀檔 & 前處理 + df = pd.read_csv("data/creditcard.csv") + df = df.drop(columns=["Time"]) + df["Amount"] = StandardScaler().fit_transform( + df["Amount"].values.reshape(-1, 1) + ) + X = df.drop(columns=["Class"]).values + y = df["Class"].values + + # 切分 + X_train, X_test, y_train, y_test = train_test_split( + X, y, + test_size=TEST_SIZE, + random_state=RANDOM_SEED, + stratify=y + ) + + # 訓練 XGBoost(全資料) + xgb = XGBClassifier( + n_estimators=100, + random_state=RANDOM_SEED, + use_label_encoder=False, + eval_metric="logloss" + ) + xgb.fit(X_train, y_train) + + best_cfg = None + best_f1 = 0 + + # 掃描不同的 contamination + for cont in cont_list: + iso = IsolationForest( + contamination=cont, + random_state=RANDOM_SEED + ) + iso.fit(X_train[y_train==0]) + + # decision_function 取分數 + scores = -iso.decision_function(X_test) + + # 在這個 contamination 下,掃描不同的 percentile 作為 threshold + for pct in percentile_list: + thr = np.percentile(scores, pct) + mask_anom = (scores >= thr) + + # 合併預測 + y_pred = np.zeros_like(y_test) + if mask_anom.any(): + y_pred[mask_anom] = xgb.predict(X_test[mask_anom]) + + # 計算 F1 + f1 = f1_score(y_test, y_pred) + if f1 > best_f1: + best_f1 = f1 + best_cfg = (cont, pct, thr, f1) + + cont, pct, thr, f1 = best_cfg + print(f"\n最佳配置 → contamination={cont}, percentile={pct:.1f}, thr={thr:.3f}") + print(f"對應 F1 = {f1:.4f}\n") + + # 用最佳配置重跑一次並印最終報告 + iso = IsolationForest(contamination=cont, random_state=RANDOM_SEED) + iso.fit(X_train[y_train==0]) + scores = -iso.decision_function(X_test) + mask_anom = (scores >= thr) + + y_pred = np.zeros_like(y_test) + y_pred[mask_anom] = xgb.predict(X_test[mask_anom]) + y_prob = np.zeros_like(y_test, dtype=float) + y_prob[mask_anom] = xgb.predict_proba(X_test[mask_anom])[:,1] + + print("=== 最終評估 ===") + print(classification_report(y_test, y_pred, digits=4)) + print("Confusion Matrix:") + print(confusion_matrix(y_test, y_pred)) + print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}") + +if __name__ == "__main__": + # 自訂 contamination 與 percentile 的範圍 + cons = [0.001, 0.002, 0.005, 0.01] + pers = [99, 99.5, 99.8, 99.9] + evaluate_pipeline(cons, pers) From f1bc134dd6736bd0e803e400c0c8274ed2feca31 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:37:55 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=E8=B3=87=E6=96=99=E5=A4=BE=E4=B8=8A?= =?UTF-8?q?=E5=82=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ACS111151_ex/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 ACS111151_ex/README.md diff --git a/ACS111151_ex/README.md b/ACS111151_ex/README.md new file mode 100644 index 0000000..85958b5 --- /dev/null +++ b/ACS111151_ex/README.md @@ -0,0 +1 @@ +作業一放這裡 From 1cad0baf44f9e42658d9e6b1d46139d32995e365 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:38:42 +0800 Subject: [PATCH 4/9] delete ex1 MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit 作業一在資料夾裡 --- ex1 | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/ex1 b/ex1 index 8662d37..8b13789 100644 --- a/ex1 +++ b/ex1 @@ -1,22 +1 @@ -1. 前處理 -- 資料來源:`data/creditcard.csv` -- 刪除 `Time` 欄位,對 `Amount` 做 StandardScaler。 -2. 監督式實驗:SMOTE + RandomForest -- SMOTE 過採樣後的訓練集:正/負樣本比例接近平衡。 -- RandomForest 參數:`n_estimators=100, class_weight='balanced'`。 -- 結果: - - Precision、Recall、F1-score、ROC AUC 如下表。 - -| 類別 | Precision | Recall | F1 | -|----|---------|-------|-------| -| 0 | … | … | … | -| 1 | … | … | … | - -3. 非監督式實驗:KMeans(k=3) -- 對全資料做標準化後聚成三群,每群以多數真實標籤做預測 -- 結果: - - Precision、Recall、F1-score 如下。 - -4. 結論 -- 監督式方法效果遠優於非監督式。 From 9f94b5be317b7d2fb53fd6e2fa2fda0cfe976a4e Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:39:27 +0800 Subject: [PATCH 5/9] delete ex1.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作業一在資料夾裡 --- ex1.py | 87 ---------------------------------------------------------- 1 file changed, 87 deletions(-) diff --git a/ex1.py b/ex1.py index 2aba3ad..d3f5a12 100644 --- a/ex1.py +++ b/ex1.py @@ -1,88 +1 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from imblearn.over_sampling import SMOTE -from sklearn.ensemble import RandomForestClassifier -from sklearn.cluster import KMeans -from sklearn.metrics import ( - classification_report, - confusion_matrix, - roc_auc_score -) - -# 固定參數 -RANDOM_SEED = 42 -TEST_SIZE = 0.3 - -def supervised_pipeline(X_train, X_test, y_train, y_test): - """監督式:SMOTE + RandomForest""" - sm = SMOTE(random_state=RANDOM_SEED) - X_res, y_res = sm.fit_resample(X_train, y_train) - - clf = RandomForestClassifier( - n_estimators=100, - class_weight='balanced', - random_state=RANDOM_SEED - ) - clf.fit(X_res, y_res) - - y_pred = clf.predict(X_test) - y_prob = clf.predict_proba(X_test)[:,1] - - print("\n--- 監督式學習:SMOTE + RandomForest ---") - print(classification_report(y_test, y_pred, digits=4)) - print("Confusion Matrix:") - print(confusion_matrix(y_test, y_pred)) - print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}") - -def unsupervised_pipeline(X_all, y_all): - """非監督式:KMeans(k=3) 異常偵測""" - # 全資料標準化 - X_scaled = StandardScaler().fit_transform(X_all) - - k = 3 - km = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(X_scaled) - labels = km.labels_ - - # 群內多數標籤當預測 - y_pred = np.zeros_like(labels) - for c in range(k): - mask = (labels == c) - majority = pd.Series(y_all[mask]).mode()[0] - y_pred[mask] = majority - - print("\n--- 非監督式學習:KMeans (k=3) ---") - print(classification_report(y_all, y_pred, digits=4)) - print("Confusion Matrix:") - print(confusion_matrix(y_all, y_pred)) - -def main(): - # 1. 讀檔 & 前處理 - data = pd.read_csv("data/creditcard.csv") - data = data.drop(columns=['Time']) - data['Amount'] = StandardScaler().fit_transform( - data['Amount'].values.reshape(-1,1) - ) - - X = data.drop(columns=['Class']).values - y = data['Class'].values - - # 2. 切 supervised 的 train/test - X_train, X_test, y_train, y_test = train_test_split( - X, y, - test_size=TEST_SIZE, - random_state=RANDOM_SEED, - stratify=y - ) - - # 3. 執行監督式流程 - supervised_pipeline(X_train, X_test, y_train, y_test) - - # 4. 
執行非監督式流程(用全部資料評估) - unsupervised_pipeline(X, y) - -if __name__ == "__main__": - main() From 3fa2cf80c8f8c972e41565e6780acad753d11a75 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:40:12 +0800 Subject: [PATCH 6/9] delete ex2.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 作業二在資料夾2 --- ex2.py | 100 --------------------------------------------------------- 1 file changed, 100 deletions(-) diff --git a/ex2.py b/ex2.py index fb0cc6c..d3f5a12 100644 --- a/ex2.py +++ b/ex2.py @@ -1,101 +1 @@ -import numpy as np -import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import StandardScaler -from sklearn.ensemble import IsolationForest -from xgboost import XGBClassifier -from sklearn.metrics import ( - classification_report, - confusion_matrix, - roc_auc_score, - f1_score -) - -# 固定參數 -RANDOM_SEED = 42 -TEST_SIZE = 0.3 - -def evaluate_pipeline(cont_list, percentile_list): - # 讀檔 & 前處理 - df = pd.read_csv("data/creditcard.csv") - df = df.drop(columns=["Time"]) - df["Amount"] = StandardScaler().fit_transform( - df["Amount"].values.reshape(-1, 1) - ) - X = df.drop(columns=["Class"]).values - y = df["Class"].values - - # 切分 - X_train, X_test, y_train, y_test = train_test_split( - X, y, - test_size=TEST_SIZE, - random_state=RANDOM_SEED, - stratify=y - ) - - # 訓練 XGBoost(全資料) - xgb = XGBClassifier( - n_estimators=100, - random_state=RANDOM_SEED, - use_label_encoder=False, - eval_metric="logloss" - ) - xgb.fit(X_train, y_train) - - best_cfg = None - best_f1 = 0 - - # 掃描不同的 contamination - for cont in cont_list: - iso = IsolationForest( - contamination=cont, - random_state=RANDOM_SEED - ) - iso.fit(X_train[y_train==0]) - - # decision_function 取分數 - scores = -iso.decision_function(X_test) - - # 在這個 contamination 下,掃描不同的 percentile 作為 threshold - for pct in percentile_list: - thr = np.percentile(scores, pct) - mask_anom = (scores >= thr) - - # 合併預測 - y_pred = np.zeros_like(y_test) - if mask_anom.any(): - y_pred[mask_anom] = xgb.predict(X_test[mask_anom]) - - # 計算 F1 - f1 = f1_score(y_test, y_pred) - if f1 > best_f1: - best_f1 = f1 - best_cfg = (cont, pct, thr, f1) - - cont, pct, thr, f1 = best_cfg - print(f"\n最佳配置 → contamination={cont}, percentile={pct:.1f}, thr={thr:.3f}") - print(f"對應 F1 = {f1:.4f}\n") - - # 用最佳配置重跑一次並印最終報告 - iso = IsolationForest(contamination=cont, random_state=RANDOM_SEED) - iso.fit(X_train[y_train==0]) - scores = -iso.decision_function(X_test) - mask_anom = (scores >= thr) - - y_pred = np.zeros_like(y_test) - y_pred[mask_anom] = xgb.predict(X_test[mask_anom]) - y_prob = np.zeros_like(y_test, dtype=float) - y_prob[mask_anom] = xgb.predict_proba(X_test[mask_anom])[:,1] - - print("=== 最終評估 ===") - print(classification_report(y_test, y_pred, digits=4)) - print("Confusion Matrix:") - print(confusion_matrix(y_test, y_pred)) - print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}") - -if __name__ == "__main__": - # 自訂 contamination 與 percentile 的範圍 - cons = [0.001, 0.002, 0.005, 0.01] - pers = [99, 99.5, 99.8, 99.9] - evaluate_pipeline(cons, pers) From 2504308d55f51086db06a391369c8a115fb9cbf9 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:41:55 +0800 Subject: [PATCH 7/9] =?UTF-8?q?=E4=BD=9C=E6=A5=AD=E4=B8=80=E4=B8=8A?= =?UTF-8?q?=E5=82=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ACS111151_ex/ex1.ipynb | 116 +++++++++++++++++++++++++++++++++++++++++ ACS111151_ex/ex1.md | 22 ++++++++ 2 files changed, 138 
insertions(+) create mode 100644 ACS111151_ex/ex1.ipynb create mode 100644 ACS111151_ex/ex1.md diff --git a/ACS111151_ex/ex1.ipynb b/ACS111151_ex/ex1.ipynb new file mode 100644 index 0000000..fa60f66 --- /dev/null +++ b/ACS111151_ex/ex1.ipynb @@ -0,0 +1,116 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dQc5pfBVV_SF" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from imblearn.over_sampling import SMOTE\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import (\n", + " classification_report,\n", + " confusion_matrix,\n", + " roc_auc_score\n", + ")\n", + "\n", + "# 固定參數\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "def supervised_pipeline(X_train, X_test, y_train, y_test):\n", + " \"\"\"監督式:SMOTE + RandomForest\"\"\"\n", + " sm = SMOTE(random_state=RANDOM_SEED)\n", + " X_res, y_res = sm.fit_resample(X_train, y_train)\n", + "\n", + " clf = RandomForestClassifier(\n", + " n_estimators=100,\n", + " class_weight='balanced',\n", + " random_state=RANDOM_SEED\n", + " )\n", + " clf.fit(X_res, y_res)\n", + "\n", + " y_pred = clf.predict(X_test)\n", + " y_prob = clf.predict_proba(X_test)[:,1]\n", + "\n", + " print(\"\\n--- 監督式學習:SMOTE + RandomForest ---\")\n", + " print(classification_report(y_test, y_pred, digits=4))\n", + " print(\"Confusion Matrix:\")\n", + " print(confusion_matrix(y_test, y_pred))\n", + " print(f\"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}\")\n", + "\n", + "def unsupervised_pipeline(X_all, y_all):\n", + " \"\"\"非監督式:KMeans(k=3) 異常偵測\"\"\"\n", + " # 全資料標準化\n", + " X_scaled = StandardScaler().fit_transform(X_all)\n", + "\n", + " k = 3\n", + " km = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(X_scaled)\n", + " labels = km.labels_\n", + "\n", + " # 群內多數標籤當預測\n", + " y_pred = np.zeros_like(labels)\n", + " for c in range(k):\n", + " mask = (labels == c)\n", + " majority = pd.Series(y_all[mask]).mode()[0]\n", + " y_pred[mask] = majority\n", + "\n", + " print(\"\\n--- 非監督式學習:KMeans (k=3) ---\")\n", + " print(classification_report(y_all, y_pred, digits=4))\n", + " print(\"Confusion Matrix:\")\n", + " print(confusion_matrix(y_all, y_pred))\n", + "\n", + "def main():\n", + " # 1. 讀檔 & 前處理\n", + " data = pd.read_csv(\"data/creditcard.csv\")\n", + " data = data.drop(columns=['Time'])\n", + " data['Amount'] = StandardScaler().fit_transform(\n", + " data['Amount'].values.reshape(-1,1)\n", + " )\n", + "\n", + " X = data.drop(columns=['Class']).values\n", + " y = data['Class'].values\n", + "\n", + " # 2. 切 supervised 的 train/test\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=TEST_SIZE,\n", + " random_state=RANDOM_SEED,\n", + " stratify=y\n", + " )\n", + "\n", + " # 3. 執行監督式流程\n", + " supervised_pipeline(X_train, X_test, y_train, y_test)\n", + "\n", + " # 4. 
執行非監督式流程(用全部資料評估)\n", + " unsupervised_pipeline(X, y)\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n" + ] + } + ] +} \ No newline at end of file diff --git a/ACS111151_ex/ex1.md b/ACS111151_ex/ex1.md new file mode 100644 index 0000000..8662d37 --- /dev/null +++ b/ACS111151_ex/ex1.md @@ -0,0 +1,22 @@ +1. 前處理 +- 資料來源:`data/creditcard.csv` +- 刪除 `Time` 欄位,對 `Amount` 做 StandardScaler。 + +2. 監督式實驗:SMOTE + RandomForest +- SMOTE 過採樣後的訓練集:正/負樣本比例接近平衡。 +- RandomForest 參數:`n_estimators=100, class_weight='balanced'`。 +- 結果: + - Precision、Recall、F1-score、ROC AUC 如下表。 + +| 類別 | Precision | Recall | F1 | +|----|---------|-------|-------| +| 0 | … | … | … | +| 1 | … | … | … | + +3. 非監督式實驗:KMeans(k=3) +- 對全資料做標準化後聚成三群,每群以多數真實標籤做預測 +- 結果: + - Precision、Recall、F1-score 如下。 + +4. 結論 +- 監督式方法效果遠優於非監督式。 From bdb81d5e3a854370bee8a9126ef3e7274389eff9 Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:46:32 +0800 Subject: [PATCH 8/9] =?UTF-8?q?=E4=BD=9C=E6=A5=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ACS111151_ex2/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 ACS111151_ex2/README.md diff --git a/ACS111151_ex2/README.md b/ACS111151_ex2/README.md new file mode 100644 index 0000000..8898a7c --- /dev/null +++ b/ACS111151_ex2/README.md @@ -0,0 +1 @@ +作業2放這裡 From 1e13f08a9148e778f9d8513ac5ac984d05166b2b Mon Sep 17 00:00:00 2001 From: ferdinKuan Date: Sat, 14 Jun 2025 15:47:14 +0800 Subject: [PATCH 9/9] =?UTF-8?q?=E4=BD=9C=E6=A5=AD2=E6=94=BE=E9=80=99?= =?UTF-8?q?=E9=82=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ACS111151_ex2/ex2.ipynb | 129 ++++++++++++++++++++++++++++++++++++++++ ACS111151_ex2/ex2.md | 55 +++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 ACS111151_ex2/ex2.ipynb create mode 100644 ACS111151_ex2/ex2.md diff --git a/ACS111151_ex2/ex2.ipynb b/ACS111151_ex2/ex2.ipynb new file mode 100644 index 0000000..c91fda4 --- /dev/null +++ b/ACS111151_ex2/ex2.ipynb @@ -0,0 +1,129 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dQc5pfBVV_SF" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import IsolationForest\n", + "from xgboost import XGBClassifier\n", + "from sklearn.metrics import (\n", + " classification_report,\n", + " confusion_matrix,\n", + " roc_auc_score,\n", + " f1_score\n", + ")\n", + "\n", + "# 固定參數\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "def evaluate_pipeline(cont_list, percentile_list):\n", + " # 讀檔 & 前處理\n", + " df = pd.read_csv(\"data/creditcard.csv\")\n", + " df = df.drop(columns=[\"Time\"])\n", + " df[\"Amount\"] = StandardScaler().fit_transform(\n", + " df[\"Amount\"].values.reshape(-1, 1)\n", + " )\n", + " X = df.drop(columns=[\"Class\"]).values\n", + " y = df[\"Class\"].values\n", + "\n", + " # 切分\n", + " X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y,\n", + " test_size=TEST_SIZE,\n", + " random_state=RANDOM_SEED,\n", + " stratify=y\n", + " )\n", + "\n", + " # 訓練 XGBoost(全資料)\n", + " xgb = XGBClassifier(\n", + " 
n_estimators=100,\n", + " random_state=RANDOM_SEED,\n", + " use_label_encoder=False,\n", + " eval_metric=\"logloss\"\n", + " )\n", + " xgb.fit(X_train, y_train)\n", + "\n", + " best_cfg = None\n", + " best_f1 = 0\n", + "\n", + " # 掃描不同的 contamination\n", + " for cont in cont_list:\n", + " iso = IsolationForest(\n", + " contamination=cont,\n", + " random_state=RANDOM_SEED\n", + " )\n", + " iso.fit(X_train[y_train==0])\n", + "\n", + " # decision_function 取分數\n", + " scores = -iso.decision_function(X_test)\n", + "\n", + " # 在這個 contamination 下,掃描不同的 percentile 作為 threshold\n", + " for pct in percentile_list:\n", + " thr = np.percentile(scores, pct)\n", + " mask_anom = (scores >= thr)\n", + "\n", + " # 合併預測\n", + " y_pred = np.zeros_like(y_test)\n", + " if mask_anom.any():\n", + " y_pred[mask_anom] = xgb.predict(X_test[mask_anom])\n", + "\n", + " # 計算 F1\n", + " f1 = f1_score(y_test, y_pred)\n", + " if f1 > best_f1:\n", + " best_f1 = f1\n", + " best_cfg = (cont, pct, thr, f1)\n", + "\n", + " cont, pct, thr, f1 = best_cfg\n", + " print(f\"\\n最佳配置 → contamination={cont}, percentile={pct:.1f}, thr={thr:.3f}\")\n", + " print(f\"對應 F1 = {f1:.4f}\\n\")\n", + "\n", + " # 用最佳配置重跑一次並印最終報告\n", + " iso = IsolationForest(contamination=cont, random_state=RANDOM_SEED)\n", + " iso.fit(X_train[y_train==0])\n", + " scores = -iso.decision_function(X_test)\n", + " mask_anom = (scores >= thr)\n", + "\n", + " y_pred = np.zeros_like(y_test)\n", + " y_pred[mask_anom] = xgb.predict(X_test[mask_anom])\n", + " y_prob = np.zeros_like(y_test, dtype=float)\n", + " y_prob[mask_anom] = xgb.predict_proba(X_test[mask_anom])[:,1]\n", + "\n", + " print(\"=== 最終評估 ===\")\n", + " print(classification_report(y_test, y_pred, digits=4))\n", + " print(\"Confusion Matrix:\")\n", + " print(confusion_matrix(y_test, y_pred))\n", + " print(f\"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " # 自訂 contamination 與 percentile 的範圍\n", + " cons = [0.001, 0.002, 0.005, 0.01]\n", + " pers = [99, 99.5, 99.8, 99.9]\n", + " evaluate_pipeline(cons, pers)\n" + ] + } + ] +} \ No newline at end of file diff --git a/ACS111151_ex2/ex2.md b/ACS111151_ex2/ex2.md new file mode 100644 index 0000000..838a2ea --- /dev/null +++ b/ACS111151_ex2/ex2.md @@ -0,0 +1,55 @@ +為什麼要用 AutoEncoder + XGBoost? 
+AutoEncoder is a neural network architecture that compresses and then reconstructs its input. If a sample cannot be reconstructed well, it is likely an anomaly.
+
+XGBoost is a widely used gradient-boosted tree model that performs well on imbalanced data.
+
+Combining the two: the AutoEncoder's anomaly-detection ability produces an "anomaly score" for each sample, which is added to XGBoost as an extra feature so the model can identify fraudulent transactions more accurately.
+
+Implementation steps and code walkthrough
+Load and preprocess the data
+df = pd.read_csv("creditcard.csv")
+X = df.drop(['Class', 'Time'], axis=1)
+y = df['Class']
+Class is the label: 0 means a normal transaction, 1 means fraud.
+Time is removed because it contributes little to learning.
+
+Normalize the data with MinMaxScaler:
+scaler = MinMaxScaler()
+X_scaled = scaler.fit_transform(X)
+
+Train the AutoEncoder
+X_normal = X_scaled[y == 0]  # use normal samples only
+
+Build a simple AutoEncoder (the hidden layer is 16-dimensional):
+input_dim = X_normal.shape[1]
+input_layer = layers.Input(shape=(input_dim,))
+encoded = layers.Dense(16, activation='relu')(input_layer)
+decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)
+
+autoencoder = models.Model(inputs=input_layer, outputs=decoded)
+autoencoder.compile(optimizer='adam', loss='mse')
+autoencoder.fit(X_normal, X_normal, epochs=10, batch_size=256, shuffle=True)
+
+Compute the reconstruction error (anomaly score)
+X_reconstructed = autoencoder.predict(X_scaled)
+recon_error = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)
+recon_error is the error between each sample and its reconstruction; the larger the value, the more likely the sample is anomalous.
+
+Append the anomaly score to the original features
+X_with_score = pd.DataFrame(X_scaled, columns=X.columns)
+X_with_score['recon_error'] = recon_error
+
+Classify with XGBoost
+X_train, X_test, y_train, y_test = train_test_split(X_with_score, y, test_size=0.2, stratify=y)
+
+model = xgb.XGBClassifier(scale_pos_weight=10, use_label_encoder=False, eval_metric='logloss')
+model.fit(X_train, y_train)
+
+y_pred = model.predict(X_test)
+y_prob = model.predict_proba(X_test)[:, 1]
+scale_pos_weight=10 compensates for the class imbalance and can be tuned to the actual fraud ratio.
+
+Evaluate the model
+print(classification_report(y_test, y_pred))
+print("AUC Score:", roc_auc_score(y_test, y_prob))
+This prints Precision, Recall, F1-score, and the AUC, showing how well the model identifies the fraud class.
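
The ex2.md walkthrough above shows its code in fragments, without the imports it relies on (numpy, pandas, xgboost, scikit-learn, and tensorflow.keras for layers/models). A minimal end-to-end sketch of the same flow is given below; the imports, the fixed random_state for the split, and the omission of the deprecated use_label_encoder flag are assumptions, while the file name, layer sizes, epochs, test_size, and scale_pos_weight are taken from the prose. Treat it as an illustration of the described pipeline, not the author's exact notebook code.

```python
# Sketch of the ex2.md flow: MinMaxScaler -> AutoEncoder trained on normal rows
# -> reconstruction error as an extra feature -> XGBoost classifier.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras import layers, models

RANDOM_SEED = 42  # assumed; ex2.md does not fix a seed

# Load and preprocess (ex2.md reads "creditcard.csv"; the earlier scripts use "data/creditcard.csv")
df = pd.read_csv("creditcard.csv")
X = df.drop(['Class', 'Time'], axis=1)
y = df['Class']
X_scaled = MinMaxScaler().fit_transform(X)

# Train the AutoEncoder on normal rows only (as in ex2.md this uses all normal
# rows, including ones that later land in the test split)
X_normal = X_scaled[(y == 0).values]

input_dim = X_normal.shape[1]
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(16, activation='relu')(input_layer)
decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)
autoencoder = models.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_normal, X_normal, epochs=10, batch_size=256, shuffle=True)

# Reconstruction error as the anomaly score
X_reconstructed = autoencoder.predict(X_scaled)
recon_error = np.mean(np.power(X_scaled - X_reconstructed, 2), axis=1)

# Append the score to the features and classify with XGBoost
X_with_score = pd.DataFrame(X_scaled, columns=X.columns)
X_with_score['recon_error'] = recon_error

X_train, X_test, y_train, y_test = train_test_split(
    X_with_score, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED
)

model = xgb.XGBClassifier(
    scale_pos_weight=10,    # rough imbalance correction, per ex2.md
    eval_metric='logloss'   # use_label_encoder omitted; not needed in recent xgboost
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred, digits=4))
print("AUC Score:", roc_auc_score(y_test, y_prob))
```

Note that, following the original description, the scaler and the AutoEncoder are fitted before the train/test split, so the anomaly score is computed with some knowledge of the test rows; fitting them on the training portion only would give a stricter evaluation.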