diff --git a/ACS111136_ex1/ex1.ipynb b/ACS111136_ex1/ex1.ipynb new file mode 100644 index 0000000..b368bf0 --- /dev/null +++ b/ACS111136_ex1/ex1.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOOH1/HuG9u0bTVs71ffNHL"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":[],"metadata":{"id":"w5iHKqTXmgVn"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import classification_report,precision_score, recall_score, f1_score\n","from sklearn.cluster import KMeans\n","from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n","import kagglehub\n","from imblearn.over_sampling import SMOTE\n","from sklearn.decomposition import PCA\n","\n","\n","# general setting. 
do not change TEST_SIZE\n","RANDOM_SEED = 42\n","TEST_SIZE = 0.3\n"],"metadata":{"id":"9GaIHp5rqDEY"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import kagglehub\n","path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n","data = pd.read_csv(f\"{path}/creditcard.csv\")"],"metadata":{"id":"3mvCBbqgC_B9"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":[],"metadata":{"id":"njccW7KIGqEN"}},{"cell_type":"code","source":["# load dataset(from kagglehub)\n","path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n","data = pd.read_csv(f\"{path}/creditcard.csv\")\n","data['Class'] = data['Class'].astype(int)\n","\n","# prepare data\n","data = data.drop(['Time'], axis=1)\n","data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n"],"metadata":{"id":"SxyaGUXIrPtp"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["fraud = data[data['Class'] == 1]\n","nonfraud = data[data['Class'] == 0]\n","print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n","print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Xq6-1WLkrR6A","executionInfo":{"status":"ok","timestamp":1748154020701,"user_tz":-480,"elapsed":70,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"633c2348-eec8-48de-89b6-576e7e1d6968"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Fraudulent:492, non-fraudulent:284315\n","the positive class (frauds) percentage: 492/284807 (0.173%)\n"]}]},{"cell_type":"code","source":["X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n","Y = np.asarray(data.iloc[:, data.columns == 'Class'])\n","\n","# split training set and data set\n","X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, 
random_state=RANDOM_SEED)\n","\n","# build Random Forest model\n","rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n","rf_model.fit(X_train, y_train)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":135},"id":"ZEAhP7NsrXB4","outputId":"91ba4214-ee99-49d4-c4ff-4081fbc5fd99","executionInfo":{"status":"ok","timestamp":1748154330792,"user_tz":-480,"elapsed":310096,"user":{"displayName":"stewart S","userId":"12277001259464691529"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/sklearn/base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n"," return fit_method(estimator, *args, **kwargs)\n"]},{"output_type":"execute_result","data":{"text/plain":["RandomForestClassifier(random_state=42)"],"text/html":["
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":34}]},{"cell_type":"code","source":["from sklearn.ensemble import RandomForestClassifier\n","\n","model = RandomForestClassifier(\n"," n_estimators=100, #50、100、150\n"," max_depth=10, #adjust depth\n"," min_samples_split=10,\n"," class_weight='balanced', # unbalance solved\n"," random_state=42\n",")\n"],"metadata":{"id":"tguRmxrpV4J6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 2. model training\n","model.fit(X_train, y_train)\n","\n","# 3. test the dataset\n","y_pred = model.predict(X_test)\n","# define evaluation function\n","def evaluation(y_true, y_pred, model_name=\"Model\"):\n"," accuracy = accuracy_score(y_true, y_pred)\n"," precision = precision_score(y_true, y_pred)\n"," recall = recall_score(y_true, y_pred)\n"," f1 = f1_score(y_true, y_pred)\n","\n"," print(f'\\n{model_name} Evaluation:')\n"," print('===' * 15)\n"," print(' Accuracy:', accuracy)\n"," print(' Precision Score:', precision)\n"," print(' Recall Score:', recall)\n"," print(' F1 Score:', f1)\n"," print(\"\\nClassification Report:\")\n"," print(classification_report(y_true, y_pred))\n","\n","# predict and print result\n","y_pred = rf_model.predict(X_test)\n","print(classification_report(y_test, y_pred))\n","\n","evaluation(y_test, y_pred, model_name=\"Random Forest\")\n","\n","# train model\n","model.fit(X_train, y_train)\n","\n","# new model prediction\n","y_pred_new = model.predict(X_test)\n","evaluation(y_test, y_pred_new, model_name=\"Tuned Random Forest\")\n","\n"],"metadata":{"id":"IbT1p7p_rdOp","colab":{"base_uri":"https://localhost:8080/"},"outputId":"be8b3cde-d86c-45e5-cdd5-72e9b22286d9","executionInfo":{"status":"ok","timestamp":1748154640323,"user_tz":-480,"elapsed":309536,"user":{"displayName":"stewart S","userId":"12277001259464691529"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/sklearn/base.py:1389: DataConversionWarning: A column-vector y 
was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n"," return fit_method(estimator, *args, **kwargs)\n"]},{"output_type":"stream","name":"stdout","text":[" precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85307\n"," 1 0.94 0.82 0.88 136\n","\n"," accuracy 1.00 85443\n"," macro avg 0.97 0.91 0.94 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n","\n","Random Forest Evaluation:\n","=============================================\n"," Accuracy: 0.9996371850239341\n"," Precision Score: 0.9411764705882353\n"," Recall Score: 0.8235294117647058\n"," F1 Score: 0.8784313725490196\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85307\n"," 1 0.94 0.82 0.88 136\n","\n"," accuracy 1.00 85443\n"," macro avg 0.97 0.91 0.94 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/sklearn/base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n"," return fit_method(estimator, *args, **kwargs)\n"]},{"output_type":"stream","name":"stdout","text":["\n","Tuned Random Forest Evaluation:\n","=============================================\n"," Accuracy: 0.9994850368081645\n"," Precision Score: 0.8333333333333334\n"," Recall Score: 0.8455882352941176\n"," F1 Score: 0.8394160583941606\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85307\n"," 1 0.83 0.85 0.84 136\n","\n"," accuracy 1.00 85443\n"," macro avg 0.92 0.92 0.92 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]}]},{"cell_type":"code","source":["# Extract features and labels\n","X = np.asarray(data.drop(columns=['Class']))\n","y = np.asarray(data['Class'])\n","\n","# Split the dataset into training and testing sets (with stratification)\n","x_train, x_test, y_train, y_test = train_test_split(\n"," X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n",")\n","\n","scaler = StandardScaler()\n","x_train = scaler.fit_transform(x_train)\n","x_test = scaler.transform(x_test)\n","\n","# Select a small sample of normal (non-fraud) data for unsupervised training\n","n_x_train = x_train[y_train == 0]\n","n_x_train = n_x_train[:1000]\n"],"metadata":{"id":"LkSSccV8rm2o"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["scores = []\n","for k in range(2, 5):\n"," kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n"," kmeans.fit(n_x_train)\n"," score = silhouette_score(n_x_train, kmeans.labels_)\n"," scores.append(score)\n","\n","optimal_k = np.argmax(scores) + 2\n","kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n","kmeans.fit(n_x_train)\n","y_pred_test = kmeans.predict(x_test)\n","def align_labels(y_true, y_pred, n_clusters):\n"," labels = np.zeros_like(y_pred)\n"," for i in range(n_clusters):\n"," mask = (y_pred == i)\n"," if np.sum(mask) > 0:\n"," 
labels[mask] = np.bincount(y_true[mask]).argmax()\n"," else:\n"," labels[mask] = 0 # Default to normal class\n"," return labels\n","\n","y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n"],"metadata":{"id":"RNojQgykrr_w"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def evaluation(y_true, y_pred, model_name=\"Model\"):\n"," accuracy = accuracy_score(y_true, y_pred)\n"," precision = precision_score(y_true, y_pred, zero_division=0)\n"," recall = recall_score(y_true, y_pred)\n"," f1 = f1_score(y_true, y_pred)\n","\n"," print(f'\\n{model_name} Evaluation:')\n"," print('===' * 15)\n"," print(' Accuracy:', accuracy)\n"," print(' Precision Score:', precision)\n"," print(' Recall Score:', recall)\n"," print(' F1 Score:', f1)\n"," print(\"\\nClassification Report:\")\n"," print(classification_report(y_true, y_pred))\n","\n","evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n","\n","pca = PCA(n_components=5, random_state=RANDOM_SEED)\n","x_train_pca = pca.fit_transform(x_train)\n","x_test_pca = pca.transform(x_test)\n"],"metadata":{"id":"8HgrvzGMrxUv","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1748154641406,"user_tz":-480,"elapsed":250,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"4c4932ce-21e9-470b-e27c-6bed98483876"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","KMeans (Unsupervised) Evaluation:\n","=============================================\n"," Accuracy: 0.9987242957293166\n"," Precision Score: 0.782608695652174\n"," Recall Score: 0.36486486486486486\n"," F1 Score: 0.4976958525345622\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85295\n"," 1 0.78 0.36 0.50 148\n","\n"," accuracy 1.00 85443\n"," macro avg 0.89 0.68 0.75 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]}]},{"cell_type":"code","source":["from sklearn.decomposition 
import PCA\n","from sklearn.metrics import silhouette_score\n","\n","# PCA\n","pca = PCA(n_components=0.9, random_state=RANDOM_SEED)\n","x_train_pca = pca.fit_transform(x_train)\n","x_test_pca = pca.transform(x_test)\n","\n","# searching K\n","scores = []\n","for k in range(2, 5):\n"," kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n"," kmeans.fit(x_train_pca[y_train == 0][:1000])\n"," score = silhouette_score(x_train_pca[y_train == 0][:1000], kmeans.labels_)\n"," scores.append(score)\n","\n","optimal_k = np.argmax(scores) + 2\n","print(\"Best K value:\", optimal_k)\n","\n","#training\n","kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n","kmeans.fit(x_train_pca[y_train == 0][:1000])\n","y_pred_test = kmeans.predict(x_test_pca)\n","\n","#evaluate again\n","evaluation(y_test, y_pred_aligned, model_name=\"KMeans + PCA\")\n","\n","# sil score output\n","sil_score = silhouette_score(x_test_pca, y_pred_test)\n","print(f\"\\nSilhouette Score on test set: {sil_score:.4f}\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oBRN5QcZH5GJ","executionInfo":{"status":"ok","timestamp":1748154737771,"user_tz":-480,"elapsed":96369,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"19e1aa04-1123-4e5f-c4be-2df9fd0c45ae"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Best K value: 4\n","\n","KMeans + PCA Evaluation:\n","=============================================\n"," Accuracy: 0.9987242957293166\n"," Precision Score: 0.782608695652174\n"," Recall Score: 0.36486486486486486\n"," F1 Score: 0.4976958525345622\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85295\n"," 1 0.78 0.36 0.50 148\n","\n"," accuracy 1.00 85443\n"," macro avg 0.89 0.68 0.75 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n","\n","Silhouette Score on test set: 0.1075\n"]}]},{"cell_type":"code","source":["import numpy 
as np\n","import pandas as pd\n","from sklearn.ensemble import IsolationForest\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve\n","from xgboost import XGBClassifier\n","\n","# 載入資料\n","data = pd.read_csv(f\"{path}/creditcard.csv\")\n","\n","# 特徵與標籤分開\n","X = data.drop(columns=[\"Class\"])\n","y = data[\"Class\"]\n","\n","# 分割訓練與測試集(保持詐欺比例)\n","x_train, x_test, y_train, y_test = train_test_split(\n"," X, y, test_size=0.3, stratify=y, random_state=42\n",")\n","\n","# 資料標準化\n","scaler = StandardScaler()\n","x_train_scaled = scaler.fit_transform(x_train)\n","x_test_scaled = scaler.transform(x_test)\n","\n","# ==============================\n","# 1️⃣ IsolationForest 異常分數作為特徵\n","# ==============================\n","iso = IsolationForest(n_estimators=100, contamination=0.002, random_state=42)\n","iso.fit(x_train_scaled)\n","\n","# 取得 anomaly score(愈小代表愈異常)\n","train_scores = iso.decision_function(x_train_scaled)\n","test_scores = iso.decision_function(x_test_scaled)\n","\n","# 合併 anomaly score 作為新特徵\n","x_train_combined = np.hstack([x_train_scaled, train_scores.reshape(-1, 1)])\n","x_test_combined = np.hstack([x_test_scaled, test_scores.reshape(-1, 1)])\n","\n","# ==============================\n","# 2️⃣ XGBoost 訓練\n","# ==============================\n","xgb = XGBClassifier(\n"," n_estimators=100,\n"," max_depth=5,\n"," learning_rate=0.1,\n"," subsample=0.8,\n"," scale_pos_weight=(len(y_train[y_train == 0]) / len(y_train[y_train == 1])), # 動態調整\n"," random_state=42,\n"," use_label_encoder=False,\n"," eval_metric='logloss'\n",")\n","\n","xgb.fit(x_train_combined, y_train)\n","\n","# ==============================\n","# 3️⃣ 預測 + 門檻調整 + 評估\n","# ==============================\n","\n","# 預測詐欺機率\n","y_prob = xgb.predict_proba(x_test_combined)[:, 1]\n","\n","# 利用 
precision_recall_curve 找最佳門檻使 Precision >= 0.9,且 Recall 最大化\n","precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)\n","\n","best_threshold = 0.5\n","best_recall = 0\n","for p, r, t in zip(precisions, recalls, thresholds):\n"," if p >= 0.9 and r > best_recall:\n"," best_recall = r\n"," best_threshold = t\n","\n","# 使用最佳門檻轉換預測標籤\n","y_pred_adj = (y_prob >= best_threshold).astype(int)\n","\n","def evaluation(y_true, y_pred, model_name=\"Model\"):\n"," accuracy = accuracy_score(y_true, y_pred)\n"," precision = precision_score(y_true, y_pred, zero_division=0)\n"," recall = recall_score(y_true, y_pred)\n"," f1 = f1_score(y_true, y_pred)\n","\n"," print(f'\\n{model_name} Evaluation:')\n"," print('===' * 15)\n"," print(' Accuracy:', round(accuracy, 4))\n"," print(' Precision Score:', round(precision, 4))\n"," print(' Recall Score:', round(recall, 4))\n"," print(' F1 Score:', round(f1, 4))\n"," print(\"\\nClassification Report:\")\n"," print(classification_report(y_true, y_pred))\n","\n","evaluation(y_test, y_pred_adj, model_name=\"Threshold Adjusted Hybrid Model\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VetnJv0dGQ8R","executionInfo":{"status":"ok","timestamp":1748157347111,"user_tz":-480,"elapsed":12858,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"67c5ee72-13e9-440c-d85e-7eadb01ea83e"},"execution_count":48,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [07:15:44] WARNING: /workspace/src/learner.cc:740: \n","Parameters: { \"use_label_encoder\" } are not used.\n","\n"," warnings.warn(smsg, UserWarning)\n"]},{"output_type":"stream","name":"stdout","text":["\n","Threshold Adjusted Hybrid Model Evaluation:\n","=============================================\n"," Accuracy: 0.9994\n"," Precision Score: 0.9024\n"," Recall Score: 0.75\n"," F1 Score: 0.8192\n","\n","Classification Report:\n"," precision recall 
f1-score support\n","\n"," 0 1.00 1.00 1.00 85295\n"," 1 0.90 0.75 0.82 148\n","\n"," accuracy 1.00 85443\n"," macro avg 0.95 0.87 0.91 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]}]}]} \ No newline at end of file diff --git a/ACS111136_ex1/ex1.md b/ACS111136_ex1/ex1.md new file mode 100644 index 0000000..eeaab53 --- /dev/null +++ b/ACS111136_ex1/ex1.md @@ -0,0 +1,103 @@ +挑戰1. +改善random forest模型的輸出數據結果: + +1.Random Forest 調參數 (Tuned Random Forest) +使用 Random Forest 分類器來偵測信用卡詐欺。 + +2.調整模型參數以改善分類效果,主要參數如下: + +n_estimators=100:森林中樹的數量,提升模型穩定性。 + +max_depth=10:限制樹的最大深度,避免過度擬合。 + +min_samples_split=10:節點分裂所需的最少樣本數,控制模型複雜度。 + +class_weight='balanced':解決資料類別不平衡問題,對少數類別(詐欺)加權。 + + +使用分層抽樣將資料切分為訓練集和測試集,確保正負樣本比例一致。 + +評估指標包括 Accuracy、Precision、Recall 與 F1-score。 + +調參後模型相較於預設參數模型,在召回率(Recall)及整體 F1-score 有明顯提升,更能有效辨識詐欺樣本。 + +3.結果 +Tuned Random Forest Evaluation: +============================================= + Accuracy: 0.9994850368081645 + Precision Score: 0.8333333333333334 + Recall Score: 0.8455882352941176 + F1 Score: 0.8394160583941606 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85307 + 1 0.83 0.85 0.84 136 + + accuracy 1.00 85443 + macro avg 0.92 0.92 0.92 85443 +weighted avg 1.00 1.00 1.00 85443 + +4.比較: +Random Forest Evaluation: +============================================= + Accuracy: 0.9996371850239341 + Precision Score: 0.9411764705882353 + Recall Score: 0.8235294117647058 + F1 Score: 0.8784313725490196 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85307 + 1 0.94 0.82 0.88 136 + + accuracy 1.00 85443 + macro avg 0.97 0.91 0.94 85443 +weighted avg 1.00 1.00 1.00 85443 + + +挑戰1. 
+改善kmeans模型的輸出數據結果: + +1.KMeans + PCA (監督式學習捨棄) +使用 KMeans 無監督分群方法來嘗試偵測詐欺交易。 + +由於資料維度高,先透過 PCA(主成分分析) 進行降維,保留 90% 以上的資訊,降低運算成本與噪音影響。 + +只用正常交易(非詐欺)子集作為 KMeans 的訓練資料,因無標籤資料且異常樣本少,訓練時不考慮詐欺樣本。 + +2.透過 Silhouette Score 找出最佳的叢集數 (k 值),範圍在 2 到 4 之間。 + +叢集結果與真實標籤對齊,推估各叢集所代表的類別。 + +最後在測試集上評估,計算 Accuracy、Precision、Recall、F1-score,並輸出 Silhouette Score 以評估叢集品質。 + +3.分析: +優點是不用依賴標註資料,缺點是偵測性能較監督式模型低,且易受參數設定影響。對於原先的結果來說,PCA的recall score、f1 score過低 + +Best K value: 4 + +KMeans + PCA Evaluation: +============================================= + Accuracy: 0.9987242957293166 + Precision Score: 0.782608695652174 + Recall Score: 0.36486486486486486 + F1 Score: 0.4976958525345622 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.78 0.36 0.50 148 + + accuracy 1.00 85443 + macro avg 0.89 0.68 0.75 85443 +weighted avg 1.00 1.00 1.00 85443 + + +Silhouette Score on test set: 0.1075 + +4.考慮: +在選擇資料前的結果過於理想化了,導致在調整參數者的過程中我認為脫離了非監督式學習的範疇,故改用PCA取樣 \ No newline at end of file diff --git a/ACS111136_ex2/ex2.ipynb b/ACS111136_ex2/ex2.ipynb new file mode 100644 index 0000000..2fd7b00 --- /dev/null +++ b/ACS111136_ex2/ex2.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOOH1/HuG9u0bTVs71ffNHL"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":[],"metadata":{"id":"w5iHKqTXmgVn"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import classification_report,precision_score, recall_score, f1_score\n","from sklearn.cluster import KMeans\n","from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n","import 
kagglehub\n","from imblearn.over_sampling import SMOTE\n","from sklearn.decomposition import PCA\n","\n","\n","# general setting. do not change TEST_SIZE\n","RANDOM_SEED = 42\n","TEST_SIZE = 0.3\n"],"metadata":{"id":"9GaIHp5rqDEY"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import kagglehub\n","path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n","data = pd.read_csv(f\"{path}/creditcard.csv\")"],"metadata":{"id":"3mvCBbqgC_B9"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":[],"metadata":{"id":"njccW7KIGqEN"}},{"cell_type":"code","source":["# load dataset(from kagglehub)\n","path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n","data = pd.read_csv(f\"{path}/creditcard.csv\")\n","data['Class'] = data['Class'].astype(int)\n","\n","# prepare data\n","data = data.drop(['Time'], axis=1)\n","data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n"],"metadata":{"id":"SxyaGUXIrPtp"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["fraud = data[data['Class'] == 1]\n","nonfraud = data[data['Class'] == 0]\n","print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n","print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Xq6-1WLkrR6A","executionInfo":{"status":"ok","timestamp":1748154020701,"user_tz":-480,"elapsed":70,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"633c2348-eec8-48de-89b6-576e7e1d6968"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Fraudulent:492, non-fraudulent:284315\n","the positive class (frauds) percentage: 492/284807 (0.173%)\n"]}]},{"cell_type":"code","source":["X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n","Y = np.asarray(data.iloc[:, data.columns == 'Class'])\n","\n","# split 
training set and data set\n","X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n","\n","# build Random Forest model\n","rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n","rf_model.fit(X_train, y_train)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":135},"id":"ZEAhP7NsrXB4","outputId":"91ba4214-ee99-49d4-c4ff-4081fbc5fd99","executionInfo":{"status":"ok","timestamp":1748154330792,"user_tz":-480,"elapsed":310096,"user":{"displayName":"stewart S","userId":"12277001259464691529"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/sklearn/base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n"," return fit_method(estimator, *args, **kwargs)\n"]},{"output_type":"execute_result","data":{"text/plain":["RandomForestClassifier(random_state=42)"],"text/html":["
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"]},"metadata":{},"execution_count":34}]},{"cell_type":"code","source":["from sklearn.ensemble import RandomForestClassifier\n","\n","model = RandomForestClassifier(\n"," n_estimators=100, #50、100、150\n"," max_depth=10, #adjust depth\n"," min_samples_split=10,\n"," class_weight='balanced', # unbalance solved\n"," random_state=42\n",")\n"],"metadata":{"id":"tguRmxrpV4J6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# 2. model training\n","model.fit(X_train, y_train)\n","\n","# 3. test the dataset\n","y_pred = model.predict(X_test)\n","# define evaluation function\n","def evaluation(y_true, y_pred, model_name=\"Model\"):\n"," accuracy = accuracy_score(y_true, y_pred)\n"," precision = precision_score(y_true, y_pred)\n"," recall = recall_score(y_true, y_pred)\n"," f1 = f1_score(y_true, y_pred)\n","\n"," print(f'\\n{model_name} Evaluation:')\n"," print('===' * 15)\n"," print(' Accuracy:', accuracy)\n"," print(' Precision Score:', precision)\n"," print(' Recall Score:', recall)\n"," print(' F1 Score:', f1)\n"," print(\"\\nClassification Report:\")\n"," print(classification_report(y_true, y_pred))\n","\n","# predict and print result\n","y_pred = rf_model.predict(X_test)\n","print(classification_report(y_test, y_pred))\n","\n","evaluation(y_test, y_pred, model_name=\"Random Forest\")\n","\n","# train model\n","model.fit(X_train, y_train)\n","\n","# new model prediction\n","y_pred_new = model.predict(X_test)\n","evaluation(y_test, y_pred_new, model_name=\"Tuned Random Forest\")\n","\n"],"metadata":{"id":"IbT1p7p_rdOp","colab":{"base_uri":"https://localhost:8080/"},"outputId":"be8b3cde-d86c-45e5-cdd5-72e9b22286d9","executionInfo":{"status":"ok","timestamp":1748154640323,"user_tz":-480,"elapsed":309536,"user":{"displayName":"stewart S","userId":"12277001259464691529"}}},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/sklearn/base.py:1389: DataConversionWarning: A column-vector y 
was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n"," return fit_method(estimator, *args, **kwargs)\n"]},{"output_type":"stream","name":"stdout","text":[" precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85307\n"," 1 0.94 0.82 0.88 136\n","\n"," accuracy 1.00 85443\n"," macro avg 0.97 0.91 0.94 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n","\n","Random Forest Evaluation:\n","=============================================\n"," Accuracy: 0.9996371850239341\n"," Precision Score: 0.9411764705882353\n"," Recall Score: 0.8235294117647058\n"," F1 Score: 0.8784313725490196\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85307\n"," 1 0.94 0.82 0.88 136\n","\n"," accuracy 1.00 85443\n"," macro avg 0.97 0.91 0.94 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/sklearn/base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n"," return fit_method(estimator, *args, **kwargs)\n"]},{"output_type":"stream","name":"stdout","text":["\n","Tuned Random Forest Evaluation:\n","=============================================\n"," Accuracy: 0.9994850368081645\n"," Precision Score: 0.8333333333333334\n"," Recall Score: 0.8455882352941176\n"," F1 Score: 0.8394160583941606\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85307\n"," 1 0.83 0.85 0.84 136\n","\n"," accuracy 1.00 85443\n"," macro avg 0.92 0.92 0.92 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]}]},{"cell_type":"code","source":["# Extract features and labels\n","X = np.asarray(data.drop(columns=['Class']))\n","y = np.asarray(data['Class'])\n","\n","# Split the dataset into training and testing sets (with stratification)\n","x_train, x_test, y_train, y_test = train_test_split(\n"," X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n",")\n","\n","scaler = StandardScaler()\n","x_train = scaler.fit_transform(x_train)\n","x_test = scaler.transform(x_test)\n","\n","# Select a small sample of normal (non-fraud) data for unsupervised training\n","n_x_train = x_train[y_train == 0]\n","n_x_train = n_x_train[:1000]\n"],"metadata":{"id":"LkSSccV8rm2o"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["scores = []\n","for k in range(2, 5):\n"," kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n"," kmeans.fit(n_x_train)\n"," score = silhouette_score(n_x_train, kmeans.labels_)\n"," scores.append(score)\n","\n","optimal_k = np.argmax(scores) + 2\n","kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n","kmeans.fit(n_x_train)\n","y_pred_test = kmeans.predict(x_test)\n","def align_labels(y_true, y_pred, n_clusters):\n"," labels = np.zeros_like(y_pred)\n"," for i in range(n_clusters):\n"," mask = (y_pred == i)\n"," if np.sum(mask) > 0:\n"," 
labels[mask] = np.bincount(y_true[mask]).argmax()\n"," else:\n"," labels[mask] = 0 # Default to normal class\n"," return labels\n","\n","y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n"],"metadata":{"id":"RNojQgykrr_w"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def evaluation(y_true, y_pred, model_name=\"Model\"):\n"," accuracy = accuracy_score(y_true, y_pred)\n"," precision = precision_score(y_true, y_pred, zero_division=0)\n"," recall = recall_score(y_true, y_pred)\n"," f1 = f1_score(y_true, y_pred)\n","\n"," print(f'\\n{model_name} Evaluation:')\n"," print('===' * 15)\n"," print(' Accuracy:', accuracy)\n"," print(' Precision Score:', precision)\n"," print(' Recall Score:', recall)\n"," print(' F1 Score:', f1)\n"," print(\"\\nClassification Report:\")\n"," print(classification_report(y_true, y_pred))\n","\n","evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n","\n","pca = PCA(n_components=5, random_state=RANDOM_SEED)\n","x_train_pca = pca.fit_transform(x_train)\n","x_test_pca = pca.transform(x_test)\n"],"metadata":{"id":"8HgrvzGMrxUv","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1748154641406,"user_tz":-480,"elapsed":250,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"4c4932ce-21e9-470b-e27c-6bed98483876"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\n","KMeans (Unsupervised) Evaluation:\n","=============================================\n"," Accuracy: 0.9987242957293166\n"," Precision Score: 0.782608695652174\n"," Recall Score: 0.36486486486486486\n"," F1 Score: 0.4976958525345622\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85295\n"," 1 0.78 0.36 0.50 148\n","\n"," accuracy 1.00 85443\n"," macro avg 0.89 0.68 0.75 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]}]},{"cell_type":"code","source":["from sklearn.decomposition 
import PCA\n","from sklearn.metrics import silhouette_score\n","\n","# PCA\n","pca = PCA(n_components=0.9, random_state=RANDOM_SEED)\n","x_train_pca = pca.fit_transform(x_train)\n","x_test_pca = pca.transform(x_test)\n","\n","# searching K\n","scores = []\n","for k in range(2, 5):\n"," kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n"," kmeans.fit(x_train_pca[y_train == 0][:1000])\n"," score = silhouette_score(x_train_pca[y_train == 0][:1000], kmeans.labels_)\n"," scores.append(score)\n","\n","optimal_k = np.argmax(scores) + 2\n","print(\"Best K value:\", optimal_k)\n","\n","#training\n","kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n","kmeans.fit(x_train_pca[y_train == 0][:1000])\n","y_pred_test = kmeans.predict(x_test_pca)\n","\n","# evaluate again: align THIS model's cluster ids to class labels first.\n","# (Bug fix: previously re-used y_pred_aligned from the non-PCA run, so the\n","# reported \"KMeans + PCA\" metrics were identical to the earlier cell.)\n","y_pred_aligned_pca = align_labels(y_test, y_pred_test, optimal_k)\n","evaluation(y_test, y_pred_aligned_pca, model_name=\"KMeans + PCA\")\n","\n","# sil score output\n","sil_score = silhouette_score(x_test_pca, y_pred_test)\n","print(f\"\\nSilhouette Score on test set: {sil_score:.4f}\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oBRN5QcZH5GJ","executionInfo":{"status":"ok","timestamp":1748154737771,"user_tz":-480,"elapsed":96369,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"19e1aa04-1123-4e5f-c4be-2df9fd0c45ae"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Best K value: 4\n","\n","KMeans + PCA Evaluation:\n","=============================================\n"," Accuracy: 0.9987242957293166\n"," Precision Score: 0.782608695652174\n"," Recall Score: 0.36486486486486486\n"," F1 Score: 0.4976958525345622\n","\n","Classification Report:\n"," precision recall f1-score support\n","\n"," 0 1.00 1.00 1.00 85295\n"," 1 0.78 0.36 0.50 148\n","\n"," accuracy 1.00 85443\n"," macro avg 0.89 0.68 0.75 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n","\n","Silhouette Score on test set: 0.1075\n"]}]},{"cell_type":"code","source":["import numpy 
as np\n","import pandas as pd\n","from sklearn.ensemble import IsolationForest\n","from sklearn.model_selection import train_test_split\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve\n","from xgboost import XGBClassifier\n","\n","# 載入資料\n","data = pd.read_csv(f\"{path}/creditcard.csv\")\n","\n","# 特徵與標籤分開\n","X = data.drop(columns=[\"Class\"])\n","y = data[\"Class\"]\n","\n","# 分割訓練與測試集(保持詐欺比例)\n","x_train, x_test, y_train, y_test = train_test_split(\n"," X, y, test_size=0.3, stratify=y, random_state=42\n",")\n","\n","# 資料標準化\n","scaler = StandardScaler()\n","x_train_scaled = scaler.fit_transform(x_train)\n","x_test_scaled = scaler.transform(x_test)\n","\n","# ==============================\n","# 1️⃣ IsolationForest 異常分數作為特徵\n","# ==============================\n","iso = IsolationForest(n_estimators=100, contamination=0.002, random_state=42)\n","iso.fit(x_train_scaled)\n","\n","# 取得 anomaly score(愈小代表愈異常)\n","train_scores = iso.decision_function(x_train_scaled)\n","test_scores = iso.decision_function(x_test_scaled)\n","\n","# 合併 anomaly score 作為新特徵\n","x_train_combined = np.hstack([x_train_scaled, train_scores.reshape(-1, 1)])\n","x_test_combined = np.hstack([x_test_scaled, test_scores.reshape(-1, 1)])\n","\n","# ==============================\n","# 2️⃣ XGBoost 訓練\n","# ==============================\n","# NOTE: use_label_encoder was removed — the parameter is no longer used by\n","# XGBClassifier and only triggered a UserWarning (visible in the old output).\n","xgb = XGBClassifier(\n"," n_estimators=100,\n"," max_depth=5,\n"," learning_rate=0.1,\n"," subsample=0.8,\n"," scale_pos_weight=(len(y_train[y_train == 0]) / len(y_train[y_train == 1])), # 動態調整\n"," random_state=42,\n"," eval_metric='logloss'\n",")\n","\n","xgb.fit(x_train_combined, y_train)\n","\n","# ==============================\n","# 3️⃣ 預測 + 門檻調整 + 評估\n","# ==============================\n","\n","# 預測詐欺機率\n","y_prob = xgb.predict_proba(x_test_combined)[:, 1]\n","\n","# 利用 
precision_recall_curve 找最佳門檻使 Precision >= 0.9,且 Recall 最大化\n","precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)\n","\n","best_threshold = 0.5\n","best_recall = 0\n","for p, r, t in zip(precisions, recalls, thresholds):\n"," if p >= 0.9 and r > best_recall:\n"," best_recall = r\n"," best_threshold = t\n","\n","# 使用最佳門檻轉換預測標籤\n","y_pred_adj = (y_prob >= best_threshold).astype(int)\n","\n","def evaluation(y_true, y_pred, model_name=\"Model\"):\n"," accuracy = accuracy_score(y_true, y_pred)\n"," precision = precision_score(y_true, y_pred, zero_division=0)\n"," recall = recall_score(y_true, y_pred)\n"," f1 = f1_score(y_true, y_pred)\n","\n"," print(f'\\n{model_name} Evaluation:')\n"," print('===' * 15)\n"," print(' Accuracy:', round(accuracy, 4))\n"," print(' Precision Score:', round(precision, 4))\n"," print(' Recall Score:', round(recall, 4))\n"," print(' F1 Score:', round(f1, 4))\n"," print(\"\\nClassification Report:\")\n"," print(classification_report(y_true, y_pred))\n","\n","evaluation(y_test, y_pred_adj, model_name=\"Threshold Adjusted Hybrid Model\")\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VetnJv0dGQ8R","executionInfo":{"status":"ok","timestamp":1748157347111,"user_tz":-480,"elapsed":12858,"user":{"displayName":"stewart S","userId":"12277001259464691529"}},"outputId":"67c5ee72-13e9-440c-d85e-7eadb01ea83e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [07:15:44] WARNING: /workspace/src/learner.cc:740: \n","Parameters: { \"use_label_encoder\" } are not used.\n","\n"," warnings.warn(smsg, UserWarning)\n"]},{"output_type":"stream","name":"stdout","text":["\n","Threshold Adjusted Hybrid Model Evaluation:\n","=============================================\n"," Accuracy: 0.9994\n"," Precision Score: 0.9024\n"," Recall Score: 0.75\n"," F1 Score: 0.8192\n","\n","Classification Report:\n"," precision recall 
f1-score support\n","\n"," 0 1.00 1.00 1.00 85295\n"," 1 0.90 0.75 0.82 148\n","\n"," accuracy 1.00 85443\n"," macro avg 0.95 0.87 0.91 85443\n","weighted avg 1.00 1.00 1.00 85443\n","\n"]}]}]} \ No newline at end of file diff --git a/ACS111136_ex2/ex2.md b/ACS111136_ex2/ex2.md new file mode 100644 index 0000000..56af8d6 --- /dev/null +++ b/ACS111136_ex2/ex2.md @@ -0,0 +1,68 @@ +Hybrid Model +1.目的: +混合模型結合監督式學習(如 XGBoost)與非監督式學習(如 Isolation Forest),利用兩者優勢提升在信用卡詐欺偵測中的準確度與召回率,特別適用於處理不平衡資料集與未知異常樣本。 +2.流程: + +非監督異常檢測:Isolation Forest +使用 IsolationForest 模型對標準化後的交易資料進行訓練。 + +模型會為每筆樣本產生 anomaly score(異常分數),分數愈低代表該樣本愈異常。 + +此步驟不使用資料標籤,僅依據資料分佈學習異常行為。 + +目的:從資料中額外萃取「異常程度」作為新特徵。 + +監督式分類器:XGBoost +使用 XGBClassifier 對新特徵集進行訓練。 + +主要參數設定如下: + +n_estimators=100, max_depth=5, learning_rate=0.1:常見的控制樹模型深度與學習速率。 + +scale_pos_weight=(負類樣本數 / 正類樣本數):動態調整權重以應對類別不平衡問題。 + +模型會預測每筆交易為詐欺的機率(probability) + +門檻調整 +使用 precision_recall_curve 評估不同門檻下的 Precision 與 Recall。 + +採用策略: + +在精確率(Precision)≥ 0.90 的前提下,選擇召回率(Recall)最大的門檻值。 + +目的:降低誤報(false positives)並盡可能捕捉更多詐欺樣本。 + +3.適用範圍 +適用於異常樣本稀少、異常行為模式多變的金融詐欺偵測任務 + +評估指標 +Accuracy:整體分類正確率。 + +Precision:詐欺預測中實際為詐欺的比例(低誤報率)。 + +Recall:成功預測出詐欺的比例(高召回能力)。 + +F1-score:Precision 與 Recall 的調和平均數(harmonic mean)。 + +Threshold Adjusted Hybrid Model Evaluation: +============================================= + Accuracy: 0.9994 + Precision Score: 0.9024 + Recall Score: 0.75 + F1 Score: 0.8192 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.90 0.75 0.82 148 + + accuracy 1.00 85443 + macro avg 0.95 0.87 0.91 85443 +weighted avg 1.00 1.00 1.00 85443 + +4.數據為甚麼recall score 會低於預期? +Recall 是針對正類(詐欺交易),XGBoost 被過度限制以追求高 Precision。 +詐欺樣本數量太少,模型學不到足夠樣式 +Isolation Forest 的分數幫助有限 +推測 Recall 偏低可能受上述因素影響,需進一步實驗驗證。 \ No newline at end of file