import numpy as np   # numerical computing
import pandas as pd  # tabular data handling

# Machine-learning utilities
from sklearn.model_selection import train_test_split  # dataset splitting
from sklearn.preprocessing import StandardScaler      # feature standardization
from sklearn.metrics import classification_report     # per-class report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # scalar metrics
from sklearn.metrics import precision_recall_curve    # PR curve

import kagglehub  # Kaggle dataset download

# Machine-learning models
from sklearn.ensemble import RandomForestClassifier  # supervised model
from sklearn.cluster import KMeans                   # unsupervised model
from sklearn.metrics import silhouette_score         # clustering-quality metric

# ======================
# Global settings
# ======================
RANDOM_SEED = 42  # fixed seed so results are reproducible
TEST_SIZE = 0.3   # held-out fraction (must not change)

# ======================
# Evaluation helper
# ======================
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy/precision/recall/F1 and the full classification report."""
    scores = [
        (' Accuracy:', accuracy_score(y_true, y_pred)),
        # zero_division=0 avoids a warning/NaN when no positives are predicted
        (' Precision Score:', precision_score(y_true, y_pred, zero_division=0)),
        (' Recall Score:', recall_score(y_true, y_pred)),
        (' F1 Score:', f1_score(y_true, y_pred)),
    ]
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    for label, value in scores:
        print(label, value)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
# ======================
# Data preparation
# ======================
# Download the credit-card fraud dataset and load the CSV
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)  # target as int (0 = normal, 1 = fraud)

# Preprocessing: drop the raw timestamp, standardize the transaction amount
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Class-balance overview
fraud = data[data['Class'] == 1]     # fraudulent transactions
nonfraud = data[data['Class'] == 0]  # normal transactions
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

# Feature matrix and target vector
X = data.drop(columns=['Class']).values
Y = data['Class'].values

# ============================================
# Supervised section (Random Forest)
# ============================================
# Train/test split (no stratification in this section)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

rf_model = RandomForestClassifier(
    n_estimators=100,                    # number of trees
    max_depth=20,                        # cap tree depth to limit overfitting
    class_weight='balanced_subsample',   # re-weight classes per bootstrap sample
    random_state=RANDOM_SEED,
)
rf_model.fit(x_train, y_train)

# Threshold tuning via the precision-recall curve
y_prob = rf_model.predict_proba(x_test)[:, 1]  # P(fraud) per test row
prec, rec, thresh = precision_recall_curve(y_test, y_prob)
# BUG FIX: precision_recall_curve returns len(thresh)+1 precision/recall values;
# the final (precision=1, recall=0) point has no threshold, so taking argmax over
# the full arrays could index one past the end of `thresh`. Align lengths first.
f1 = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-6)  # epsilon avoids 0/0
best_idx = np.argmax(f1)          # index of best F1
best_threshold = thresh[best_idx]  # corresponding probability threshold
y_pred = (y_prob >= best_threshold).astype(int)

evaluation(y_test, y_pred, model_name="RandomForestClassifier(Supervised)")

# ============================================
# Unsupervised section (K-Means)
# ============================================
# Re-split with stratification so the rare fraud class keeps its ratio
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# Feature standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)  # fit on train only
x_test = scaler.transform(x_test)        # reuse train statistics

# Train only on normal transactions (first 1000) to mimic real deployment,
# where fraud labels are unavailable at training time
normal_train = x_train[y_train == 0][:1000]

# Pick the cluster count k in [2, 4] with the highest silhouette score
silhouette_by_k = []
for k in range(2, 5):
    candidate = KMeans(
        n_clusters=k,
        init='k-means++',          # smart centroid initialization
        random_state=RANDOM_SEED,
    )
    candidate.fit(normal_train)
    silhouette_by_k.append(silhouette_score(normal_train, candidate.labels_))

optimal_k = np.argmax(silhouette_by_k) + 2  # list index 0 corresponds to k=2

# Refit with the chosen k and predict cluster ids for the test set
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(normal_train)

y_pred_test = kmeans.predict(x_test)

def align_labels(y_true, y_pred, n_clusters):
    """Map every cluster id to the majority true class among its members."""
    aligned = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        members = (y_pred == cluster_id)
        if np.sum(members) > 0:  # empty clusters keep the default label 0
            aligned[members] = np.bincount(y_true[members]).argmax()
    return aligned

# Align cluster ids with true labels, then evaluate
y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)

evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
import kagglehub

# General settings
RANDOM_SEED = 42  # fixed seed for reproducibility
TEST_SIZE = 0.3   # held-out fraction

# Evaluation helper
def evaluation(y_true, y_pred, model_name="Model"):
    """Print accuracy/precision/recall/F1 and the full classification report."""
    scores = [
        (' Accuracy:', accuracy_score(y_true, y_pred)),
        # zero_division=0 avoids a warning/NaN when no positives are predicted
        (' Precision Score:', precision_score(y_true, y_pred, zero_division=0)),
        (' Recall Score:', recall_score(y_true, y_pred)),
        (' F1 Score:', f1_score(y_true, y_pred)),
    ]
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    for label, value in scores:
        print(label, value)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# Download and load the credit-card fraud dataset
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int)

# Preprocessing: drop the raw timestamp, standardize the transaction amount
data = data.drop(['Time'], axis=1)
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

X = data.drop(columns=['Class']).values
Y = data['Class'].values

# Stratified split keeps the fraud ratio identical in train and test
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# Standardize features (fit on train only)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)  # reuse train statistics

# ---- Unsupervised stage: IsolationForest anomaly flags ----
iso_model = IsolationForest(
    n_estimators=100,
    contamination=0.01,        # expected anomaly fraction -> decision threshold
    random_state=RANDOM_SEED,
)
iso_model.fit(x_train)

# predict() yields -1 for anomalies, 1 for normal points
iso_train_scores = iso_model.predict(x_train)
iso_test_scores = iso_model.predict(x_test)

# Append the anomaly flag as one extra feature column
x_train_enhanced = np.concatenate([x_train, iso_train_scores.reshape(-1, 1)], axis=1)
x_test_enhanced = np.concatenate([x_test, iso_test_scores.reshape(-1, 1)], axis=1)

# ---- Supervised stage: XGBoost on the enhanced features ----
xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    # up-weight the rare positive class; max(..., 1) guards against division by zero
    scale_pos_weight=(len(y_train[y_train == 0]) / max(len(y_train[y_train == 1]), 1)),
    # BUG FIX: dropped `use_label_encoder=False` — the parameter was deprecated and
    # then removed in xgboost >= 2.0, where passing it triggers warnings/errors.
    eval_metric='logloss',
    random_state=RANDOM_SEED,
)
xgb_model.fit(x_train_enhanced, y_train)

# Predicted fraud probabilities on the test set
y_prob = xgb_model.predict_proba(x_test_enhanced)[:, 1]

# Threshold tuning via the precision-recall curve.
# BUG FIX: precision_recall_curve returns len(thresh)+1 precision/recall values;
# trim the final (precision=1, recall=0) pair so argmax cannot index past `thresh`.
prec, rec, thresh = precision_recall_curve(y_test, y_prob)
f1 = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-6)  # epsilon avoids 0/0
best_idx = np.argmax(f1)
best_threshold = thresh[best_idx]

# Floor the threshold at 0.6 to trade recall for higher precision
manual_threshold = max(best_threshold, 0.6)
y_pred = (y_prob >= manual_threshold).astype(int)

evaluation(y_test, y_pred, model_name="XGBoostClassifier (Supervised after IsolationForest)")