Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions ex1/ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
1. 監督式學習(隨機森林)
設定 n_estimators=100(森林中使用 100 棵決策樹)

參數:
max_depth=20:限制樹深度防止過擬合
class_weight='balanced_subsample':自動調整權重

並且透過PR曲線尋找最佳 threshold,來最大化F1分數

2. 非監督式學習(K-Means)
僅使用正常交易數據訓練模型(模擬真實場景)

自動尋找最佳的 cluster count(2-4範圍)
使用 silhouette score 評估 clustering 效果

標籤對齊:將 clusters 結果映射到真實的標籤,並且基於多數決原則分配類別
155 changes: 155 additions & 0 deletions ex1/ex1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import numpy as np # 數值計算庫
import pandas as pd # 數據處理庫

# 機器學習工具
from sklearn.model_selection import train_test_split # 數據分割
from sklearn.preprocessing import StandardScaler # 特徵標準化
from sklearn.metrics import classification_report # 分類報告
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 評估指標
from sklearn.metrics import precision_recall_curve # PR曲線

import kagglehub # Kaggle數據集下載

# 機器學習模型
from sklearn.ensemble import RandomForestClassifier # 監督式學習模型
from sklearn.cluster import KMeans # 非監督式學習模型
from sklearn.metrics import silhouette_score # 聚類效果評估

# ======================
# Global settings
# ======================
RANDOM_SEED = 42 # random seed for reproducible results
TEST_SIZE = 0.3 # test-set fraction (must not be changed)

# ======================
# Evaluation helper
# ======================
def evaluation(y_true, y_pred, model_name="Model"):
    """Print standard binary-classification metrics for one model.

    Args:
        y_true: ground-truth labels (0 = normal, 1 = fraud).
        y_pred: hard predicted labels.
        model_name: heading used in the printed report.
    """
    # zero_division=0 keeps every metric defined (and warning-free) when a
    # class is absent from the predictions. The original guarded only
    # precision, so recall/F1 could raise UndefinedMetricWarning; the
    # returned values (0.0) are unchanged.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Emit the summary followed by sklearn's detailed per-class report
    print(f'\n{model_name} Evaluation:')
    print('===' * 15)
    print(' Accuracy:', accuracy)
    print(' Precision Score:', precision)
    print(' Recall Score:', recall)
    print(' F1 Score:', f1)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# ======================
# Data preparation
# ======================
# Download the credit-card fraud dataset from Kaggle
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
# Load the CSV file
data = pd.read_csv(f"{path}/creditcard.csv")
# Ensure the target column is integer typed
data['Class'] = data['Class'].astype(int)

# Preprocessing
data = data.drop(['Time'], axis=1) # drop the Time feature
# Standardize the transaction-amount feature
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

# Inspect the class distribution
fraud = data[data['Class'] == 1] # fraudulent transactions
nonfraud = data[data['Class'] == 0] # normal transactions
print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')
# Report the fraud percentage
print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')

# Features and labels
X = data.drop(columns=['Class']).values # feature matrix
Y = data['Class'].values # target vector

# ============================================
# Supervised learning section (Random Forest)
# ============================================
# Train/test split (NOTE: deliberately without stratified sampling here,
# unlike the K-Means section below)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# Random-forest classifier
rf_model = RandomForestClassifier(
    n_estimators=100, # number of decision trees
    max_depth=20, # cap tree depth to limit overfitting
    class_weight='balanced_subsample', # rebalance class weights per bootstrap sample (imbalanced data)
    random_state=RANDOM_SEED # reproducibility
)
# Fit the model
rf_model.fit(x_train, y_train)

# Prediction and evaluation
y_prob = rf_model.predict_proba(x_test)[:, 1] # predicted fraud probability
# Precision-recall curve over candidate thresholds
prec, rec, thresh = precision_recall_curve(y_test, y_prob)
# F1 per threshold (small epsilon avoids division by zero)
f1 = 2 * (prec * rec) / (prec + rec + 1e-6)
best_idx = np.argmax(f1) # index of the best F1
best_threshold = thresh[best_idx] # probability cutoff that maximizes F1
y_pred = (y_prob >= best_threshold).astype(int) # apply the tuned threshold

# Evaluate the supervised model
evaluation(y_test, y_pred, model_name="RandomForestClassifier(Supervised)")

# ============================================
# Unsupervised learning section (K-Means)
# ============================================
# Re-split the dataset, this time stratified to preserve the class ratio
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# Feature standardization
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train) # fit on the training set, then transform it
x_test = scaler.transform(x_test) # reuse the training statistics on the test set

# Train only on normal transactions (simulating having no fraud labels)
n_x_train = x_train[y_train == 0] # keep normal rows only
n_x_train = n_x_train[:1000] # subsample 1000 rows

# Search for the best cluster count k in the range [2, 4]
scores = []
for k in range(2, 5):
    # K-Means model for this candidate k
    kmeans = KMeans(
        n_clusters=k, # number of clusters
        init='k-means++', # smarter centroid seeding
        random_state=RANDOM_SEED # reproducibility
    )
    kmeans.fit(n_x_train) # fit on the normal subsample
    # Silhouette score rates cluster cohesion/separation
    score = silhouette_score(n_x_train, kmeans.labels_)
    scores.append(score)

# Choose the k with the highest silhouette score
optimal_k = np.argmax(scores) + 2 # list index -> actual k value
# Refit with the chosen k
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)
kmeans.fit(n_x_train) # retrain

# Cluster assignments for the test set
y_pred_test = kmeans.predict(x_test)

# Map raw cluster IDs onto the true class labels
def align_labels(y_true, y_pred, n_clusters):
    """Assign each cluster the majority ground-truth class of its members.

    Args:
        y_true: 1-D array of ground-truth labels (non-negative ints).
        y_pred: 1-D array of cluster IDs (e.g. from KMeans.predict).
        n_clusters: number of cluster IDs to consider.

    Returns:
        Array shaped like y_pred holding the aligned class labels.
        Ties go to the lower class index (np.bincount(...).argmax());
        samples of empty clusters keep the initialized default 0.
    """
    labels = np.zeros_like(y_pred)
    for cluster_id in range(n_clusters):
        mask = (y_pred == cluster_id)
        # Guard against empty clusters: np.bincount on an empty slice would
        # fail, and the zeros initialization already encodes the default 0.
        # (The original's `else: labels[mask] = 0` was a no-op on an
        # all-False mask and has been dropped.)
        if np.sum(mask) > 0:
            labels[mask] = np.bincount(y_true[mask]).argmax()
    return labels

# Align cluster IDs with the true labels (majority vote per cluster)
y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)

# Evaluate the unsupervised model
evaluation(y_test, y_pred_aligned, model_name="KMeans (Unsupervised)")
Binary file added ex1/ex1_監督式與非監督式.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
19 changes: 19 additions & 0 deletions ex2/ex2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# 非監督式學習階段:Isolation Forest
核心概念:Anomalous data points 比 normal points 更容易被「隔離」
運作方式:通過隨機選擇特徵和分割值構建 decision trees

優勢:能更有效率地處理高維數據,且計算複雜度較低

## 程式內容

contamination 參數:設定預期異常值比例,影響判定 threshold
不使用標籤:完全基於數據分布特性檢測異常
輸出轉換:將預測結果作為新特徵加入原始數據

初步篩選可疑的交易內容,透過「異常程度」的量化指標,增強後續監督模型的 feature space

# 監督式學習階段:XGBoost
核心概念:串聯多個 weak learners 形成 strong predictor

優勢:處理不平衡數據能力強,防止過擬合
XGBoost 的特點:內建正則化,自動處理缺失的數值
97 changes: 97 additions & 0 deletions ex2/ex2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
import kagglehub

# General settings
RANDOM_SEED = 42 # random seed for reproducible results
TEST_SIZE = 0.3 # test-set fraction

# Evaluation helper
def evaluation(y_true, y_pred, model_name="Model"):
    """Print summary metrics and sklearn's classification report.

    Args:
        y_true: ground-truth labels (0 = normal, 1 = fraud).
        y_pred: hard predicted labels.
        model_name: heading used in the printed report.
    """
    # Collect the four headline metrics; insertion order drives print order.
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision Score': precision_score(y_true, y_pred, zero_division=0),
        'Recall Score': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
    }

    print(f'\n{model_name} Evaluation:')
    print('=' * 45)
    for label, value in scores.items():
        print(f' {label}:', value)
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# Download the credit-card fraud dataset from Kaggle
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
data = pd.read_csv(f"{path}/creditcard.csv")
data['Class'] = data['Class'].astype(int) # ensure integer-typed target

# Preprocessing
data = data.drop(['Time'], axis=1) # drop the Time feature
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1)) # standardize Amount

X = data.drop(columns=['Class']).values # feature matrix
Y = data['Class'].values # target vector

# Stratified train/test split preserves the fraud ratio in both sets
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=Y
)

# Standardize all features: fit on the training set only, transform both
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Unsupervised stage: IsolationForest anomaly detector
iso_model = IsolationForest(
    n_estimators=100, # number of isolation trees
    contamination=0.01, # expected anomaly fraction; sets the decision threshold
    random_state=RANDOM_SEED # reproducibility
)
iso_model.fit(x_train) # fit without any labels

# Predict anomaly flags to use as a new feature (-1 = anomaly, 1 = normal)
iso_train_scores = iso_model.predict(x_train)
iso_test_scores = iso_model.predict(x_test)

# Append the anomaly flag as one extra column
x_train_enhanced = np.concatenate([x_train, iso_train_scores.reshape(-1, 1)], axis=1)
x_test_enhanced = np.concatenate([x_test, iso_test_scores.reshape(-1, 1)], axis=1)

# Supervised stage: XGBoost on the anomaly-augmented features
xgb_model = XGBClassifier(
    n_estimators=400, # boosting rounds
    max_depth=8, # maximum tree depth
    learning_rate=0.03, # shrinkage per round
    subsample=0.8, # row subsampling per tree
    colsample_bytree=0.8, # column subsampling per tree
    # Upweight the positive (fraud) class by ~(#negatives / #positives);
    # max(..., 1) guards against a split containing no positives.
    scale_pos_weight=(len(y_train[y_train == 0]) / max(len(y_train[y_train == 1]), 1)),
    eval_metric='logloss', # evaluation metric during training
    random_state=RANDOM_SEED # reproducibility
)
# NOTE(review): dropped the original `use_label_encoder=False` argument —
# it was deprecated in XGBoost 1.6 and removed in 2.x, where passing it
# only triggers an "unused parameter" warning.
xgb_model.fit(x_train_enhanced, y_train)

# Predicted fraud probabilities
y_prob = xgb_model.predict_proba(x_test_enhanced)[:, 1]

# Find the F1-optimal threshold on the precision-recall curve
prec, rec, thresh = precision_recall_curve(y_test, y_prob)
f1 = 2 * (prec * rec) / (prec + rec + 1e-6) # epsilon avoids division by zero
best_idx = np.argmax(f1)
best_threshold = thresh[best_idx]

# Apply a manual floor on the threshold to push precision higher
manual_threshold = max(best_threshold, 0.6)
y_pred = (y_prob >= manual_threshold).astype(int)

# Evaluate
evaluation(y_test, y_pred, model_name="XGBoostClassifier (Supervised after IsolationForest)")
Binary file added ex2/ex2_img.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.