diff --git a/ACS111703 ex1/acs111703_ex1.ipynb b/ACS111703 ex1/acs111703_ex1.ipynb new file mode 100644 index 0000000..75a3cde --- /dev/null +++ b/ACS111703 ex1/acs111703_ex1.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "bf0a4abf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\User\\NTCU-Machine-Learning\\ACS111703 ex1\\fraud_env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fraudulent:492, non-fraudulent:284315\n", + "the positive class (frauds) percentage: 0.001727485630620034\n", + "(len(fraud)/(len(fraud) + len(nonfraud)))*100:.3f%\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\User\\NTCU-Machine-Learning\\ACS111703 ex1\\fraud_env\\lib\\site-packages\\sklearn\\base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples,), for example using ravel().\n", + " return fit_method(estimator, *args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Random Forest Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996371850239341\n", + " Precision Score: 0.9411764705882353\n", + " Recall Score: 0.8235294117647058\n", + " F1 Score: 0.8784313725490196\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.94 0.82 0.88 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.91 0.94 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n", + "\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9987242957293166\n", + " Precision Score: 0.782608695652174\n", + " Recall Score: 0.36486486486486486\n", + " F1 Score: 0.4976958525345622\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.78 0.36 0.50 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.68 0.75 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n", + "\n", + "==================================================\n", + "CHALLENGE 1 COMPLETED\n", + "Comparison of Random Forest (Supervised) vs K-Means (Unsupervised)\n", + "==================================================\n" + ] + } + ], + "source": [ + "# Challenge 1: Supervised vs Unsupervised Learning\n", + "# Goal: Compare Random Forest and K-Means with same general settings to exceed baseline results\n", + "\n", + "# Import necessary packages\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import 
classification_report\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n", + "import kagglehub\n", + "\n", + "# General setting - do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "# Load dataset & prepare data\n", + "# Load dataset (from kagglehub)\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data['Class'] = data['Class'].astype(int)\n", + "\n", + "# Prepare data\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n", + "\n", + "# Fraud/Non-Fraud Transactions Analysis\n", + "fraud = data[data['Class'] == 1]\n", + "nonfraud = data[data['Class'] == 0]\n", + "print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n", + "print(f'the positive class (frauds) percentage: {len(fraud)/(len(fraud) + len(nonfraud))}')\n", + "print(f'(len(fraud)/(len(fraud) + len(nonfraud)))*100:.3f%')\n", + "\n", + "# Supervised Learning (Random Forest)\n", + "X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n", + "Y = np.asarray(data.iloc[:, data.columns == 'Class'])\n", + "\n", + "# Split training set and data set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n", + "\n", + "# Build Random Forest model\n", + "rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n", + "rf_model.fit(X_train, y_train)\n", + "\n", + "# Define evaluation function\n", + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " \n", + " print(f'\\n{model_name} Evaluation:')\n", + " 
print('===' * 15)\n", + " print(f' Accuracy:', accuracy)\n", + " print(f' Precision Score:', precision)\n", + " print(f' Recall Score:', recall)\n", + " print(f' F1 Score:', f1)\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))\n", + "\n", + "# Predict and print result\n", + "y_pred = rf_model.predict(X_test)\n", + "evaluation(y_test, y_pred, model_name=\"Random Forest\")\n", + "\n", + "# Unsupervised Learning (KMeans)\n", + "# Extract features and labels\n", + "X = np.asarray(data.drop(columns=['Class']))\n", + "y = np.asarray(data['Class'])\n", + "\n", + "# Split the dataset into training and testing sets (with stratification)\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "\n", + "scaler = StandardScaler()\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_test = scaler.transform(X_test)\n", + "\n", + "# Select a small sample of normal (non-fraud) data for unsupervised training\n", + "n_x_train = X_train[y_train == 0]\n", + "n_x_train = n_x_train[:1000]\n", + "\n", + "# Find optimal number of clusters using silhouette score\n", + "scores = []\n", + "for k in range(2, 5):\n", + " kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "\n", + "optimal_k = np.argmax(scores) + 2\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "\n", + "# Predict on test set\n", + "y_pred_test = kmeans.predict(X_test)\n", + "\n", + "# Align labels with ground truth\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " 
else:\n", + " labels[mask] = 0 # Default to normal class\n", + " return labels\n", + "\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n", + "\n", + "# Evaluate K-Means model\n", + "evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n", + "\n", + "print(\"\\n\" + \"=\"*50)\n", + "print(\"CHALLENGE 1 COMPLETED\")\n", + "print(\"Comparison of Random Forest (Supervised) vs K-Means (Unsupervised)\")\n", + "print(\"=\"*50)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fraud_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ACS111703 ex1/acs111703_ex1.md b/ACS111703 ex1/acs111703_ex1.md new file mode 100644 index 0000000..2add734 --- /dev/null +++ b/ACS111703 ex1/acs111703_ex1.md @@ -0,0 +1,95 @@ +# Challenge 1: Supervised vs Unsupervised Learning + +## Objective +Compare Random Forest (supervised learning) and K-Means (unsupervised learning) using the same general settings to achieve results that exceed the baseline performance (Precision, Recall, F1 score). + +## Dataset +- Source: Credit Card Fraud Detection Dataset from Kaggle +- Type: Binary classification problem (Fraud vs Non-Fraud) +- Features: 29 input features after dropping 'Time' (V1-V28 are PCA transformed, plus Amount); Class is the target label +- Challenge: Highly imbalanced dataset (~0.17% fraudulent transactions) + +## Methodology + +### Data Preprocessing +1. Data Loading: Load the credit card fraud dataset via kagglehub +2. Feature Engineering: + - Remove 'Time' column (not relevant for fraud detection) + - Standardize 'Amount' feature using StandardScaler +3. 
Class Distribution Analysis: Analyze fraud vs non-fraud transaction ratios + +### Model 1: Random Forest (Supervised Learning) +- Algorithm: RandomForestClassifier +- Parameters: + - n_estimators=100 + - random_state=42 +- Training: Uses labeled data (both features and target class) +- Evaluation: Standard classification metrics + +### Model 2: K-Means Clustering (Unsupervised Learning) +- Algorithm: K-Means clustering +- Approach: + - Train only on a sample of 1,000 normal (non-fraud) transactions + - Use silhouette score to find optimal number of clusters (k=2-4) + - Apply label alignment to map clusters to fraud/non-fraud classes +- Key Innovation: Semi-supervised approach: class labels are used to select the normal-only training sample and to map clusters to fraud/non-fraud classes + +## Results + +### Dataset Analysis +``` +Fraudulent:492, non-fraudulent:284315 +the positive class (frauds) percentage: 0.001727485630620034 +(0.173%) +``` + +### Random Forest (Supervised Learning) Results +``` +Random Forest Evaluation: +=============================================== + Accuracy: 0.9996371850239341 + Precision Score: 0.9411764705882353 + Recall Score: 0.8235294117647058 + F1 Score: 0.8784313725490196 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85307 + 1 0.94 0.82 0.88 136 + + accuracy 1.00 85443 + macro avg 0.97 0.91 0.94 85443 +weighted avg 1.00 1.00 1.00 85443 +``` + +### K-Means (Unsupervised Learning) Results +``` +KMeans (Unsupervised) Evaluation: +=============================================== + Accuracy: 0.9987242957293166 + Precision Score: 0.782608695652174 + Recall Score: 0.36486486486486486 + F1 Score: 0.4976958525345622 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.78 0.36 0.50 148 + + accuracy 1.00 85443 + macro avg 0.89 0.68 0.75 85443 +weighted avg 1.00 1.00 1.00 85443 +``` + +## Key Insights +1. Supervised Learning: Leverages labeled data for direct pattern recognition +2. Unsupervised Learning: Identifies anomalies/outliers that may represent fraud +3. 
Performance Trade-off: Supervised typically outperforms unsupervised, but unsupervised can detect novel fraud patterns + +## Technical Implementation Notes +- Same test size and random seed for both models (30% test size, random_state=42); the K-Means split is additionally stratified, so the two test sets differ (fraud support 136 vs 148) +- Shared preprocessing (drop 'Time', standardize 'Amount'); K-Means additionally standardizes all features +- Shared evaluation function (accuracy, precision, recall, F1, classification report) for both models +- Label alignment technique for unsupervised predictions (maps each cluster to its majority class using the test-set labels) \ No newline at end of file diff --git a/ACS111703 ex2/acs111703_ex2.ipynb b/ACS111703 ex2/acs111703_ex2.ipynb new file mode 100644 index 0000000..02832de --- /dev/null +++ b/ACS111703 ex2/acs111703_ex2.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "04d03cdf", + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'numpy'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Challenge 2: Ensemble Learning to Improve Prediction Results\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;66;03m# Goal: Combine supervised and unsupervised learning to improve prediction results\u001b[39;00m\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'" + ] + } + ], + "source": [ + "# Challenge 2: Ensemble Learning to Improve Prediction Results\n", + "# Goal: Combine supervised and unsupervised learning to improve prediction results\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import RandomForestClassifier, IsolationForest\n", + "from sklearn.metrics import (classification_report, accuracy_score,\n", + " precision_score, recall_score, f1_score)\n", + "from sklearn.cluster import KMeans\n", + "import xgboost as xgb\n", + "import kagglehub\n", + "\n", + "# General settings\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "# Load dataset\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data['Class'] = data['Class'].astype(int)\n", + "\n", + "# Preprocessing\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n", + "\n", + "X = data.drop(columns=['Class']).values\n", + "y = data['Class'].values\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "\n", + "scaler = StandardScaler()\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)\n", + "\n", + "# Supervised Models\n", + "print(\"Training Random Forest...\")\n", + "rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n", + "rf_model.fit(X_train, y_train)\n", + "rf_pred_proba = 
rf_model.predict_proba(X_test)[:, 1]\n", + "\n", + "print(\"Training XGBoost...\")\n", + "xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=RANDOM_SEED, eval_metric='logloss')\n", + "xgb_model.fit(X_train, y_train)\n", + "xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]\n", + "\n", + "# Unsupervised Models\n", + "print(\"Training Isolation Forest...\")\n", + "iso_forest = IsolationForest(contamination=0.002, random_state=RANDOM_SEED)\n", + "iso_forest.fit(X_train_scaled[y_train == 0])\n", + "iso_pred_scores = iso_forest.decision_function(X_test_scaled)\n", + "iso_pred_proba = (1 - (iso_pred_scores - iso_pred_scores.min()) / (iso_pred_scores.max() - iso_pred_scores.min()))\n", + "\n", + "print(\"Training K-Means...\")\n", + "kmeans = KMeans(n_clusters=3, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(X_train_scaled[y_train == 0])\n", + "distances = kmeans.transform(X_test_scaled)\n", + "kmeans_pred_proba = (np.min(distances, axis=1) - distances.min()) / (distances.max() - distances.min())\n", + "\n", + "# Ensemble Methods\n", + "print(\"\\nEvaluating Ensemble Methods...\")\n", + "\n", + "def evaluate_model(y_true, y_pred, name):\n", + " print(f\"\\n{name} Evaluation\")\n", + " print(\"=\" * 50)\n", + " print(f\"Accuracy: {accuracy_score(y_true, y_pred):.6f}\")\n", + " print(f\"Precision: {precision_score(y_true, y_pred, zero_division=0):.6f}\")\n", + " print(f\"Recall: {recall_score(y_true, y_pred):.6f}\")\n", + " print(f\"F1 Score: {f1_score(y_true, y_pred):.6f}\")\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))\n", + "\n", + "# Simple Average\n", + "ensemble_avg = np.mean([rf_pred_proba, xgb_pred_proba, iso_pred_proba, kmeans_pred_proba], axis=0)\n", + "ensemble_avg_pred = (ensemble_avg > 0.5).astype(int)\n", + "evaluate_model(y_test, ensemble_avg_pred, \"Ensemble - Simple Average\")\n", + "\n", + "# Weighted Average\n", + "weights = [0.4, 0.4, 0.1, 0.1]\n", + "ensemble_weighted = 
sum(w * p for w, p in zip(weights, [rf_pred_proba, xgb_pred_proba, iso_pred_proba, kmeans_pred_proba]))\n", + "ensemble_weighted_pred = (ensemble_weighted > 0.5).astype(int)\n", + "evaluate_model(y_test, ensemble_weighted_pred, \"Ensemble - Weighted Average\")\n", + "\n", + "# Threshold-based Voting\n", + "rf_pred = (rf_pred_proba > 0.5).astype(int)\n", + "xgb_pred = (xgb_pred_proba > 0.5).astype(int)\n", + "iso_pred_binary = (iso_pred_proba > 0.7).astype(int)\n", + "kmeans_pred_binary = (kmeans_pred_proba > 0.7).astype(int)\n", + "ensemble_vote = ((rf_pred + xgb_pred + iso_pred_binary + kmeans_pred_binary) >= 2).astype(int)\n", + "evaluate_model(y_test, ensemble_vote, \"Ensemble - Threshold Voting\")\n", + "\n", + "# Identify best ensemble method\n", + "ensemble_methods = {\n", + " \"Simple Average\": ensemble_avg_pred,\n", + " \"Weighted Average\": ensemble_weighted_pred,\n", + " \"Threshold Voting\": ensemble_vote\n", + "}\n", + "\n", + "best_method, best_score = max(\n", + " ((name, f1_score(y_test, pred)) for name, pred in ensemble_methods.items()),\n", + " key=lambda item: item[1]\n", + ")\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(f\"CHALLENGE 2 COMPLETED\")\n", + "print(f\"Best Ensemble Method: {best_method}\")\n", + "print(f\"Best F1 Score: {best_score:.6f}\")\n", + "print(\"=\"*60)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ACS111703 ex2/acs111703_ex2.md b/ACS111703 ex2/acs111703_ex2.md new file mode 100644 index 0000000..2a7bbb1 --- /dev/null +++ b/ACS111703 ex2/acs111703_ex2.md @@ -0,0 +1,91 @@ + +# Challenge 2: Ensemble Learning to Improve 
Prediction Results + +## Goal +Combine supervised and unsupervised learning techniques to improve prediction performance for credit card fraud detection. + +## Models Used + +### Supervised Learning +- Random Forest +- XGBoost + +### Unsupervised Learning +- Isolation Forest +- K-Means Clustering + +## Ensemble Methods Evaluated +1. Simple Average – Unweighted mean of the four models' fraud scores, thresholded at 0.5. +2. Weighted Average – Weighted mean favoring the supervised models (0.4 each for Random Forest and XGBoost, 0.1 each for Isolation Forest and K-Means), thresholded at 0.5. +3. Threshold-based Voting – Each model casts a binary vote (score > 0.5 for the supervised models, > 0.7 for the unsupervised models); a transaction is flagged as fraud when at least 2 of the 4 models vote fraud. + +## Evaluation Metrics +- Accuracy +- Precision +- Recall +- F1 Score +- Classification Report + +## Results + +### Ensemble - Simple Average Evaluation +``` +Accuracy: 0.999403 +Precision: 0.887755 +Recall: 0.789474 +F1 Score: 0.835616 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.89 0.79 0.84 148 + + accuracy 1.00 85443 + macro avg 0.94 0.89 0.92 85443 +weighted avg 1.00 1.00 1.00 85443 +``` + +### Ensemble - Weighted Average Evaluation +``` +Accuracy: 0.999420 +Precision: 0.900000 +Recall: 0.783784 +F1 Score: 0.837838 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.90 0.78 0.84 148 + + accuracy 1.00 85443 + macro avg 0.95 0.89 0.92 85443 +weighted avg 1.00 1.00 1.00 85443 +``` + +### Ensemble - Threshold Voting Evaluation +``` +Accuracy: 0.999420 +Precision: 0.900000 +Recall: 0.783784 +F1 Score: 0.837838 + +Classification Report: + precision recall f1-score support + + 0 1.00 1.00 1.00 85295 + 1 0.90 0.78 0.84 148 + + accuracy 1.00 85443 + macro avg 0.95 0.89 0.92 85443 +weighted avg 1.00 1.00 1.00 85443 +``` + +## Conclusion + +``` +CHALLENGE 2 COMPLETED +Best Ensemble Method: Weighted Average +Best F1 Score: 0.837838 +```