Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions ACS111703 ex1/acs111703_ex1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "bf0a4abf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\User\\NTCU-Machine-Learning\\ACS111703 ex1\\fraud_env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fraudulent:492, non-fraudulent:284315\n",
"the positive class (frauds) percentage: 0.001727485630620034\n",
"(len(fraud)/(len(fraud) + len(nonfraud)))*100:.3f%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\User\\NTCU-Machine-Learning\\ACS111703 ex1\\fraud_env\\lib\\site-packages\\sklearn\\base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Random Forest Evaluation:\n",
"=============================================\n",
" Accuracy: 0.9996371850239341\n",
" Precision Score: 0.9411764705882353\n",
" Recall Score: 0.8235294117647058\n",
" F1 Score: 0.8784313725490196\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 85307\n",
" 1 0.94 0.82 0.88 136\n",
"\n",
" accuracy 1.00 85443\n",
" macro avg 0.97 0.91 0.94 85443\n",
"weighted avg 1.00 1.00 1.00 85443\n",
"\n",
"\n",
"KMeans (Unsupervised) Evaluation:\n",
"=============================================\n",
" Accuracy: 0.9987242957293166\n",
" Precision Score: 0.782608695652174\n",
" Recall Score: 0.36486486486486486\n",
" F1 Score: 0.4976958525345622\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 85295\n",
" 1 0.78 0.36 0.50 148\n",
"\n",
" accuracy 1.00 85443\n",
" macro avg 0.89 0.68 0.75 85443\n",
"weighted avg 1.00 1.00 1.00 85443\n",
"\n",
"\n",
"==================================================\n",
"CHALLENGE 1 COMPLETED\n",
"Comparison of Random Forest (Supervised) vs K-Means (Unsupervised)\n",
"==================================================\n"
]
}
],
"source": [
"# Challenge 1: Supervised vs Unsupervised Learning\n",
"# Goal: Compare Random Forest and K-Means with same general settings to exceed baseline results\n",
"\n",
"# Import necessary packages\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n",
"import kagglehub\n",
"\n",
"# General setting - do not change TEST_SIZE\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"# Load dataset & prepare data\n",
"# Load dataset (from kagglehub)\n",
"path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n",
"data = pd.read_csv(f\"{path}/creditcard.csv\")\n",
"data['Class'] = data['Class'].astype(int)\n",
"\n",
"# Prepare data\n",
"data = data.drop(['Time'], axis=1)\n",
"data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n",
"\n",
"# Fraud/Non-Fraud Transactions Analysis\n",
"fraud = data[data['Class'] == 1]\n",
"nonfraud = data[data['Class'] == 0]\n",
"print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n",
"print(f'the positive class (frauds) percentage: {len(fraud)/(len(fraud) + len(nonfraud))}')\n",
    "print(f'{(len(fraud)/(len(fraud) + len(nonfraud)))*100:.3f}%')\n",
"\n",
"# Supervised Learning (Random Forest)\n",
"X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n",
    "Y = np.asarray(data.iloc[:, data.columns == 'Class']).ravel()\n",
"\n",
"# Split training set and data set\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n",
"\n",
"# Build Random Forest model\n",
"rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n",
"rf_model.fit(X_train, y_train)\n",
"\n",
"# Define evaluation function\n",
"def evaluation(y_true, y_pred, model_name=\"Model\"):\n",
" accuracy = accuracy_score(y_true, y_pred)\n",
" precision = precision_score(y_true, y_pred)\n",
" recall = recall_score(y_true, y_pred)\n",
" f1 = f1_score(y_true, y_pred)\n",
" \n",
" print(f'\\n{model_name} Evaluation:')\n",
" print('===' * 15)\n",
" print(f' Accuracy:', accuracy)\n",
" print(f' Precision Score:', precision)\n",
" print(f' Recall Score:', recall)\n",
" print(f' F1 Score:', f1)\n",
" print(\"\\nClassification Report:\")\n",
" print(classification_report(y_true, y_pred))\n",
"\n",
"# Predict and print result\n",
"y_pred = rf_model.predict(X_test)\n",
"evaluation(y_test, y_pred, model_name=\"Random Forest\")\n",
"\n",
"# Unsupervised Learning (KMeans)\n",
"# Extract features and labels\n",
"X = np.asarray(data.drop(columns=['Class']))\n",
"y = np.asarray(data['Class'])\n",
"\n",
"# Split the dataset into training and testing sets (with stratification)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n",
")\n",
"\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n",
"\n",
"# Select a small sample of normal (non-fraud) data for unsupervised training\n",
"n_x_train = X_train[y_train == 0]\n",
"n_x_train = n_x_train[:1000]\n",
"\n",
"# Find optimal number of clusters using silhouette score\n",
"scores = []\n",
"for k in range(2, 5):\n",
" kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n",
" kmeans.fit(n_x_train)\n",
" score = silhouette_score(n_x_train, kmeans.labels_)\n",
" scores.append(score)\n",
"\n",
"optimal_k = np.argmax(scores) + 2\n",
"kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n",
"kmeans.fit(n_x_train)\n",
"\n",
"# Predict on test set\n",
"y_pred_test = kmeans.predict(X_test)\n",
"\n",
"# Align labels with ground truth\n",
"def align_labels(y_true, y_pred, n_clusters):\n",
" labels = np.zeros_like(y_pred)\n",
" for i in range(n_clusters):\n",
" mask = (y_pred == i)\n",
" if np.sum(mask) > 0:\n",
" labels[mask] = np.bincount(y_true[mask]).argmax()\n",
" else:\n",
" labels[mask] = 0 # Default to normal class\n",
" return labels\n",
"\n",
"y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n",
"\n",
"# Evaluate K-Means model\n",
"evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n",
"\n",
"print(\"\\n\" + \"=\"*50)\n",
"print(\"CHALLENGE 1 COMPLETED\")\n",
"print(\"Comparison of Random Forest (Supervised) vs K-Means (Unsupervised)\")\n",
"print(\"=\"*50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "fraud_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
95 changes: 95 additions & 0 deletions ACS111703 ex1/acs111703_ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Challenge 1: Supervised vs Unsupervised Learning

## Objective
Compare Random Forest (supervised learning) and K-Means (unsupervised learning) using the same general settings to achieve results that exceed the baseline performance (Precision, Recall, F1 score).

## Dataset
- Source: Credit Card Fraud Detection Dataset from Kaggle
- Type: Binary classification problem (Fraud vs Non-Fraud)
- Features: 30 input features (Time, V1-V28 PCA-transformed components, Amount); Class is the binary target label
- Challenge: Highly imbalanced dataset (~0.17% fraudulent transactions)

## Methodology

### Data Preprocessing
1. Data Loading: Load creditcard fraud dataset from Kaggle Hub
2. Feature Engineering:
- Remove 'Time' column (not relevant for fraud detection)
- Standardize 'Amount' feature using StandardScaler
3. Class Distribution Analysis: Analyze fraud vs non-fraud transaction ratios

### Model 1: Random Forest (Supervised Learning)
- Algorithm: RandomForestClassifier
- Parameters:
- n_estimators=100
- random_state=42
- Training: Uses labeled data (both features and target class)
- Evaluation: Standard classification metrics

### Model 2: K-Means Clustering (Unsupervised Learning)
- Algorithm: K-Means clustering
- Approach:
- Train only on normal (non-fraud) transactions
- Use silhouette score to find optimal number of clusters (k=2-4)
- Apply label alignment to map clusters to fraud/non-fraud classes
- Key Innovation: Semi-supervised approach using unsupervised learning

## Results

### Dataset Analysis
```
Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 0.001727485630620034
(0.173%)
```

### Random Forest (Supervised Learning) Results
```
Random Forest Evaluation:
===============================================
  Accuracy: 0.9996371850239341
Precision Score: 0.9411764705882353
Recall Score: 0.8235294117647058
F1 Score: 0.8784313725490196

Classification Report:
precision recall f1-score support

0 1.00 1.00 1.00 85307
1 0.94 0.82 0.88 136

accuracy 1.00 85443
macro avg 0.97 0.91 0.94 85443
weighted avg 1.00 1.00 1.00 85443
```

### K-Means (Unsupervised Learning) Results
```
KMeans (Unsupervised) Evaluation:
===============================================
  Accuracy: 0.9987242957293166
  Precision Score: 0.782608695652174
  Recall Score: 0.36486486486486486
  F1 Score: 0.4976958525345622

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443
```

## Key Insights
1. Supervised Learning: Leverages labeled data for direct pattern recognition
2. Unsupervised Learning: Identifies anomalies/outliers that may represent fraud
3. Performance Trade-off: Supervised typically outperforms unsupervised, but unsupervised can detect novel fraud patterns

## Technical Implementation Notes
- Same train/test split (30% test size, random_state=42)
- Consistent preprocessing pipeline
- Robust evaluation function for fair comparison
- Label alignment technique for unsupervised predictions
Loading