Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions ACS111703 ex1/acs111703_ex1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "bf0a4abf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\User\\NTCU-Machine-Learning\\ACS111703 ex1\\fraud_env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fraudulent:492, non-fraudulent:284315\n",
"the positive class (frauds) percentage: 0.001727485630620034\n",
"(len(fraud)/(len(fraud) + len(nonfraud)))*100:.3f%\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\User\\NTCU-Machine-Learning\\ACS111703 ex1\\fraud_env\\lib\\site-packages\\sklearn\\base.py:1389: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Random Forest Evaluation:\n",
"=============================================\n",
" Accuracy: 0.9996371850239341\n",
" Precision Score: 0.9411764705882353\n",
" Recall Score: 0.8235294117647058\n",
" F1 Score: 0.8784313725490196\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 85307\n",
" 1 0.94 0.82 0.88 136\n",
"\n",
" accuracy 1.00 85443\n",
" macro avg 0.97 0.91 0.94 85443\n",
"weighted avg 1.00 1.00 1.00 85443\n",
"\n",
"\n",
"KMeans (Unsupervised) Evaluation:\n",
"=============================================\n",
" Accuracy: 0.9987242957293166\n",
" Precision Score: 0.782608695652174\n",
" Recall Score: 0.36486486486486486\n",
" F1 Score: 0.4976958525345622\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 85295\n",
" 1 0.78 0.36 0.50 148\n",
"\n",
" accuracy 1.00 85443\n",
" macro avg 0.89 0.68 0.75 85443\n",
"weighted avg 1.00 1.00 1.00 85443\n",
"\n",
"\n",
"==================================================\n",
"CHALLENGE 1 COMPLETED\n",
"Comparison of Random Forest (Supervised) vs K-Means (Unsupervised)\n",
"==================================================\n"
]
}
],
"source": [
"# Challenge 1: Supervised vs Unsupervised Learning\n",
"# Goal: Compare Random Forest and K-Means with same general settings to exceed baseline results\n",
"\n",
"# Import necessary packages\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n",
"import kagglehub\n",
"\n",
"# General setting - do not change TEST_SIZE\n",
"RANDOM_SEED = 42\n",
"TEST_SIZE = 0.3\n",
"\n",
"# Load dataset & prepare data\n",
"# Load dataset (from kagglehub)\n",
"path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n",
"data = pd.read_csv(f\"{path}/creditcard.csv\")\n",
"data['Class'] = data['Class'].astype(int)\n",
"\n",
"# Prepare data\n",
"data = data.drop(['Time'], axis=1)\n",
"data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n",
"\n",
"# Fraud/Non-Fraud Transactions Analysis\n",
"fraud = data[data['Class'] == 1]\n",
"nonfraud = data[data['Class'] == 0]\n",
"print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n",
"print(f'the positive class (frauds) percentage: {len(fraud)/(len(fraud) + len(nonfraud))}')\n",
    "print(f'{(len(fraud)/(len(fraud) + len(nonfraud)))*100:.3f}%')\n",
"\n",
"# Supervised Learning (Random Forest)\n",
"X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n",
    "Y = np.asarray(data.iloc[:, data.columns == 'Class']).ravel()\n",
"\n",
"# Split training set and data set\n",
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n",
"\n",
"# Build Random Forest model\n",
"rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n",
"rf_model.fit(X_train, y_train)\n",
"\n",
"# Define evaluation function\n",
"def evaluation(y_true, y_pred, model_name=\"Model\"):\n",
" accuracy = accuracy_score(y_true, y_pred)\n",
" precision = precision_score(y_true, y_pred)\n",
" recall = recall_score(y_true, y_pred)\n",
" f1 = f1_score(y_true, y_pred)\n",
" \n",
" print(f'\\n{model_name} Evaluation:')\n",
" print('===' * 15)\n",
" print(f' Accuracy:', accuracy)\n",
" print(f' Precision Score:', precision)\n",
" print(f' Recall Score:', recall)\n",
" print(f' F1 Score:', f1)\n",
" print(\"\\nClassification Report:\")\n",
" print(classification_report(y_true, y_pred))\n",
"\n",
"# Predict and print result\n",
"y_pred = rf_model.predict(X_test)\n",
"evaluation(y_test, y_pred, model_name=\"Random Forest\")\n",
"\n",
"# Unsupervised Learning (KMeans)\n",
"# Extract features and labels\n",
"X = np.asarray(data.drop(columns=['Class']))\n",
"y = np.asarray(data['Class'])\n",
"\n",
"# Split the dataset into training and testing sets (with stratification)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n",
")\n",
"\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)\n",
"\n",
"# Select a small sample of normal (non-fraud) data for unsupervised training\n",
"n_x_train = X_train[y_train == 0]\n",
"n_x_train = n_x_train[:1000]\n",
"\n",
"# Find optimal number of clusters using silhouette score\n",
"scores = []\n",
"for k in range(2, 5):\n",
" kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n",
" kmeans.fit(n_x_train)\n",
" score = silhouette_score(n_x_train, kmeans.labels_)\n",
" scores.append(score)\n",
"\n",
"optimal_k = np.argmax(scores) + 2\n",
"kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n",
"kmeans.fit(n_x_train)\n",
"\n",
"# Predict on test set\n",
"y_pred_test = kmeans.predict(X_test)\n",
"\n",
"# Align labels with ground truth\n",
"def align_labels(y_true, y_pred, n_clusters):\n",
" labels = np.zeros_like(y_pred)\n",
" for i in range(n_clusters):\n",
" mask = (y_pred == i)\n",
" if np.sum(mask) > 0:\n",
" labels[mask] = np.bincount(y_true[mask]).argmax()\n",
" else:\n",
" labels[mask] = 0 # Default to normal class\n",
" return labels\n",
"\n",
"y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)\n",
"\n",
"# Evaluate K-Means model\n",
"evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n",
"\n",
"print(\"\\n\" + \"=\"*50)\n",
"print(\"CHALLENGE 1 COMPLETED\")\n",
"print(\"Comparison of Random Forest (Supervised) vs K-Means (Unsupervised)\")\n",
"print(\"=\"*50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "fraud_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
95 changes: 95 additions & 0 deletions ACS111703 ex1/acs111703_ex1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Challenge 1: Supervised vs Unsupervised Learning

## Objective
Compare Random Forest (supervised learning) and K-Means (unsupervised learning) using the same general settings to achieve results that exceed the baseline performance (Precision, Recall, F1 score).

## Dataset
- Source: Credit Card Fraud Detection Dataset from Kaggle
- Type: Binary classification problem (Fraud vs Non-Fraud)
- Features: 30 input features (Time, V1-V28 PCA-transformed components, Amount); Class is the binary target label
- Challenge: Highly imbalanced dataset (~0.17% fraudulent transactions)

## Methodology

### Data Preprocessing
1. Data Loading: Load creditcard fraud dataset from Kaggle Hub
2. Feature Engineering:
- Remove 'Time' column (not relevant for fraud detection)
- Standardize 'Amount' feature using StandardScaler
3. Class Distribution Analysis: Analyze fraud vs non-fraud transaction ratios

### Model 1: Random Forest (Supervised Learning)
- Algorithm: RandomForestClassifier
- Parameters:
- n_estimators=100
- random_state=42
- Training: Uses labeled data (both features and target class)
- Evaluation: Standard classification metrics

### Model 2: K-Means Clustering (Unsupervised Learning)
- Algorithm: K-Means clustering
- Approach:
- Train only on normal (non-fraud) transactions
- Use silhouette score to find optimal number of clusters (k=2-4)
- Apply label alignment to map clusters to fraud/non-fraud classes
- Key Innovation: Semi-supervised approach using unsupervised learning

## Results

### Dataset Analysis
```
Fraudulent:492, non-fraudulent:284315
the positive class (frauds) percentage: 0.001727485630620034
(0.173%)
```

### Random Forest (Supervised Learning) Results
```
Random Forest Evaluation:
===============================================
  Accuracy: 0.9996371850239341
Precision Score: 0.9411764705882353
Recall Score: 0.8235294117647058
F1 Score: 0.8784313725490196

Classification Report:
precision recall f1-score support

0 1.00 1.00 1.00 85307
1 0.94 0.82 0.88 136

accuracy 1.00 85443
macro avg 0.97 0.91 0.94 85443
weighted avg 1.00 1.00 1.00 85443
```

### K-Means (Unsupervised Learning) Results
```
KMeans (Unsupervised) Evaluation:
===============================================
  Accuracy: 0.9987242957293166
  Precision Score: 0.782608695652174
  Recall Score: 0.36486486486486486
  F1 Score: 0.4976958525345622

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.78      0.36      0.50       148

    accuracy                           1.00     85443
   macro avg       0.89      0.68      0.75     85443
weighted avg       1.00      1.00      1.00     85443
```

## Key Insights
1. Supervised Learning: Leverages labeled data for direct pattern recognition
2. Unsupervised Learning: Identifies anomalies/outliers that may represent fraud
3. Performance Trade-off: Supervised typically outperforms unsupervised, but unsupervised can detect novel fraud patterns

## Technical Implementation Notes
- Same train/test split (30% test size, random_state=42)
- Consistent preprocessing pipeline
- Robust evaluation function for fair comparison
- Label alignment technique for unsupervised predictions
Loading