diff --git a/ACS111107ex1.ipynb b/ACS111107ex1.ipynb new file mode 100644 index 0000000..2bdbaf3 --- /dev/null +++ b/ACS111107ex1.ipynb @@ -0,0 +1,866 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyNRztmHiBpwBuNgJ+2yWqJE", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iVpqZIVOT32h" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n", + "import kagglehub\n", + "# general setting. 
do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "\n", + "# load dataset(from kagglehub)\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data['Class'] = data['Class'].astype(int)\n", + "# prepare data\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "source": [ + "fraud = data[data['Class'] == 1]\n", + "nonfraud = data[data['Class'] == 0]\n", + "print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n", + "print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "g6nCrilJwq0C", + "outputId": "c7452da4-937f-4135-f9c1-660dcca520af" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Fraudulent:492, non-fraudulent:284315\n", + "the positive class (frauds) percentage: 492/284807 (0.173%)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#監督式學習\n", + "from xgboost import XGBClassifier\n", + "\n", + "\n", + "X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n", + "Y = np.asarray(data.iloc[:, data.columns == 'Class'])\n", + "# split training set and data set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n", + "# build Random Forest model\n", + "rf_model = XGBClassifier(\n", + " n_estimators=235,\n", + " max_depth=6,\n", + " learning_rate=0.16,\n", + " scale_pos_weight=100,\n", + " eval_metric='logloss',\n", + " use_label_encoder=False,\n", + " random_state=42\n", + ")\n", + "rf_model.fit(X_train, y_train)\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/", + "height": 323 + }, + "id": "dcggNaSdwu4c", + "outputId": "23472a13-10d0-44c0-bdb3-c3d97341b18e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [08:09:35] WARNING: /workspace/src/learner.cc:740: \n", + "Parameters: { \"use_label_encoder\" } are not used.\n", + "\n", + " warnings.warn(smsg, UserWarning)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric='logloss',\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=0.16, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None, max_depth=6,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=235,\n", + " n_jobs=None, num_parallel_tree=None, random_state=42, ...)" + ], + "text/html": [ + "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
+              "              colsample_bylevel=None, colsample_bynode=None,\n",
+              "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+              "              enable_categorical=False, eval_metric='logloss',\n",
+              "              feature_types=None, gamma=None, grow_policy=None,\n",
+              "              importance_type=None, interaction_constraints=None,\n",
+              "              learning_rate=0.16, max_bin=None, max_cat_threshold=None,\n",
+              "              max_cat_to_onehot=None, max_delta_step=None, max_depth=6,\n",
+              "              max_leaves=None, min_child_weight=None, missing=nan,\n",
+              "              monotone_constraints=None, multi_strategy=None, n_estimators=235,\n",
+              "              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 58 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# define evaluation function\n", + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " print(f'\\n{model_name} Evaluation:')\n", + " print('===' * 15)\n", + " print(' Accuracy:', accuracy)\n", + " print(' Precision Score:', precision)\n", + " print(' Recall Score:', recall)\n", + " print(' F1 Score:', f1)\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))\n", + "# predict and print result\n", + "y_pred = rf_model.predict(X_test)\n", + "evaluation(y_test, y_pred, model_name=\"Random Forest\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eds5blxtxXe7", + "outputId": "f2418343-ea53-4356-b794-1255c4681caa" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Random Forest Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996957035684608\n", + " Precision Score: 0.9365079365079365\n", + " Recall Score: 0.8676470588235294\n", + " F1 Score: 0.9007633587786259\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85307\n", + " 1 0.94 0.87 0.90 136\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.93 0.95 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#非監督式學習\n", + "# Extract features and labels\n", + "X = np.asarray(data.drop(columns=['Class']))\n", + "y = np.asarray(data['Class'])\n", + "# Split the dataset into training and testing sets (with stratification)\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, 
test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "scaler = StandardScaler()\n", + "x_train = scaler.fit_transform(x_train)\n", + "x_test = scaler.transform(x_test)\n", + "# Select a small sample of normal (non-fraud) data for unsupervised training\n", + "n_x_train = x_train[y_train == 0]\n", + "n_x_train = n_x_train[:1000]" + ], + "metadata": { + "id": "sYrT6BZAJLOE" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "scores = []\n", + "for k in range(2, 5):\n", + " kmeans = KMeans(n_clusters=k, init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "optimal_k = np.argmax(scores) + 2\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "y_pred_test = kmeans.predict(x_test)\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " else:\n", + " labels[mask] = 0 # Default to normal class\n", + " return labels\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)" + ], + "metadata": { + "id": "iNvpZ--6Jaac" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred, zero_division=0)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " print(f'初始結果\\n{model_name} Evaluation:')\n", + " print('===' * 15)\n", + " print(' Accuracy:', accuracy)\n", + " print(' Precision Score:', precision)\n", + " print(' Recall Score:', recall)\n", + " print(' F1 Score:', f1)\n", + " 
print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))\n", + "evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G2iwOyvtJiRA", + "outputId": "2f9503a3-5af5-4710-e11d-2a2136c4234e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "初始結果\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9987242957293166\n", + " Precision Score: 0.782608695652174\n", + " Recall Score: 0.36486486486486486\n", + " F1 Score: 0.4976958525345622\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.78 0.36 0.50 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.89 0.68 0.75 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#非監督式學習\n", + "# Extract features and labels\n", + "X = np.asarray(data.drop(columns=['Class']))\n", + "y = np.asarray(data['Class'])\n", + "# Split the dataset into training and testing sets (with stratification)\n", + "x_train, x_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y\n", + ")\n", + "scaler = StandardScaler()\n", + "x_train = scaler.fit_transform(x_train)\n", + "x_test = scaler.transform(x_test)\n", + "# Select a small sample of normal (non-fraud) data for unsupervised training\n", + "n_x_train = x_train[y_train == 0]\n", + "normal = x_train[y_train == 0][:800]\n", + "fraud = x_train[y_train == 1][:200]\n", + "n_x_train = np.vstack([normal, fraud])" + ], + "metadata": { + "id": "8TLMBIYpKwv5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "scores = []\n", + "for k in range(2, 10):\n", + " kmeans = KMeans(n_clusters=k, 
init='k-means++', random_state=RANDOM_SEED)\n", + " kmeans.fit(n_x_train)\n", + " score = silhouette_score(n_x_train, kmeans.labels_)\n", + " scores.append(score)\n", + "optimal_k = np.argmax(scores) + 2\n", + "kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=RANDOM_SEED)\n", + "kmeans.fit(n_x_train)\n", + "y_pred_test = kmeans.predict(x_test)\n", + "def align_labels(y_true, y_pred, n_clusters):\n", + " labels = np.zeros_like(y_pred)\n", + " for i in range(n_clusters):\n", + " mask = (y_pred == i)\n", + " if np.sum(mask) > 0:\n", + " labels[mask] = np.bincount(y_true[mask]).argmax()\n", + " else:\n", + " labels[mask] = 0 # Default to normal class\n", + " return labels\n", + "y_pred_aligned = align_labels(y_test, y_pred_test, optimal_k)" + ], + "metadata": { + "id": "DwVy99wmKr6P" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred, zero_division=0)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " print(f'實作結果\\n{model_name} Evaluation:')\n", + " print('===' * 15)\n", + " print(' Accuracy:', accuracy)\n", + " print(' Precision Score:', precision)\n", + " print(' Recall Score:', recall)\n", + " print(' F1 Score:', f1)\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))\n", + "evaluation(y_test, y_pred_aligned, model_name=\"KMeans (Unsupervised)\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pXtXqm48J-K2", + "outputId": "d7a3c325-080b-405e-8ac0-fe3a05025b55" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "實作結果\n", + "KMeans (Unsupervised) Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9989700736163291\n", + " 
Precision Score: 0.8333333333333334\n", + " Recall Score: 0.5067567567567568\n", + " F1 Score: 0.6302521008403361\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.83 0.51 0.63 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.92 0.75 0.81 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/ACS111107ex2.ipynb b/ACS111107ex2.ipynb new file mode 100644 index 0000000..30e6ac9 --- /dev/null +++ b/ACS111107ex2.ipynb @@ -0,0 +1,708 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyPZEAzSJRW3cxL0ZXjtYpNV", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2fp07Q7Vyn-N", + "outputId": "367ca3a6-8974-4324-f462-c593fdc878e3" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 66.0M/66.0M [00:00<00:00, 97.2MB/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Extracting files...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import 
RandomForestClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix\n", + "import kagglehub\n", + "# general setting. do not change TEST_SIZE\n", + "RANDOM_SEED = 42\n", + "TEST_SIZE = 0.3\n", + "\n", + "\n", + "# load dataset(from kagglehub)\n", + "path = kagglehub.dataset_download(\"mlg-ulb/creditcardfraud\")\n", + "data = pd.read_csv(f\"{path}/creditcard.csv\")\n", + "data['Class'] = data['Class'].astype(int)\n", + "# prepare data\n", + "data = data.drop(['Time'], axis=1)\n", + "data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "source": [ + "fraud = data[data['Class'] == 1]\n", + "nonfraud = data[data['Class'] == 0]\n", + "print(f'Fraudulent:{len(fraud)}, non-fraudulent:{len(nonfraud)}')\n", + "print(f'the positive class (frauds) percentage: {len(fraud)}/{len(fraud) + len(nonfraud)} ({len(fraud)/(len(fraud) + len(nonfraud))*100:.3f}%)')\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KRvNLAq-zlfp", + "outputId": "2c68ec79-06db-4fe4-a42b-7b98e3ec61dc" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Fraudulent:492, non-fraudulent:284315\n", + "the positive class (frauds) percentage: 492/284807 (0.173%)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#監督式學習\n", + "from xgboost import XGBClassifier\n", + "\n", + "\n", + "X = np.asarray(data.iloc[:, ~data.columns.isin(['Class'])])\n", + "Y = np.asarray(data.iloc[:, data.columns == 'Class'])\n", + "# split training set and data set\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n" + ], + "metadata": { + "id": "JstAHT7Uz-bj" + }, + "execution_count": 
3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.ensemble import IsolationForest\n", + "\n", + "#Isolation Forest訓練偵測異常(用整體資料或正常樣本)\n", + "iso = IsolationForest(\n", + " n_estimators=200,\n", + " contamination=0.0017,\n", + " max_samples=5000,\n", + " random_state=42\n", + ")\n", + "anomaly_scores = iso.fit_predict(X)\n", + "score_feature = iso.decision_function(X).reshape(-1, 1)\n", + "\n", + "#把異常分數加進原始特徵\n", + "X_with_score = np.concatenate([X, score_feature], axis=1)\n", + "\n", + "#丟給監督式模型訓練\n", + "X_train, X_test, y_train, y_test = train_test_split(X_with_score, Y, stratify=Y, test_size=TEST_SIZE, random_state=RANDOM_SEED)\n", + "rf_model = XGBClassifier(\n", + " n_estimators=400,\n", + " max_depth=10,\n", + " learning_rate=0.16,\n", + " scale_pos_weight=100,\n", + " eval_metric='logloss',\n", + " use_label_encoder=False,\n", + " random_state=42\n", + ")\n", + "rf_model.fit(X_train, y_train)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 343 + }, + "id": "8bqSWv3ezmmS", + "outputId": "b6a85e16-ee9d-4b8a-b089-dc1203f81285" + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/xgboost/core.py:158: UserWarning: [08:41:35] WARNING: /workspace/src/learner.cc:740: \n", + "Parameters: { \"use_label_encoder\" } are not used.\n", + "\n", + " warnings.warn(smsg, UserWarning)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric='logloss',\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None, interaction_constraints=None,\n", + " learning_rate=0.16, max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, 
max_delta_step=None, max_depth=10,\n", + " max_leaves=None, min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None, n_estimators=400,\n", + " n_jobs=None, num_parallel_tree=None, random_state=42, ...)" + ], + "text/html": [ + "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
+              "              colsample_bylevel=None, colsample_bynode=None,\n",
+              "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
+              "              enable_categorical=False, eval_metric='logloss',\n",
+              "              feature_types=None, gamma=None, grow_policy=None,\n",
+              "              importance_type=None, interaction_constraints=None,\n",
+              "              learning_rate=0.16, max_bin=None, max_cat_threshold=None,\n",
+              "              max_cat_to_onehot=None, max_delta_step=None, max_depth=10,\n",
+              "              max_leaves=None, min_child_weight=None, missing=nan,\n",
+              "              monotone_constraints=None, multi_strategy=None, n_estimators=400,\n",
+              "              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# define evaluation function\n", + "def evaluation(y_true, y_pred, model_name=\"Model\"):\n", + " accuracy = accuracy_score(y_true, y_pred)\n", + " precision = precision_score(y_true, y_pred)\n", + " recall = recall_score(y_true, y_pred)\n", + " f1 = f1_score(y_true, y_pred)\n", + " print(f'\\n{model_name} Evaluation:')\n", + " print('===' * 15)\n", + " print(' Accuracy:', accuracy)\n", + " print(' Precision Score:', precision)\n", + " print(' Recall Score:', recall)\n", + " print(' F1 Score:', f1)\n", + " print(\"\\nClassification Report:\")\n", + " print(classification_report(y_true, y_pred))\n", + "# predict and print result\n", + "y_pred = rf_model.predict(X_test)\n", + "evaluation(y_test, y_pred, model_name=\"Random Forest\")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qVeYlQoy0TNW", + "outputId": "ebb1b34e-29dd-4b7b-cbed-8cae7d964bd0" + }, + "execution_count": 46, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Random Forest Evaluation:\n", + "=============================================\n", + " Accuracy: 0.9996839998595555\n", + " Precision Score: 0.9481481481481482\n", + " Recall Score: 0.8648648648648649\n", + " F1 Score: 0.9045936395759717\n", + "\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 85295\n", + " 1 0.95 0.86 0.90 148\n", + "\n", + " accuracy 1.00 85443\n", + " macro avg 0.97 0.93 0.95 85443\n", + "weighted avg 1.00 1.00 1.00 85443\n", + "\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/EX1.md b/EX1.md new file mode 100644 index 0000000..e243d9e --- /dev/null +++ b/EX1.md @@ -0,0 +1,70 @@ +# Credit Card Fraud Detection - 實驗說明 + +作者:ACS111107 簡祐暄 +作業類型:Machine Learning 練習作業 (挑戰一) +日期:2025/5/29 + +--- + +## 使用資料集 + +- 資料來源:[Kaggle - Credit Card Fraud 
Detection](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) +- 總筆數:284,807 +- 詐騙交易數:492(佔 0.172%) +- Class:0 = 正常,1 = 詐騙 + +--- + +## 實驗設定 + +- 使用固定設定: + - `RANDOM_SEED = 42` + - `TEST_SIZE = 0.3` +- 評估指標: + - Accuracy + - Precision + - Recall + - F1 Score + +--- + +## 使用的模型 + +### 1. 監督式學習 +- 套件:`sklearn.ensemble.RandomForestClassifier` +- 調整參數: + ```python + rf_model = RandomForestClassifier( + n_estimators=100, + random_state=42 + ) + ``` +改用 XGBClassifier +- 套件 from xgboost import XGBClassifier + ```python + rf_model = XGBClassifier( + n_estimators=235, + max_depth=6, + learning_rate=0.16, + scale_pos_weight=100, + eval_metric='logloss', + use_label_encoder=False, + random_state=42 + ) + ``` + 就監督式學習,XGBClassifier 較適合使用在此實例上,再慢慢測試個參數的結果,即可得出此結果。 + + ### 2. 非監督式學習 +- 調整參數: + ```python + n_x_train = n_x_train[:1000] + ``` +- 增加範圍設置 + ```python + normal = x_train[y_train == 0][:800] + fraud = x_train[y_train == 1][:200] + n_x_train = np.vstack([normal, fraud]) + ``` + 從百位數開始慢慢測試到個位數,不知道是不是巧合發現800、200是最佳的數值。 + + diff --git a/EX2.md b/EX2.md new file mode 100644 index 0000000..a01c088 --- /dev/null +++ b/EX2.md @@ -0,0 +1,69 @@ +# Credit Card Fraud Detection - 實驗說明 + +作者:ACS111107 簡祐暄 +作業類型:Machine Learning 練習作業 (挑戰二) +日期:2025/6/5 + +--- + +## 使用資料集 + +- 資料來源:[Kaggle - Credit Card Fraud Detection](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) +- 總筆數:284,807 +- 詐騙交易數:492(佔 0.172%) +- Class:0 = 正常,1 = 詐騙 + +--- + +## 實驗設定 + +- 使用固定設定: + - `RANDOM_SEED = 42` + - `TEST_SIZE = 0.3` +- 評估指標: + - Accuracy + - Precision + - Recall + - F1 Score + +--- + +## 使用的模型 + +### 非監督式 +- 套件:`from sklearn.ensemble import IsolationForest +` +- 調整參數: + ```python + iso = IsolationForest( + n_estimators=200, + contamination=0.0017, + max_samples=5000, + random_state=42 + ) + ``` + 就監督式學習,XGBClassifier 較適合使用在此實例上,再慢慢測試個參數的結果,即可得出此結果。 + + ### 2. 
監督式 + -使用 XGBClassifier +- 調整參數: + ```python + rf_model = XGBClassifier( + n_estimators=400, + max_depth=10, + learning_rate=0.16, + scale_pos_weight=100, + eval_metric='logloss', + use_label_encoder=False, + random_state=42 + ) + ``` +- 說明: + 我是用範例的兩個方式進行參數的調整,改變樹的數量與learning rate的值與樹的深度(depth) +- 範例結果 + - ![image](https://github.com/user-attachments/assets/61c60032-9224-419c-804c-cdd8fd337e5a) +- 我的結果 + - ![image](https://github.com/user-attachments/assets/2d711785-52e6-4b37-84c5-088675b82db1) + + + diff --git a/README.md b/README.md deleted file mode 100644 index a07d3c0..0000000 --- a/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# NTCU Machine Learning Assignment Repository -**NTCU-Machine-Learning** repository for the Machine Learning course at NTCU. -This repository is used for submitting assignments related to machine learning projects, with a focus on **Credit Card Fraud Detection**. - -## Project Overview -This assignment focuses on building a machine learning model for **credit card fraud detection**. -You will use a dataset to train and evaluate models, applying techniques such as data preprocessing, feature scaling, and classification algorithms (e.g., Random Forest) or clustering (e.g., KMeans). - -**Objectives:** -- Load and preprocess the dataset. -- Train a machine learning model to detect fraudulent transactions. -- Evaluate the model using metrics like accuracy, precision, recall, F1-score, ROC AUC, and confusion matrix. - -## Setup Instructions -To set up your environment and work on the assignment, follow these steps: - -### 1. Fork the Repository -- Fork the `NTCU-Machine-Learning` repository to your GitHub account. -- Clone your forked repository to your local machine: - ```bash - git clone - ``` - -### 2. Install Git -Ensure Git is installed on your system: -- **Windows/Mac**: Download and install Git from [git-scm.com](https://git-scm.com). 
-- **Ubuntu/Linux**: - ```bash - sudo apt update - sudo apt install git - ``` - -## Submission Guidelines -1. **Fork and Clone**: Fork this repository and clone it to your local machine. -2. **Create a Branch**: Create a branch for your assignment (e.g., `assignment-`). -3. **Implement Your Code**: Modify the provided code template (see [Code Structure](#code-structure)) to complete the assignment. -4. **Commit and Push**: - ```bash - git add . - git commit -m "Submit assignment for " - git push origin - ``` -5. **Create a Pull Request**: Submit a pull request from your forked repository to the main repository for review. -6. **File Naming**: Name your main script as `.py`. -7. **Create a New Folder**: Remeber put your each file into `_`. - -**Important**: -- Do not modify the `TEST_SIZE` (set to `0.3`) or `RANDOM_SEED` (set to `42`) in the code. -- Ensure your code is well-documented with comments explaining your approach. -- Submit your pull request before the deadline. - -## Dataset -The dataset for this assignment is available via **KaggleHub**. Use the following code to load it: -```python -import kagglehub -path = kagglehub.dataset_download("mlg-ulb/creditcardfraud") -data = pd.read_csv(f"{path}/creditcard.csv") -``` - -- The dataset contains credit card transaction data with features like transaction amount, time, and anonymized features (`V1` to `V28`). -- The target variable is `Class` (0 for non-fraudulent, 1 for fraudulent). - -### Tasks -1. Preprocess the data (e.g., handle missing values, scale features using `StandardScaler`). -2. Split the dataset into training (70%) and testing (30%) sets. -3. Train a classification model (e.g., `RandomForestClassifier`) or a clustering model (e.g., `KMeans`). -4. 
Evaluate the model using the following metrics: - - Accuracy - - Precision - - Recall - - F1-score - - ROC AUC score - - Confusion Matrix - - (For clustering) Silhouette Score - -## Evaluation Metrics -Your model will be evaluated based on: -- **Correctness**: Does the code run without errors and produce the expected outputs? -- **Performance**: How well does your model perform on the test set (based on the metrics above)? -- **Code Quality**: Is the code well-organized, commented, and easy to understand? -- **Documentation**: Include a brief explanation of your approach in the pull request description. - -## Contact -For questions or issues, contact the teaching assistant via: -- Email: [bcs113116@gm.ntcu.edu.tw] -- Email: [bcs113115@gm.ntcu.edu.tw] ----