From a5da81530580aace76a76f4e309b8ae26ef9fe43 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Fri, 24 Mar 2023 15:06:23 +0100 Subject: [PATCH 01/14] svm_modified --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 7b9e343..8db9341 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":45,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":46,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":47,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'name':self.cloud_name,'points':self.points,'features':self.feature,'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":48,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID name points \\\n","0 0 000 [[20.06999969482422, 499.9599914550781, 17.450... \n","1 1 001 [[373.3099975585938, 404.2200012207031, 7.1300... \n","2 2 002 [[65.91000366210938, 326.8599853515625, 12.710... \n","3 3 003 [[109.5, 391.5599975585938, 12.69999980926514]... \n","4 4 004 [[126.4300003051758, 234.9400024414062, 6.8800... \n",".. ... ... ... \n","495 495 495 [[129.3399963378906, 12.97000026702881, 8.2200... \n","496 496 496 [[440.1499938964844, 35.84999847412109, 6.1199... \n","497 497 497 [[158.1799926757812, 130.6999969482422, 4.9899... \n","498 498 498 [[498.3299865722656, 93.45999908447266, 9.4200... \n","499 499 499 [[86.16000366210938, 132.1300048828125, 7.0399... \n","\n"," features lable \n","0 [19.72999954223633, 0.002763957987838585] 0 \n","1 [8.470000267028809, 0.004335260115606936] 0 \n","2 [15.56999969482422, 0.00145218945487042] 0 \n","3 [16.46999931335449, 0.0009728572818367545] 0 \n","4 [9.75, 0.0004123711340206186] 0 \n",".. ... ... \n","495 [11.53999996185303, 0.004746257758305951] 4 \n","496 [15.17000007629395, 0.002241817366611867] 4 \n","497 [11.10999965667725, 0.011976047904191617] 4 \n","498 [18.90999984741211, 0.011563599798893917] 4 \n","499 [11.89999961853027, 0.005298013245033113] 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":52,"metadata":{},"outputs":[{"data":{"text/plain":["0 0\n","1 0\n","2 0\n","3 0\n","4 0\n"," ..\n","495 4\n","496 4\n","497 4\n","498 4\n","499 4\n","Name: lable, Length: 500, dtype: int64"]},"execution_count":52,"metadata":{},"output_type":"execute_result"}],"source":["X = pt_cloud_df['features']\n","y = pt_cloud_df['lable'].copy()\n","y"]},{"cell_type":"code","execution_count":50,"metadata":{},"outputs":[{"ename":"ValueError","evalue":"setting an array element with a sequence.","output_type":"error","traceback":["\u001b[1;31m---------------------------------------------------------------------------\u001b[0m","\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[1;31mTypeError\u001b[0m: float() argument must be a string or a real number, not 'list'","\nThe above exception was the direct cause of the following exception:\n","\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)","Cell \u001b[1;32mIn[50], line 12\u001b[0m\n\u001b[0;32m 9\u001b[0m conf_matrix \u001b[39m=\u001b[39m confusion_matrix(y_test, y_preds)\n\u001b[0;32m 10\u001b[0m \u001b[39mprint\u001b[39m(conf_matrix)\n\u001b[1;32m---> 12\u001b[0m SVM_classification(X,y)\n","Cell \u001b[1;32mIn[50], line 4\u001b[0m, in \u001b[0;36mSVM_classification\u001b[1;34m(X, y)\u001b[0m\n\u001b[0;32m 2\u001b[0m X_train, X_test, y_train, y_test \u001b[39m=\u001b[39m train_test_split(X,y,test_size\u001b[39m=\u001b[39m\u001b[39m0.4\u001b[39m)\n\u001b[0;32m 3\u001b[0m clf \u001b[39m=\u001b[39m svm\u001b[39m.\u001b[39mSVC()\n\u001b[1;32m----> 4\u001b[0m clf\u001b[39m.\u001b[39;49mfit(X_train,y_train)\n\u001b[0;32m 5\u001b[0m y_preds \u001b[39m=\u001b[39m clf\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m 6\u001b[0m acc \u001b[39m=\u001b[39m accuracy_score(y_test,y_preds)\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\svm\\_base.py:192\u001b[0m, in \u001b[0;36mBaseLibSVM.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 190\u001b[0m check_consistent_length(X, y)\n\u001b[0;32m 191\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 192\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m 193\u001b[0m X,\n\u001b[0;32m 194\u001b[0m y,\n\u001b[0;32m 195\u001b[0m dtype\u001b[39m=\u001b[39;49mnp\u001b[39m.\u001b[39;49mfloat64,\n\u001b[0;32m 196\u001b[0m order\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mC\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 197\u001b[0m accept_sparse\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcsr\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 198\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 199\u001b[0m )\n\u001b[0;32m 201\u001b[0m y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_targets(y)\n\u001b[0;32m 203\u001b[0m sample_weight \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39masarray(\n\u001b[0;32m 204\u001b[0m [] \u001b[39mif\u001b[39;00m sample_weight \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m sample_weight, dtype\u001b[39m=\u001b[39mnp\u001b[39m.\u001b[39mfloat64\n\u001b[0;32m 205\u001b[0m )\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\base.py:565\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m 563\u001b[0m y \u001b[39m=\u001b[39m check_array(y, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params)\n\u001b[0;32m 564\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 565\u001b[0m X, y \u001b[39m=\u001b[39m check_X_y(X, y, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mcheck_params)\n\u001b[0;32m 566\u001b[0m out \u001b[39m=\u001b[39m X, y\n\u001b[0;32m 568\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m no_val_X \u001b[39mand\u001b[39;00m check_params\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mensure_2d\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m):\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\validation.py:1106\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m 1101\u001b[0m estimator_name \u001b[39m=\u001b[39m _check_estimator_name(estimator)\n\u001b[0;32m 1102\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 1103\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m requires y to be passed, but the target y is None\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 1104\u001b[0m )\n\u001b[1;32m-> 1106\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[0;32m 1107\u001b[0m X,\n\u001b[0;32m 1108\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[0;32m 1109\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49maccept_large_sparse,\n\u001b[0;32m 1110\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[0;32m 1111\u001b[0m order\u001b[39m=\u001b[39;49morder,\n\u001b[0;32m 1112\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[0;32m 1113\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[0;32m 1114\u001b[0m ensure_2d\u001b[39m=\u001b[39;49mensure_2d,\n\u001b[0;32m 1115\u001b[0m allow_nd\u001b[39m=\u001b[39;49mallow_nd,\n\u001b[0;32m 1116\u001b[0m ensure_min_samples\u001b[39m=\u001b[39;49mensure_min_samples,\n\u001b[0;32m 1117\u001b[0m ensure_min_features\u001b[39m=\u001b[39;49mensure_min_features,\n\u001b[0;32m 1118\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[0;32m 1119\u001b[0m input_name\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mX\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 1120\u001b[0m )\n\u001b[0;32m 1122\u001b[0m y \u001b[39m=\u001b[39m _check_y(y, multi_output\u001b[39m=\u001b[39mmulti_output, y_numeric\u001b[39m=\u001b[39my_numeric, estimator\u001b[39m=\u001b[39mestimator)\n\u001b[0;32m 1124\u001b[0m check_consistent_length(X, y)\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\validation.py:879\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 877\u001b[0m array \u001b[39m=\u001b[39m xp\u001b[39m.\u001b[39mastype(array, dtype, copy\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[0;32m 878\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 879\u001b[0m array \u001b[39m=\u001b[39m _asarray_with_order(array, order\u001b[39m=\u001b[39;49morder, dtype\u001b[39m=\u001b[39;49mdtype, xp\u001b[39m=\u001b[39;49mxp)\n\u001b[0;32m 880\u001b[0m \u001b[39mexcept\u001b[39;00m ComplexWarning \u001b[39mas\u001b[39;00m complex_warning:\n\u001b[0;32m 881\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 882\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mComplex data not supported\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{}\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(array)\n\u001b[0;32m 883\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39mcomplex_warning\u001b[39;00m\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\_array_api.py:185\u001b[0m, in \u001b[0;36m_asarray_with_order\u001b[1;34m(array, dtype, order, copy, xp)\u001b[0m\n\u001b[0;32m 182\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(array)\n\u001b[0;32m 183\u001b[0m \u001b[39mif\u001b[39;00m xp\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m \u001b[39min\u001b[39;00m {\u001b[39m\"\u001b[39m\u001b[39mnumpy\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mnumpy.array_api\u001b[39m\u001b[39m\"\u001b[39m}:\n\u001b[0;32m 184\u001b[0m \u001b[39m# Use NumPy API to support order\u001b[39;00m\n\u001b[1;32m--> 185\u001b[0m array \u001b[39m=\u001b[39m numpy\u001b[39m.\u001b[39masarray(array, order\u001b[39m=\u001b[39morder, dtype\u001b[39m=\u001b[39mdtype)\n\u001b[0;32m 186\u001b[0m \u001b[39mreturn\u001b[39;00m xp\u001b[39m.\u001b[39masarray(array, copy\u001b[39m=\u001b[39mcopy)\n\u001b[0;32m 187\u001b[0m \u001b[39melse\u001b[39;00m:\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\pandas\\core\\generic.py:2070\u001b[0m, in \u001b[0;36mNDFrame.__array__\u001b[1;34m(self, dtype)\u001b[0m\n\u001b[0;32m 2069\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__array__\u001b[39m(\u001b[39mself\u001b[39m, dtype: npt\u001b[39m.\u001b[39mDTypeLike \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m np\u001b[39m.\u001b[39mndarray:\n\u001b[1;32m-> 2070\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39;49masarray(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_values, dtype\u001b[39m=\u001b[39;49mdtype)\n","\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":67,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":68,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":69,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":70,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":71,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":72,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","confusion matrix\n","[[19 9 1 18 0]\n"," [ 3 19 8 3 0]\n"," [ 0 16 28 0 0]\n"," [ 2 1 0 29 0]\n"," [14 9 0 19 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From ef478899ce8304b3220d709627ba84c4183f0993 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:35:04 +0100 Subject: [PATCH 02/14] rf --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 8db9341..25dbacd 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":67,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":68,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":69,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":70,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":71,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":72,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","confusion matrix\n","[[19 9 1 18 0]\n"," [ 3 19 8 3 0]\n"," [ 0 16 28 0 0]\n"," [ 2 1 0 29 0]\n"," [14 9 0 19 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":73,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":68,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":69,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":70,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":71,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":72,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","confusion matrix\n","[[19 9 1 18 0]\n"," [ 3 19 8 3 0]\n"," [ 0 16 28 0 0]\n"," [ 2 1 0 29 0]\n"," [14 9 0 19 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":74,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.69\n","confusion matrix\n","[[36 0 2 0 4]\n"," [ 0 19 17 0 3]\n"," [ 6 8 26 0 0]\n"," [ 0 1 0 33 3]\n"," [ 3 10 2 3 24]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From efc8727e614ee0a8f020d2bfeaca3d650dea90a7 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Fri, 24 Mar 2023 16:36:16 +0100 Subject: [PATCH 03/14] rf_m --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 25dbacd..40d3e47 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":73,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":68,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":69,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":70,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":71,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":72,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","confusion matrix\n","[[19 9 1 18 0]\n"," [ 3 19 8 3 0]\n"," [ 0 16 28 0 0]\n"," [ 2 1 0 29 0]\n"," [14 9 0 19 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":74,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.69\n","confusion matrix\n","[[36 0 2 0 4]\n"," [ 0 19 17 0 3]\n"," [ 6 8 26 0 0]\n"," [ 0 1 0 33 3]\n"," [ 3 10 2 3 24]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":75,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":76,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":77,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":78,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":79,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":80,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","confusion matrix\n","[[19 4 4 13 0]\n"," [ 4 2 33 4 0]\n"," [ 1 0 37 0 0]\n"," [ 6 0 1 33 0]\n"," [15 5 1 16 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":81,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.71\n","confusion matrix\n","[[41 0 2 0 2]\n"," [ 0 20 12 2 5]\n"," [ 3 17 26 0 1]\n"," [ 0 0 0 32 1]\n"," [ 2 6 3 2 23]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 339891069969dc1d15ad130876bf9998f8788a10 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:01:34 +0100 Subject: [PATCH 04/14] rf_done --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 40d3e47..cac1eeb 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":75,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":76,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":77,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":78,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":79,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":80,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","confusion matrix\n","[[19 4 4 13 0]\n"," [ 4 2 33 4 0]\n"," [ 1 0 37 0 0]\n"," [ 6 0 1 33 0]\n"," [15 5 1 16 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":81,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.71\n","confusion matrix\n","[[41 0 2 0 2]\n"," [ 0 20 12 2 5]\n"," [ 3 17 26 0 1]\n"," [ 0 0 0 32 1]\n"," [ 2 6 3 2 23]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":84,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":85,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":86,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":87,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":88,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":89,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.46\n","confusion matrix\n","[[19 3 6 20 0]\n"," [ 1 8 27 6 0]\n"," [ 1 1 34 0 0]\n"," [ 4 0 0 29 1]\n"," [12 4 4 18 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":90,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.67\n","confusion matrix\n","[[35 0 3 0 2]\n"," [ 0 20 12 0 11]\n"," [ 5 16 21 1 1]\n"," [ 0 1 0 32 2]\n"," [ 4 5 1 3 25]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From aedee82d3f22491f70a03a098b0860453444372d Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:52:25 +0100 Subject: [PATCH 05/14] tuning --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index cac1eeb..78aea21 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":84,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":85,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":86,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":87,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":88,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()\n"]},{"cell_type":"code","execution_count":89,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.46\n","confusion matrix\n","[[19 3 6 20 0]\n"," [ 1 8 27 6 0]\n"," [ 1 1 34 0 0]\n"," [ 4 0 0 29 1]\n"," [12 4 4 18 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":90,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.67\n","confusion matrix\n","[[35 0 3 0 2]\n"," [ 0 20 12 0 11]\n"," [ 5 16 21 1 1]\n"," [ 0 1 0 32 2]\n"," [ 4 5 1 3 25]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":97,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":85,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":86,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":87,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":92,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":98,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.51\n","Fitting 5 folds for each of 60 candidates, totalling 300 fits\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 7.1s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 7.7s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 6.1s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 6.0s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 8.3s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 58.8s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 53.4s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 54.7s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 42.9s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 1.2min\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear','poly']}]\n","optimal_params = GridSearchCV(svm.SVC(),param_grid,cv=5,scoring='accuracy',verbose=2)\n","optimal_params.fit(X_train, y_train)\n","print(optimal_params.best_params_)"]},{"cell_type":"code","execution_count":89,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.46\n","confusion matrix\n","[[19 3 6 20 0]\n"," [ 1 8 27 6 0]\n"," [ 1 1 34 0 0]\n"," [ 4 0 0 29 1]\n"," [12 4 4 18 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":90,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.67\n","confusion matrix\n","[[35 0 3 0 2]\n"," [ 0 20 12 0 11]\n"," [ 5 16 21 1 1]\n"," [ 0 1 0 32 2]\n"," [ 4 5 1 3 25]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 2732826e7626bd705e04afc68c87331c33165720 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Fri, 24 Mar 2023 21:56:54 +0100 Subject: [PATCH 06/14] hyperparam_tuning_m --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 78aea21..3b74610 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":97,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":85,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":86,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":87,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":92,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":98,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.51\n","Fitting 5 folds for each of 60 candidates, totalling 300 fits\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 7.1s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 7.7s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 6.1s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 6.0s\n","[CV] END ........................C=0.1, gamma=1, kernel=poly; total time= 8.3s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END ........................C=0.1, gamma=10, kernel=rbf; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .....................C=0.1, gamma=10, kernel=linear; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 58.8s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 53.4s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 54.7s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 42.9s\n","[CV] END .......................C=0.1, gamma=10, kernel=poly; total time= 1.2min\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END .......................C=0.1, gamma=100, kernel=rbf; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n","[CV] END ....................C=0.1, gamma=100, kernel=linear; total time= 0.0s\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear','poly']}]\n","optimal_params = GridSearchCV(svm.SVC(),param_grid,cv=5,scoring='accuracy',verbose=2)\n","optimal_params.fit(X_train, y_train)\n","print(optimal_params.best_params_)"]},{"cell_type":"code","execution_count":89,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.46\n","confusion matrix\n","[[19 3 6 20 0]\n"," [ 1 8 27 6 0]\n"," [ 1 1 34 0 0]\n"," [ 4 0 0 29 1]\n"," [12 4 4 18 2]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":90,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.67\n","confusion matrix\n","[[35 0 3 0 2]\n"," [ 0 20 12 0 11]\n"," [ 5 16 21 1 1]\n"," [ 0 1 0 32 2]\n"," [ 4 5 1 3 25]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.43\n","{'C': 10, 'gamma': 100, 'kernel': 'rbf'}\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n","param_grid_02 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['linear']}]\n","optimal_params_02 = GridSearchCV(svm.SVC(),param_grid_02,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_02.fit(X_train, y_train)\n","print(optimal_params_02.best_params_)\n","\n"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.46\n","confusion matrix\n","[[17 5 8 16 0]\n"," [ 5 7 27 3 0]\n"," [ 0 5 36 0 0]\n"," [ 4 0 0 28 3]\n"," [19 3 2 8 4]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.68\n","confusion matrix\n","[[32 0 2 0 4]\n"," [ 1 24 13 0 4]\n"," [ 9 12 19 0 0]\n"," [ 0 0 1 37 4]\n"," [ 4 8 1 1 24]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 4b2fbc149c7a9ed06318b94c048ff9195733e09c Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Sat, 25 Mar 2023 14:02:54 +0100 Subject: [PATCH 07/14] saved --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 3b74610..7edd032 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.43\n","{'C': 10, 'gamma': 100, 'kernel': 'rbf'}\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n","param_grid_02 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['linear']}]\n","optimal_params_02 = GridSearchCV(svm.SVC(),param_grid_02,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_02.fit(X_train, y_train)\n","print(optimal_params_02.best_params_)\n","\n"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.46\n","confusion matrix\n","[[17 5 8 16 0]\n"," [ 5 7 27 3 0]\n"," [ 0 5 36 0 0]\n"," [ 4 0 0 28 3]\n"," [19 3 2 8 4]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.68\n","confusion matrix\n","[[32 0 2 0 4]\n"," [ 1 24 13 0 4]\n"," [ 9 12 19 0 0]\n"," [ 0 0 1 37 4]\n"," [ 4 8 1 1 24]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n","param_grid_02 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['linear']}]\n","optimal_params_02 = GridSearchCV(svm.SVC(),param_grid_02,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_02.fit(X_train, y_train)\n","print(optimal_params_02.best_params_)\n","\n"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","confusion matrix\n","[[16 8 3 16 0]\n"," [ 4 3 30 2 0]\n"," [ 0 5 33 0 0]\n"," [ 4 0 0 37 1]\n"," [13 6 2 13 4]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.67\n","confusion matrix\n","[[33 0 1 0 7]\n"," [ 0 23 14 1 7]\n"," [ 3 9 24 1 4]\n"," [ 0 0 0 31 5]\n"," [ 1 9 1 3 23]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 2d426fbc34ce418dffa460911c5ed3dad50bee8d Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Sat, 25 Mar 2023 17:19:58 +0100 Subject: [PATCH 08/14] hyperprams tuned --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 7edd032..4494230 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n","param_grid_02 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['linear']}]\n","optimal_params_02 = GridSearchCV(svm.SVC(),param_grid_02,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_02.fit(X_train, y_train)\n","print(optimal_params_02.best_params_)\n","\n"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","confusion matrix\n","[[16 8 3 16 0]\n"," [ 4 3 30 2 0]\n"," [ 0 5 33 0 0]\n"," [ 4 0 0 37 1]\n"," [13 6 2 13 4]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.67\n","confusion matrix\n","[[33 0 1 0 7]\n"," [ 0 23 14 1 7]\n"," [ 3 9 24 1 4]\n"," [ 0 0 0 31 5]\n"," [ 1 9 1 3 23]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":84,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":85,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":86,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":87,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":88,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":89,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.41\n","rbf model accuracy: 0.46\n","poly model accuracy: 0.32\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":90,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.38\n","linear kernel accuracy with c= 1,: 0.41\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.56\n","linear kernel accuracy with c= 1000,: 0.60\n"]}],"source":["# Hyperparams tuning\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":91,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.48\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.52\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.58\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.51\n","rbf kernel accuracy with gamma= 1, c=1,: 0.48\n","rbf kernel accuracy with gamma= 1, c=10,: 0.51\n","rbf kernel accuracy with gamma= 1, c=100,: 0.56\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.61\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.50\n","rbf kernel accuracy with gamma= 10, c=10,: 0.50\n","rbf kernel accuracy with gamma= 10, c=100,: 0.49\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.55\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.18\n","rbf kernel accuracy with gamma= 100, c=1,: 0.47\n","rbf kernel accuracy with gamma= 100, c=10,: 0.48\n","rbf kernel accuracy with gamma= 100, c=100,: 0.48\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.43\n"]}],"source":["# Hyperparams tuning\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":92,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.18\n","poly kernel accuracy with degree=1,: 0.39\n","poly kernel accuracy with degree=2,: 0.34\n","poly kernel accuracy with degree=3,: 0.32\n","poly kernel accuracy with degree=4,: 0.35\n","poly kernel accuracy with degree=5,: 0.45\n","poly kernel accuracy with degree=6,: 0.46\n"]}],"source":["# Hyperparams tuning\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":93,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.43\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal params with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":94,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.65\n","confusion matrix\n","[[35 0 1 0 2]\n"," [ 0 4 28 0 16]\n"," [ 2 5 34 0 0]\n"," [ 0 0 1 30 3]\n"," [ 7 0 5 0 27]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":95,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.69\n","confusion matrix\n","[[39 0 0 0 4]\n"," [ 0 19 12 0 4]\n"," [ 4 17 22 0 0]\n"," [ 0 0 1 33 6]\n"," [ 3 5 2 3 26]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From ab223212eef7dc576e7963e0d60977e1def4bd3e Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Sun, 26 Mar 2023 19:42:08 +0200 Subject: [PATCH 09/14] hyperparam_tuning_finished_both_svm_rf --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 4494230..243d612 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":84,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV"]},{"cell_type":"code","execution_count":85,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":86,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":87,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":88,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":89,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.41\n","rbf model accuracy: 0.46\n","poly model accuracy: 0.32\n"]}],"source":["X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":90,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.38\n","linear kernel accuracy with c= 1,: 0.41\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.56\n","linear kernel accuracy with c= 1000,: 0.60\n"]}],"source":["# Hyperparams tuning\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":91,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.48\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.52\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.58\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.51\n","rbf kernel accuracy with gamma= 1, c=1,: 0.48\n","rbf kernel accuracy with gamma= 1, c=10,: 0.51\n","rbf kernel accuracy with gamma= 1, c=100,: 0.56\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.61\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.50\n","rbf kernel accuracy with gamma= 10, c=10,: 0.50\n","rbf kernel accuracy with gamma= 10, c=100,: 0.49\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.55\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.18\n","rbf kernel accuracy with gamma= 100, c=1,: 0.47\n","rbf kernel accuracy with gamma= 100, c=10,: 0.48\n","rbf kernel accuracy with gamma= 100, c=100,: 0.48\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.43\n"]}],"source":["# Hyperparams tuning\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":92,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.18\n","poly kernel accuracy with degree=1,: 0.39\n","poly kernel accuracy with degree=2,: 0.34\n","poly kernel accuracy with degree=3,: 0.32\n","poly kernel accuracy with degree=4,: 0.35\n","poly kernel accuracy with degree=5,: 0.45\n","poly kernel accuracy with degree=6,: 0.46\n"]}],"source":["# Hyperparams tuning\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":93,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.43\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal params with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":94,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.65\n","confusion matrix\n","[[35 0 1 0 2]\n"," [ 0 4 28 0 16]\n"," [ 2 5 34 0 0]\n"," [ 0 0 1 30 3]\n"," [ 7 0 5 0 27]]\n"]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":95,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.69\n","confusion matrix\n","[[39 0 0 0 4]\n"," [ 0 19 12 0 4]\n"," [ 4 17 22 0 0]\n"," [ 0 0 1 33 6]\n"," [ 3 5 2 3 26]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":120,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint"]},{"cell_type":"code","execution_count":121,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":122,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":123,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":124,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":125,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.46\n","rbf model accuracy: 0.43\n","poly model accuracy: 0.47\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":126,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.43\n","linear kernel accuracy with c= 1,: 0.46\n","linear kernel accuracy with c= 10,: 0.51\n","linear kernel accuracy with c= 100,: 0.61\n","linear kernel accuracy with c= 1000,: 0.70\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":127,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.41\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.52\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.54\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.41\n","rbf kernel accuracy with gamma= 1, c=1,: 0.48\n","rbf kernel accuracy with gamma= 1, c=10,: 0.50\n","rbf kernel accuracy with gamma= 1, c=100,: 0.55\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.59\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.39\n","rbf kernel accuracy with gamma= 10, c=1,: 0.43\n","rbf kernel accuracy with gamma= 10, c=10,: 0.46\n","rbf kernel accuracy with gamma= 10, c=100,: 0.47\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.47\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 100, c=1,: 0.40\n","rbf kernel accuracy with gamma= 100, c=10,: 0.45\n","rbf kernel accuracy with gamma= 100, c=100,: 0.41\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.42\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":128,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.17\n","poly kernel accuracy with degree=1,: 0.43\n","poly kernel accuracy with degree=2,: 0.41\n","poly kernel accuracy with degree=3,: 0.47\n","poly kernel accuracy with degree=4,: 0.47\n","poly kernel accuracy with degree=5,: 0.43\n","poly kernel accuracy with degree=6,: 0.44\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":129,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":130,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.65\n","confusion matrix\n","[[29 1 2 0 7]\n"," [ 0 7 25 0 12]\n"," [ 3 3 35 0 0]\n"," [ 0 0 0 32 5]\n"," [ 8 1 3 1 26]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":131,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":132,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"data":{"text/plain":["{'n_estimators': 1400,\n"," 'min_samples_split': 10,\n"," 'min_samples_leaf': 2,\n"," 'max_features': 'sqrt',\n"," 'max_depth': 50,\n"," 'criterion': 'gini',\n"," 'bootstrap': True}"]},"execution_count":132,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":133,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 10,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 12,\n"," 'n_estimators': 1100}"]},"execution_count":133,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":134,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[36 1 3 0 4]\n"," [ 0 16 13 0 12]\n"," [ 2 10 22 0 2]\n"," [ 0 0 0 31 4]\n"," [ 4 5 3 6 26]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":135,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.72500\n","confusion matrix\n","[[36 1 2 0 7]\n"," [ 0 17 16 0 5]\n"," [ 2 10 27 0 0]\n"," [ 0 0 1 36 1]\n"," [ 1 5 1 3 29]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From e3dd65dd3a5abc4ffb1602eab0624d5ddd464dfd Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Mon, 27 Mar 2023 17:25:55 +0200 Subject: [PATCH 10/14] checked --- assignment_2/ass2_notes.ipynb | 2 +- assignment_2/tutorial.ipynb | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 243d612..0b47682 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":120,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint"]},{"cell_type":"code","execution_count":121,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":122,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":123,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":124,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":125,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.46\n","rbf model accuracy: 0.43\n","poly model accuracy: 0.47\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":126,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.43\n","linear kernel accuracy with c= 1,: 0.46\n","linear kernel accuracy with c= 10,: 0.51\n","linear kernel accuracy with c= 100,: 0.61\n","linear kernel accuracy with c= 1000,: 0.70\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":127,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.41\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.46\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.52\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.54\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.41\n","rbf kernel accuracy with gamma= 1, c=1,: 0.48\n","rbf kernel accuracy with gamma= 1, c=10,: 0.50\n","rbf kernel accuracy with gamma= 1, c=100,: 0.55\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.59\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.39\n","rbf kernel accuracy with gamma= 10, c=1,: 0.43\n","rbf kernel accuracy with gamma= 10, c=10,: 0.46\n","rbf kernel accuracy with gamma= 10, c=100,: 0.47\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.47\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 100, c=1,: 0.40\n","rbf kernel accuracy with gamma= 100, c=10,: 0.45\n","rbf kernel accuracy with gamma= 100, c=100,: 0.41\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.42\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":128,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.17\n","poly kernel accuracy with degree=1,: 0.43\n","poly kernel accuracy with degree=2,: 0.41\n","poly kernel accuracy with degree=3,: 0.47\n","poly kernel accuracy with degree=4,: 0.47\n","poly kernel accuracy with degree=5,: 0.43\n","poly kernel accuracy with degree=6,: 0.44\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":129,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":130,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.65\n","confusion matrix\n","[[29 1 2 0 7]\n"," [ 0 7 25 0 12]\n"," [ 3 3 35 0 0]\n"," [ 0 0 0 32 5]\n"," [ 8 1 3 1 26]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":131,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":132,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"data":{"text/plain":["{'n_estimators': 1400,\n"," 'min_samples_split': 10,\n"," 'min_samples_leaf': 2,\n"," 'max_features': 'sqrt',\n"," 'max_depth': 50,\n"," 'criterion': 'gini',\n"," 'bootstrap': True}"]},"execution_count":132,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":133,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 10,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 12,\n"," 'n_estimators': 1100}"]},"execution_count":133,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":134,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[36 1 3 0 4]\n"," [ 0 16 13 0 12]\n"," [ 2 10 22 0 2]\n"," [ 0 0 0 31 4]\n"," [ 4 5 3 6 26]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":135,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.72500\n","confusion matrix\n","[[36 1 2 0 7]\n"," [ 0 17 16 0 5]\n"," [ 2 10 27 0 0]\n"," [ 0 0 1 36 1]\n"," [ 1 5 1 3 29]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.45\n","rbf model accuracy: 0.42\n","poly model accuracy: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.45\n","linear kernel accuracy with c= 1,: 0.45\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.57\n","linear kernel accuracy with c= 1000,: 0.65\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.47\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 1, c=10,: 0.47\n","rbf kernel accuracy with gamma= 1, c=100,: 0.49\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.41\n","rbf kernel accuracy with gamma= 10, c=10,: 0.43\n","rbf kernel accuracy with gamma= 10, c=100,: 0.46\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.45\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 100, c=1,: 0.42\n","rbf kernel accuracy with gamma= 100, c=10,: 0.42\n","rbf kernel accuracy with gamma= 100, c=100,: 0.44\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.45\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.19\n","poly kernel accuracy with degree=1,: 0.44\n","poly kernel accuracy with degree=2,: 0.43\n","poly kernel accuracy with degree=3,: 0.41\n","poly kernel accuracy with degree=4,: 0.41\n","poly kernel accuracy with degree=5,: 0.41\n","poly kernel accuracy with degree=6,: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.62\n","confusion matrix\n","[[33 2 3 0 3]\n"," [ 0 5 28 0 6]\n"," [ 2 1 34 0 0]\n"," [ 0 1 0 32 4]\n"," [15 8 1 1 21]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 40,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 20,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 8,\n"," 'n_estimators': 1000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[40 0 1 0 5]\n"," [ 0 18 16 0 12]\n"," [ 2 14 17 1 1]\n"," [ 0 2 1 31 4]\n"," [ 1 5 0 4 25]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.71500\n","confusion matrix\n","[[31 1 6 0 7]\n"," [ 0 25 13 0 1]\n"," [ 0 12 22 0 1]\n"," [ 0 1 0 35 1]\n"," [ 1 8 2 3 30]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} diff --git a/assignment_2/tutorial.ipynb b/assignment_2/tutorial.ipynb index 732e3b7..9e4bd98 100644 --- a/assignment_2/tutorial.ipynb +++ b/assignment_2/tutorial.ipynb @@ -2,19 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn import svm, datasets\n", "import sklearn.model_selection as model_selection\n", "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import f1_score\n" + "from sklearn.metrics import f1_score\n", + "import pandas as pd" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -187,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -331,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -351,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -372,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { From 7a9eae647177d349578e3fb221227627bee57630 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Mon, 27 Mar 2023 18:07:31 +0200 Subject: [PATCH 11/14] feature_engineerinng_03 --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 0b47682..5f1f3ef 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.45\n","rbf model accuracy: 0.42\n","poly model accuracy: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.45\n","linear kernel accuracy with c= 1,: 0.45\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.57\n","linear kernel accuracy with c= 1000,: 0.65\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.47\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 1, c=10,: 0.47\n","rbf kernel accuracy with gamma= 1, c=100,: 0.49\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.41\n","rbf kernel accuracy with gamma= 10, c=10,: 0.43\n","rbf kernel accuracy with gamma= 10, c=100,: 0.46\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.45\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 100, c=1,: 0.42\n","rbf kernel accuracy with gamma= 100, c=10,: 0.42\n","rbf kernel accuracy with gamma= 100, c=100,: 0.44\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.45\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.19\n","poly kernel accuracy with degree=1,: 0.44\n","poly kernel accuracy with degree=2,: 0.43\n","poly kernel accuracy with degree=3,: 0.41\n","poly kernel accuracy with degree=4,: 0.41\n","poly kernel accuracy with degree=5,: 0.41\n","poly kernel accuracy with degree=6,: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.62\n","confusion matrix\n","[[33 2 3 0 3]\n"," [ 0 5 28 0 6]\n"," [ 2 1 34 0 0]\n"," [ 0 1 0 32 4]\n"," [15 8 1 1 21]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 40,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 20,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 8,\n"," 'n_estimators': 1000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[40 0 1 0 5]\n"," [ 0 18 16 0 12]\n"," [ 2 14 17 1 1]\n"," [ 0 2 1 31 4]\n"," [ 1 5 0 4 25]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.71500\n","confusion matrix\n","[[31 1 6 0 7]\n"," [ 0 25 13 0 1]\n"," [ 0 12 22 0 1]\n"," [ 0 1 0 35 1]\n"," [ 1 8 2 3 30]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n","from jakteristics import las_utils, compute_features, FEATURE_NAMES"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":37,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals,_ = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[2]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],'feature03':self.feature[2],\n"," 'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":38,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 lable \n","0 0.002764 0.968252 0 \n","1 0.004335 0.914281 0 \n","2 0.001452 0.920231 0 \n","3 0.000973 0.937653 0 \n","4 0.000412 0.982707 0 \n",".. ... ... ... \n","495 0.004746 0.437249 4 \n","496 0.002242 0.476774 4 \n","497 0.011976 0.719661 4 \n","498 0.011564 0.904078 4 \n","499 0.005298 0.498481 4 \n","\n","[500 rows x 6 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.45\n","rbf model accuracy: 0.42\n","poly model accuracy: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.45\n","linear kernel accuracy with c= 1,: 0.45\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.57\n","linear kernel accuracy with c= 1000,: 0.65\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.47\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 1, c=10,: 0.47\n","rbf kernel accuracy with gamma= 1, c=100,: 0.49\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.41\n","rbf kernel accuracy with gamma= 10, c=10,: 0.43\n","rbf kernel accuracy with gamma= 10, c=100,: 0.46\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.45\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 100, c=1,: 0.42\n","rbf kernel accuracy with gamma= 100, c=10,: 0.42\n","rbf kernel accuracy with gamma= 100, c=100,: 0.44\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.45\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.19\n","poly kernel accuracy with degree=1,: 0.44\n","poly kernel accuracy with degree=2,: 0.43\n","poly kernel accuracy with degree=3,: 0.41\n","poly kernel accuracy with degree=4,: 0.41\n","poly kernel accuracy with degree=5,: 0.41\n","poly kernel accuracy with degree=6,: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.62\n","confusion matrix\n","[[33 2 3 0 3]\n"," [ 0 5 28 0 6]\n"," [ 2 1 34 0 0]\n"," [ 0 1 0 32 4]\n"," [15 8 1 1 21]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 40,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 20,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 8,\n"," 'n_estimators': 1000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[40 0 1 0 5]\n"," [ 0 18 16 0 12]\n"," [ 2 14 17 1 1]\n"," [ 0 2 1 31 4]\n"," [ 1 5 0 4 25]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.71500\n","confusion matrix\n","[[31 1 6 0 7]\n"," [ 0 25 13 0 1]\n"," [ 0 12 22 0 1]\n"," [ 0 1 0 35 1]\n"," [ 1 8 2 3 30]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 118db72d4da6d9cde8ccadc237668d042cdfb3ca Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Mon, 27 Mar 2023 23:36:31 +0200 Subject: [PATCH 12/14] feature_design_finished --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 5f1f3ef..f58c7f2 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n","from jakteristics import las_utils, compute_features, FEATURE_NAMES"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":37,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals,_ = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[2]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],'feature03':self.feature[2],\n"," 'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":38,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 lable \n","0 0.002764 0.968252 0 \n","1 0.004335 0.914281 0 \n","2 0.001452 0.920231 0 \n","3 0.000973 0.937653 0 \n","4 0.000412 0.982707 0 \n",".. ... ... ... \n","495 0.004746 0.437249 4 \n","496 0.002242 0.476774 4 \n","497 0.011976 0.719661 4 \n","498 0.011564 0.904078 4 \n","499 0.005298 0.498481 4 \n","\n","[500 rows x 6 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.45\n","rbf model accuracy: 0.42\n","poly model accuracy: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.45\n","linear kernel accuracy with c= 1,: 0.45\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.57\n","linear kernel accuracy with c= 1000,: 0.65\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.47\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 1, c=10,: 0.47\n","rbf kernel accuracy with gamma= 1, c=100,: 0.49\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.41\n","rbf kernel accuracy with gamma= 10, c=10,: 0.43\n","rbf kernel accuracy with gamma= 10, c=100,: 0.46\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.45\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 100, c=1,: 0.42\n","rbf kernel accuracy with gamma= 100, c=10,: 0.42\n","rbf kernel accuracy with gamma= 100, c=100,: 0.44\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.45\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.19\n","poly kernel accuracy with degree=1,: 0.44\n","poly kernel accuracy with degree=2,: 0.43\n","poly kernel accuracy with degree=3,: 0.41\n","poly kernel accuracy with degree=4,: 0.41\n","poly kernel accuracy with degree=5,: 0.41\n","poly kernel accuracy with degree=6,: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.62\n","confusion matrix\n","[[33 2 3 0 3]\n"," [ 0 5 28 0 6]\n"," [ 2 1 34 0 0]\n"," [ 0 1 0 32 4]\n"," [15 8 1 1 21]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 40,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 20,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 8,\n"," 'n_estimators': 1000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[40 0 1 0 5]\n"," [ 0 18 16 0 12]\n"," [ 2 14 17 1 1]\n"," [ 0 2 1 31 4]\n"," [ 1 5 0 4 25]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.71500\n","confusion matrix\n","[[31 1 6 0 7]\n"," [ 0 25 13 0 1]\n"," [ 0 12 22 0 1]\n"," [ 0 1 0 35 1]\n"," [ 1 8 2 3 30]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n","from jakteristics import las_utils, compute_features, FEATURE_NAMES"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature 01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature 02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature 03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals, eig_vector = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[1]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," # feature 04, planarity\n"," planarity = (eigvals[1]-eigvals[2]) / eigvals[0]\n"," self.feature.append(planarity)\n","\n"," # feature 05, sphericity\n"," sphericity = eigvals[2]/eigvals[0]\n"," self.feature.append(sphericity)\n","\n"," # feature 06, omnivariance\n"," omnivariance = np.cbrt((eigvals[0]*eigvals[1]*eigvals[2]))\n"," self.feature.append(omnivariance)\n","\n"," # feature 07, anisotropy\n"," anisotropy = (eigvals[0]-eigvals[2])/eigvals[0]\n"," self.feature.append(anisotropy)\n","\n"," # feature 08, eigenentropy\n"," eigenentropy = -((eigvals[0]*np.log(eigvals[0]))+(eigvals[1]*np.log(eigvals[1]))+(eigvals[2]*np.log(eigvals[2])))\n"," self.feature.append(eigenentropy)\n","\n"," # feature 09, sum of lamdas\n"," sum_of_lamda = eigvals[0]+eigvals[1]+eigvals[2]\n"," self.feature.append(sum_of_lamda)\n","\n"," # feature 10, change of curvature\n"," change_curv = eigvals[2] / sum_of_lamda\n"," self.feature.append(change_curv)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],\n"," 'feature03':self.feature[2],\n"," 'feature04':self.feature[3],\n"," 'feature05':self.feature[4],\n"," 'feature06':self.feature[5],\n"," 'feature07':self.feature[6],\n"," 'feature08':self.feature[7],\n"," 'feature09':self.feature[8],\n"," 'feature10':self.feature[9],\n"," 'lable':self.label}"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 feature04 feature05 feature06 feature07 \\\n","0 0.002764 0.781830 0.186422 0.031748 1.721254 0.968252 \n","1 0.004335 0.640157 0.274124 0.085719 2.295543 0.914281 \n","2 0.001452 0.357268 0.562963 0.079769 8.797317 0.920231 \n","3 0.000973 0.403448 0.534206 0.062347 9.323912 0.937653 \n","4 0.000412 0.444323 0.538384 0.017293 7.279276 0.982707 \n",".. ... ... ... ... ... ... \n","495 0.004746 0.178385 0.258864 0.562751 2.194086 0.437249 \n","496 0.002242 0.242329 0.234446 0.523226 6.097446 0.476774 \n","497 0.011976 0.641592 0.078069 0.280339 1.869604 0.719661 \n","498 0.011564 0.261010 0.643068 0.095922 3.095754 0.904078 \n","499 0.005298 0.322539 0.175942 0.501519 2.325635 0.498481 \n","\n"," feature08 feature09 feature10 lable \n","0 -20.847814 11.286457 0.025400 0 \n","1 -16.828596 10.580983 0.059298 0 \n","2 -117.582880 40.790185 0.046310 0 \n","3 -140.853056 46.337350 0.037583 0 \n","4 -176.716771 53.857375 0.010994 0 \n",".. ... ... ... ... \n","495 -5.679643 6.765476 0.236017 4 \n","496 -35.507294 18.932022 0.229395 4 \n","497 -6.258859 6.590341 0.171069 4 \n","498 -24.265297 13.725520 0.052276 4 \n","499 -6.709776 7.262246 0.230162 4 \n","\n","[500 rows x 13 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.90\n","rbf model accuracy: 0.21\n","poly model accuracy: 0.18\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.89\n","linear kernel accuracy with c= 1,: 0.90\n","linear kernel accuracy with c= 10,: 0.89\n","linear kernel accuracy with c= 100,: 0.90\n","linear kernel accuracy with c= 1000,: 0.91\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.42\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.70\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.69\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.69\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.69\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.26\n","rbf kernel accuracy with gamma= 1, c=1,: 0.48\n","rbf kernel accuracy with gamma= 1, c=10,: 0.50\n","rbf kernel accuracy with gamma= 1, c=100,: 0.50\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.50\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 10, c=1,: 0.32\n","rbf kernel accuracy with gamma= 10, c=10,: 0.33\n","rbf kernel accuracy with gamma= 10, c=100,: 0.33\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.33\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 100, c=1,: 0.18\n","rbf kernel accuracy with gamma= 100, c=10,: 0.20\n","rbf kernel accuracy with gamma= 100, c=100,: 0.20\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.17\n","poly kernel accuracy with degree=1,: 0.22\n","poly kernel accuracy with degree=2,: 0.19\n","poly kernel accuracy with degree=3,: 0.18\n","poly kernel accuracy with degree=4,: 0.18\n","poly kernel accuracy with degree=5,: 0.17\n","poly kernel accuracy with degree=6,: 0.17\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.92\n","confusion matrix\n","[[31 0 3 3 0]\n"," [ 0 35 0 0 0]\n"," [ 0 1 39 0 0]\n"," [ 2 1 1 40 1]\n"," [ 2 2 0 1 38]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 1000,\n"," 'min_samples_split': 10,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 110,\n"," 'criterion': 'entropy',\n"," 'bootstrap': True}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 256 candidates, totalling 768 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 40,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 1,\n"," 'min_samples_split': 4,\n"," 'n_estimators': 2000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [40,50,60,70],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [1,2,3,4],\n"," 'min_samples_split': [1,2,3,4],\n"," 'n_estimators': [1700, 1800, 1900, 2000]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":34,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.93500\n","confusion matrix\n","[[39 0 3 0 0]\n"," [ 0 40 1 0 0]\n"," [ 0 1 44 0 0]\n"," [ 0 1 0 34 3]\n"," [ 3 1 0 0 30]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":44,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.96000\n","confusion matrix\n","[[43 0 2 0 0]\n"," [ 0 40 1 0 0]\n"," [ 0 0 42 0 0]\n"," [ 0 0 0 34 1]\n"," [ 2 1 0 1 33]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=2000,criterion='entropy',\n"," max_depth=40,min_samples_split=4,\n"," min_samples_leaf=1,max_features='sqrt',\n"," bootstrap=True)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 932c2e623ff8c62f24570714f1a5ec31cab96cf7 Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Tue, 28 Mar 2023 12:39:59 +0200 Subject: [PATCH 13/14] hyperparams_improved --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index f58c7f2..d7f9837 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n","from jakteristics import las_utils, compute_features, FEATURE_NAMES"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature 01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature 02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature 03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals, eig_vector = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[1]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," # feature 04, planarity\n"," planarity = (eigvals[1]-eigvals[2]) / eigvals[0]\n"," self.feature.append(planarity)\n","\n"," # feature 05, sphericity\n"," sphericity = eigvals[2]/eigvals[0]\n"," self.feature.append(sphericity)\n","\n"," # feature 06, omnivariance\n"," omnivariance = np.cbrt((eigvals[0]*eigvals[1]*eigvals[2]))\n"," self.feature.append(omnivariance)\n","\n"," # feature 07, anisotropy\n"," anisotropy = (eigvals[0]-eigvals[2])/eigvals[0]\n"," self.feature.append(anisotropy)\n","\n"," # feature 08, eigenentropy\n"," eigenentropy = -((eigvals[0]*np.log(eigvals[0]))+(eigvals[1]*np.log(eigvals[1]))+(eigvals[2]*np.log(eigvals[2])))\n"," self.feature.append(eigenentropy)\n","\n"," # feature 09, sum of lamdas\n"," sum_of_lamda = eigvals[0]+eigvals[1]+eigvals[2]\n"," self.feature.append(sum_of_lamda)\n","\n"," # feature 10, change of curvature\n"," change_curv = eigvals[2] / sum_of_lamda\n"," self.feature.append(change_curv)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],\n"," 'feature03':self.feature[2],\n"," 'feature04':self.feature[3],\n"," 'feature05':self.feature[4],\n"," 'feature06':self.feature[5],\n"," 'feature07':self.feature[6],\n"," 'feature08':self.feature[7],\n"," 'feature09':self.feature[8],\n"," 'feature10':self.feature[9],\n"," 'lable':self.label}"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 feature04 feature05 feature06 feature07 \\\n","0 0.002764 0.781830 0.186422 0.031748 1.721254 0.968252 \n","1 0.004335 0.640157 0.274124 0.085719 2.295543 0.914281 \n","2 0.001452 0.357268 0.562963 0.079769 8.797317 0.920231 \n","3 0.000973 0.403448 0.534206 0.062347 9.323912 0.937653 \n","4 0.000412 0.444323 0.538384 0.017293 7.279276 0.982707 \n",".. ... ... ... ... ... ... \n","495 0.004746 0.178385 0.258864 0.562751 2.194086 0.437249 \n","496 0.002242 0.242329 0.234446 0.523226 6.097446 0.476774 \n","497 0.011976 0.641592 0.078069 0.280339 1.869604 0.719661 \n","498 0.011564 0.261010 0.643068 0.095922 3.095754 0.904078 \n","499 0.005298 0.322539 0.175942 0.501519 2.325635 0.498481 \n","\n"," feature08 feature09 feature10 lable \n","0 -20.847814 11.286457 0.025400 0 \n","1 -16.828596 10.580983 0.059298 0 \n","2 -117.582880 40.790185 0.046310 0 \n","3 -140.853056 46.337350 0.037583 0 \n","4 -176.716771 53.857375 0.010994 0 \n",".. ... ... ... ... \n","495 -5.679643 6.765476 0.236017 4 \n","496 -35.507294 18.932022 0.229395 4 \n","497 -6.258859 6.590341 0.171069 4 \n","498 -24.265297 13.725520 0.052276 4 \n","499 -6.709776 7.262246 0.230162 4 \n","\n","[500 rows x 13 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.90\n","rbf model accuracy: 0.21\n","poly model accuracy: 0.18\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.89\n","linear kernel accuracy with c= 1,: 0.90\n","linear kernel accuracy with c= 10,: 0.89\n","linear kernel accuracy with c= 100,: 0.90\n","linear kernel accuracy with c= 1000,: 0.91\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.42\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.70\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.69\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.69\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.69\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.26\n","rbf kernel accuracy with gamma= 1, c=1,: 0.48\n","rbf kernel accuracy with gamma= 1, c=10,: 0.50\n","rbf kernel accuracy with gamma= 1, c=100,: 0.50\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.50\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 10, c=1,: 0.32\n","rbf kernel accuracy with gamma= 10, c=10,: 0.33\n","rbf kernel accuracy with gamma= 10, c=100,: 0.33\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.33\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 100, c=1,: 0.18\n","rbf kernel accuracy with gamma= 100, c=10,: 0.20\n","rbf kernel accuracy with gamma= 100, c=100,: 0.20\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.17\n","poly kernel accuracy with degree=1,: 0.22\n","poly kernel accuracy with degree=2,: 0.19\n","poly kernel accuracy with degree=3,: 0.18\n","poly kernel accuracy with degree=4,: 0.18\n","poly kernel accuracy with degree=5,: 0.17\n","poly kernel accuracy with degree=6,: 0.17\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.47\n","{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.92\n","confusion matrix\n","[[31 0 3 3 0]\n"," [ 0 35 0 0 0]\n"," [ 0 1 39 0 0]\n"," [ 2 1 1 40 1]\n"," [ 2 2 0 1 38]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 1000,\n"," 'min_samples_split': 10,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 110,\n"," 'criterion': 'entropy',\n"," 'bootstrap': True}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 256 candidates, totalling 768 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 40,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 1,\n"," 'min_samples_split': 4,\n"," 'n_estimators': 2000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [40,50,60,70],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [1,2,3,4],\n"," 'min_samples_split': [1,2,3,4],\n"," 'n_estimators': [1700, 1800, 1900, 2000]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":34,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.93500\n","confusion matrix\n","[[39 0 3 0 0]\n"," [ 0 40 1 0 0]\n"," [ 0 1 44 0 0]\n"," [ 0 1 0 34 3]\n"," [ 3 1 0 0 30]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":44,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.96000\n","confusion matrix\n","[[43 0 2 0 0]\n"," [ 0 40 1 0 0]\n"," [ 0 0 42 0 0]\n"," [ 0 0 0 34 1]\n"," [ 2 1 0 1 33]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=2000,criterion='entropy',\n"," max_depth=40,min_samples_split=4,\n"," min_samples_leaf=1,max_features='sqrt',\n"," bootstrap=True)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature 01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature 02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature 03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals, eig_vector = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[1]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," # feature 04, planarity\n"," planarity = (eigvals[1]-eigvals[2]) / eigvals[0]\n"," self.feature.append(planarity)\n","\n"," # feature 05, sphericity\n"," sphericity = eigvals[2]/eigvals[0]\n"," self.feature.append(sphericity)\n","\n"," # feature 06, omnivariance\n"," omnivariance = np.cbrt((eigvals[0]*eigvals[1]*eigvals[2]))\n"," self.feature.append(omnivariance)\n","\n"," # feature 07, anisotropy\n"," anisotropy = (eigvals[0]-eigvals[2])/eigvals[0]\n"," self.feature.append(anisotropy)\n","\n"," # feature 08, eigenentropy\n"," eigenentropy = -((eigvals[0]*np.log(eigvals[0]))+(eigvals[1]*np.log(eigvals[1]))+(eigvals[2]*np.log(eigvals[2])))\n"," self.feature.append(eigenentropy)\n","\n"," # feature 09, sum of lamdas\n"," sum_of_lamda = eigvals[0]+eigvals[1]+eigvals[2]\n"," self.feature.append(sum_of_lamda)\n","\n"," # feature 10, change of curvature\n"," change_curv = eigvals[2] / sum_of_lamda\n"," self.feature.append(change_curv)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],\n"," 'feature03':self.feature[2],\n"," 'feature04':self.feature[3],\n"," 'feature05':self.feature[4],\n"," 'feature06':self.feature[5],\n"," 'feature07':self.feature[6],\n"," 'feature08':self.feature[7],\n"," 'feature09':self.feature[8],\n"," 'feature10':self.feature[9],\n"," 'lable':self.label}"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 feature04 feature05 feature06 feature07 \\\n","0 0.002764 0.781830 0.186422 0.031748 1.721254 0.968252 \n","1 0.004335 0.640157 0.274124 0.085719 2.295543 0.914281 \n","2 0.001452 0.357268 0.562963 0.079769 8.797317 0.920231 \n","3 0.000973 0.403448 0.534206 0.062347 9.323912 0.937653 \n","4 0.000412 0.444323 0.538384 0.017293 7.279276 0.982707 \n",".. ... ... ... ... ... ... \n","495 0.004746 0.178385 0.258864 0.562751 2.194086 0.437249 \n","496 0.002242 0.242329 0.234446 0.523226 6.097446 0.476774 \n","497 0.011976 0.641592 0.078069 0.280339 1.869604 0.719661 \n","498 0.011564 0.261010 0.643068 0.095922 3.095754 0.904078 \n","499 0.005298 0.322539 0.175942 0.501519 2.325635 0.498481 \n","\n"," feature08 feature09 feature10 lable \n","0 -20.847814 11.286457 0.025400 0 \n","1 -16.828596 10.580983 0.059298 0 \n","2 -117.582880 40.790185 0.046310 0 \n","3 -140.853056 46.337350 0.037583 0 \n","4 -176.716771 53.857375 0.010994 0 \n",".. ... ... ... ... \n","495 -5.679643 6.765476 0.236017 4 \n","496 -35.507294 18.932022 0.229395 4 \n","497 -6.258859 6.590341 0.171069 4 \n","498 -24.265297 13.725520 0.052276 4 \n","499 -6.709776 7.262246 0.230162 4 \n","\n","[500 rows x 13 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.91\n","rbf model accuracy: 0.29\n","poly model accuracy: 0.23\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.86\n","linear kernel accuracy with c= 1,: 0.91\n","linear kernel accuracy with c= 10,: 0.92\n","linear kernel accuracy with c= 100,: 0.91\n","linear kernel accuracy with c= 1000,: 0.91\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.49\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.81\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.83\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.83\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.82\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.38\n","rbf kernel accuracy with gamma= 1, c=1,: 0.61\n","rbf kernel accuracy with gamma= 1, c=10,: 0.63\n","rbf kernel accuracy with gamma= 1, c=100,: 0.63\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.63\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.18\n","rbf kernel accuracy with gamma= 10, c=1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=10,: 0.34\n","rbf kernel accuracy with gamma= 10, c=100,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.34\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.18\n","rbf kernel accuracy with gamma= 100, c=1,: 0.24\n","rbf kernel accuracy with gamma= 100, c=10,: 0.26\n","rbf kernel accuracy with gamma= 100, c=100,: 0.26\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.26\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.18\n","poly kernel accuracy with degree=1,: 0.26\n","poly kernel accuracy with degree=2,: 0.23\n","poly kernel accuracy with degree=3,: 0.23\n","poly kernel accuracy with degree=4,: 0.23\n","poly kernel accuracy with degree=5,: 0.22\n","poly kernel accuracy with degree=6,: 0.22\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.31\n","{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.92\n","confusion matrix\n","[[38 0 0 4 2]\n"," [ 0 42 0 0 0]\n"," [ 0 2 37 0 0]\n"," [ 0 2 0 36 1]\n"," [ 1 0 0 4 31]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_depth': None,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 256 candidates, totalling 768 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 40,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 1,\n"," 'min_samples_split': 1,\n"," 'n_estimators': 1800}"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [40,50,60,70],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [1,2,3,4],\n"," 'min_samples_split': [1,2,3,4],\n"," 'n_estimators': [1700, 1800, 1900, 2000]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.91500\n","confusion matrix\n","[[46 0 2 0 0]\n"," [ 0 34 0 0 0]\n"," [ 0 0 37 1 0]\n"," [ 3 1 4 31 2]\n"," [ 2 1 0 1 35]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.95500\n","confusion matrix\n","[[39 0 2 0 1]\n"," [ 0 44 0 0 0]\n"," [ 0 0 38 1 0]\n"," [ 0 1 0 36 1]\n"," [ 1 1 0 1 34]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=2000,criterion='entropy',\n"," max_depth=40,min_samples_split=4,\n"," min_samples_leaf=1,max_features='sqrt',\n"," bootstrap=True)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} From 01339250296833114592943b6f8102d25800182f Mon Sep 17 00:00:00 2001 From: ZhuoranJia <92588550+ZhuoranJia@users.noreply.github.com> Date: Tue, 28 Mar 2023 18:59:40 +0200 Subject: [PATCH 14/14] random_state_101 --- assignment_2/ass2_notes.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index d7f9837..5ac108d 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature 01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature 02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature 03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals, eig_vector = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[1]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," # feature 04, planarity\n"," planarity = (eigvals[1]-eigvals[2]) / eigvals[0]\n"," self.feature.append(planarity)\n","\n"," # feature 05, sphericity\n"," sphericity = eigvals[2]/eigvals[0]\n"," self.feature.append(sphericity)\n","\n"," # feature 06, omnivariance\n"," omnivariance = np.cbrt((eigvals[0]*eigvals[1]*eigvals[2]))\n"," self.feature.append(omnivariance)\n","\n"," # feature 07, anisotropy\n"," anisotropy = (eigvals[0]-eigvals[2])/eigvals[0]\n"," self.feature.append(anisotropy)\n","\n"," # feature 08, eigenentropy\n"," eigenentropy = -((eigvals[0]*np.log(eigvals[0]))+(eigvals[1]*np.log(eigvals[1]))+(eigvals[2]*np.log(eigvals[2])))\n"," self.feature.append(eigenentropy)\n","\n"," # feature 09, sum of lamdas\n"," sum_of_lamda = eigvals[0]+eigvals[1]+eigvals[2]\n"," self.feature.append(sum_of_lamda)\n","\n"," # feature 10, change of curvature\n"," change_curv = eigvals[2] / sum_of_lamda\n"," self.feature.append(change_curv)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],\n"," 'feature03':self.feature[2],\n"," 'feature04':self.feature[3],\n"," 'feature05':self.feature[4],\n"," 'feature06':self.feature[5],\n"," 'feature07':self.feature[6],\n"," 'feature08':self.feature[7],\n"," 'feature09':self.feature[8],\n"," 'feature10':self.feature[9],\n"," 'lable':self.label}"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 feature04 feature05 feature06 feature07 \\\n","0 0.002764 0.781830 0.186422 0.031748 1.721254 0.968252 \n","1 0.004335 0.640157 0.274124 0.085719 2.295543 0.914281 \n","2 0.001452 0.357268 0.562963 0.079769 8.797317 0.920231 \n","3 0.000973 0.403448 0.534206 0.062347 9.323912 0.937653 \n","4 0.000412 0.444323 0.538384 0.017293 7.279276 0.982707 \n",".. ... ... ... ... ... ... \n","495 0.004746 0.178385 0.258864 0.562751 2.194086 0.437249 \n","496 0.002242 0.242329 0.234446 0.523226 6.097446 0.476774 \n","497 0.011976 0.641592 0.078069 0.280339 1.869604 0.719661 \n","498 0.011564 0.261010 0.643068 0.095922 3.095754 0.904078 \n","499 0.005298 0.322539 0.175942 0.501519 2.325635 0.498481 \n","\n"," feature08 feature09 feature10 lable \n","0 -20.847814 11.286457 0.025400 0 \n","1 -16.828596 10.580983 0.059298 0 \n","2 -117.582880 40.790185 0.046310 0 \n","3 -140.853056 46.337350 0.037583 0 \n","4 -176.716771 53.857375 0.010994 0 \n",".. ... ... ... ... \n","495 -5.679643 6.765476 0.236017 4 \n","496 -35.507294 18.932022 0.229395 4 \n","497 -6.258859 6.590341 0.171069 4 \n","498 -24.265297 13.725520 0.052276 4 \n","499 -6.709776 7.262246 0.230162 4 \n","\n","[500 rows x 13 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.91\n","rbf model accuracy: 0.29\n","poly model accuracy: 0.23\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.86\n","linear kernel accuracy with c= 1,: 0.91\n","linear kernel accuracy with c= 10,: 0.92\n","linear kernel accuracy with c= 100,: 0.91\n","linear kernel accuracy with c= 1000,: 0.91\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.49\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.81\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.83\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.83\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.82\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.38\n","rbf kernel accuracy with gamma= 1, c=1,: 0.61\n","rbf kernel accuracy with gamma= 1, c=10,: 0.63\n","rbf kernel accuracy with gamma= 1, c=100,: 0.63\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.63\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.18\n","rbf kernel accuracy with gamma= 10, c=1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=10,: 0.34\n","rbf kernel accuracy with gamma= 10, c=100,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.34\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.18\n","rbf kernel accuracy with gamma= 100, c=1,: 0.24\n","rbf kernel accuracy with gamma= 100, c=10,: 0.26\n","rbf kernel accuracy with gamma= 100, c=100,: 0.26\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.26\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.18\n","poly kernel accuracy with degree=1,: 0.26\n","poly kernel accuracy with degree=2,: 0.23\n","poly kernel accuracy with degree=3,: 0.23\n","poly kernel accuracy with degree=4,: 0.23\n","poly kernel accuracy with degree=5,: 0.22\n","poly kernel accuracy with degree=6,: 0.22\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.31\n","{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.92\n","confusion matrix\n","[[38 0 0 4 2]\n"," [ 0 42 0 0 0]\n"," [ 0 2 37 0 0]\n"," [ 0 2 0 36 1]\n"," [ 1 0 0 4 31]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_depth': None,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 256 candidates, totalling 768 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 40,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 1,\n"," 'min_samples_split': 1,\n"," 'n_estimators': 1800}"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [40,50,60,70],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [1,2,3,4],\n"," 'min_samples_split': [1,2,3,4],\n"," 'n_estimators': [1700, 1800, 1900, 2000]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.91500\n","confusion matrix\n","[[46 0 2 0 0]\n"," [ 0 34 0 0 0]\n"," [ 0 0 37 1 0]\n"," [ 3 1 4 31 2]\n"," [ 2 1 0 1 35]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.95500\n","confusion matrix\n","[[39 0 2 0 1]\n"," [ 0 44 0 0 0]\n"," [ 0 0 38 1 0]\n"," [ 0 1 0 36 1]\n"," [ 1 1 0 1 34]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=2000,criterion='entropy',\n"," max_depth=40,min_samples_split=4,\n"," min_samples_leaf=1,max_features='sqrt',\n"," bootstrap=True)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":102,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n"]},{"cell_type":"code","execution_count":103,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":104,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature 01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature 02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature 03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals, eig_vector = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[1]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," # feature 04, planarity\n"," planarity = (eigvals[1]-eigvals[2]) / eigvals[0]\n"," self.feature.append(planarity)\n","\n"," # feature 05, sphericity\n"," sphericity = eigvals[2]/eigvals[0]\n"," self.feature.append(sphericity)\n","\n"," # feature 06, omnivariance\n"," omnivariance = np.cbrt((eigvals[0]*eigvals[1]*eigvals[2]))\n"," self.feature.append(omnivariance)\n","\n"," # feature 07, anisotropy\n"," anisotropy = (eigvals[0]-eigvals[2])/eigvals[0]\n"," self.feature.append(anisotropy)\n","\n"," # feature 08, eigenentropy\n"," eigenentropy = -((eigvals[0]*np.log(eigvals[0]))+(eigvals[1]*np.log(eigvals[1]))+(eigvals[2]*np.log(eigvals[2])))\n"," self.feature.append(eigenentropy)\n","\n"," # feature 09, sum of lamdas\n"," sum_of_lamda = eigvals[0]+eigvals[1]+eigvals[2]\n"," self.feature.append(sum_of_lamda)\n","\n"," # feature 10, change of curvature\n"," change_curv = eigvals[2] / sum_of_lamda\n"," self.feature.append(change_curv)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],\n"," 'feature03':self.feature[2],\n"," 'feature04':self.feature[3],\n"," 'feature05':self.feature[4],\n"," 'feature06':self.feature[5],\n"," 'feature07':self.feature[6],\n"," 'feature08':self.feature[7],\n"," 'feature09':self.feature[8],\n"," 'feature10':self.feature[9],\n"," 'lable':self.label}"]},{"cell_type":"code","execution_count":105,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 feature04 feature05 feature06 feature07 \\\n","0 0.002764 0.781830 0.186422 0.031748 1.721254 0.968252 \n","1 0.004335 0.640157 0.274124 0.085719 2.295543 0.914281 \n","2 0.001452 0.357268 0.562963 0.079769 8.797317 0.920231 \n","3 0.000973 0.403448 0.534206 0.062347 9.323912 0.937653 \n","4 0.000412 0.444323 0.538384 0.017293 7.279276 0.982707 \n",".. ... ... ... ... ... ... \n","495 0.004746 0.178385 0.258864 0.562751 2.194086 0.437249 \n","496 0.002242 0.242329 0.234446 0.523226 6.097446 0.476774 \n","497 0.011976 0.641592 0.078069 0.280339 1.869604 0.719661 \n","498 0.011564 0.261010 0.643068 0.095922 3.095754 0.904078 \n","499 0.005298 0.322539 0.175942 0.501519 2.325635 0.498481 \n","\n"," feature08 feature09 feature10 lable \n","0 -20.847814 11.286457 0.025400 0 \n","1 -16.828596 10.580983 0.059298 0 \n","2 -117.582880 40.790185 0.046310 0 \n","3 -140.853056 46.337350 0.037583 0 \n","4 -176.716771 53.857375 0.010994 0 \n",".. ... ... ... ... \n","495 -5.679643 6.765476 0.236017 4 \n","496 -35.507294 18.932022 0.229395 4 \n","497 -6.258859 6.590341 0.171069 4 \n","498 -24.265297 13.725520 0.052276 4 \n","499 -6.709776 7.262246 0.230162 4 \n","\n","[500 rows x 13 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":106,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":107,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.94\n","rbf model accuracy: 0.27\n","poly model accuracy: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":108,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.91\n","linear kernel accuracy with c= 1,: 0.94\n","linear kernel accuracy with c= 10,: 0.93\n","linear kernel accuracy with c= 100,: 0.94\n","linear kernel accuracy with c= 1000,: 0.93\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":109,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.74\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.74\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.73\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.73\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.23\n","rbf kernel accuracy with gamma= 1, c=1,: 0.55\n","rbf kernel accuracy with gamma= 1, c=10,: 0.58\n","rbf kernel accuracy with gamma= 1, c=100,: 0.57\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.57\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 10, c=1,: 0.29\n","rbf kernel accuracy with gamma= 10, c=10,: 0.29\n","rbf kernel accuracy with gamma= 10, c=100,: 0.29\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.29\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 100, c=1,: 0.20\n","rbf kernel accuracy with gamma= 100, c=10,: 0.20\n","rbf kernel accuracy with gamma= 100, c=100,: 0.20\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":110,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.17\n","poly kernel accuracy with degree=1,: 0.18\n","poly kernel accuracy with degree=2,: 0.20\n","poly kernel accuracy with degree=3,: 0.20\n","poly kernel accuracy with degree=4,: 0.20\n","poly kernel accuracy with degree=5,: 0.20\n","poly kernel accuracy with degree=6,: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":111,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.27\n","{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":112,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.94\n","confusion matrix\n","[[33 0 1 3 3]\n"," [ 0 42 0 0 0]\n"," [ 0 1 46 0 1]\n"," [ 0 1 0 30 3]\n"," [ 0 0 0 0 36]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n"," clf = svm.SVC(kernel='linear',C=1)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":113,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":114,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"data":{"text/plain":["{'n_estimators': 1200,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_depth': 20,\n"," 'criterion': 'gini',\n"," 'bootstrap': True}"]},"execution_count":114,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=101,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":115,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 256 candidates, totalling 768 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 40,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 2,\n"," 'min_samples_split': 1,\n"," 'n_estimators': 1800}"]},"execution_count":115,"metadata":{},"output_type":"execute_result"}],"source":["new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [40,50,60,70],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [1,2,3,4],\n"," 'min_samples_split': [1,2,3,4],\n"," 'n_estimators': [1700, 1800, 1900, 2000]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":116,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.96000\n","confusion matrix\n","[[38 0 1 0 1]\n"," [ 0 42 0 0 0]\n"," [ 0 1 46 1 0]\n"," [ 0 0 0 31 3]\n"," [ 1 0 0 0 35]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":117,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.96500\n","confusion matrix\n","[[38 0 2 0 0]\n"," [ 0 42 0 0 0]\n"," [ 0 1 46 1 0]\n"," [ 0 0 0 32 2]\n"," [ 1 0 0 0 35]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n"," rf = RandomForestClassifier(n_estimators=2000,criterion='entropy',\n"," max_depth=40,min_samples_split=4,\n"," min_samples_leaf=1,max_features='sqrt',\n"," bootstrap=True)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2}