diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 7b9e343..5ac108d 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":45,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":46,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":47,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'name':self.cloud_name,'points':self.points,'features':self.feature,'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":48,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID name points \\\n","0 0 000 [[20.06999969482422, 499.9599914550781, 17.450... \n","1 1 001 [[373.3099975585938, 404.2200012207031, 7.1300... \n","2 2 002 [[65.91000366210938, 326.8599853515625, 12.710... \n","3 3 003 [[109.5, 391.5599975585938, 12.69999980926514]... \n","4 4 004 [[126.4300003051758, 234.9400024414062, 6.8800... \n",".. ... ... ... \n","495 495 495 [[129.3399963378906, 12.97000026702881, 8.2200... \n","496 496 496 [[440.1499938964844, 35.84999847412109, 6.1199... \n","497 497 497 [[158.1799926757812, 130.6999969482422, 4.9899... \n","498 498 498 [[498.3299865722656, 93.45999908447266, 9.4200... \n","499 499 499 [[86.16000366210938, 132.1300048828125, 7.0399... \n","\n"," features lable \n","0 [19.72999954223633, 0.002763957987838585] 0 \n","1 [8.470000267028809, 0.004335260115606936] 0 \n","2 [15.56999969482422, 0.00145218945487042] 0 \n","3 [16.46999931335449, 0.0009728572818367545] 0 \n","4 [9.75, 0.0004123711340206186] 0 \n",".. ... ... \n","495 [11.53999996185303, 0.004746257758305951] 4 \n","496 [15.17000007629395, 0.002241817366611867] 4 \n","497 [11.10999965667725, 0.011976047904191617] 4 \n","498 [18.90999984741211, 0.011563599798893917] 4 \n","499 [11.89999961853027, 0.005298013245033113] 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":52,"metadata":{},"outputs":[{"data":{"text/plain":["0 0\n","1 0\n","2 0\n","3 0\n","4 0\n"," ..\n","495 4\n","496 4\n","497 4\n","498 4\n","499 4\n","Name: lable, Length: 500, dtype: int64"]},"execution_count":52,"metadata":{},"output_type":"execute_result"}],"source":["X = pt_cloud_df['features']\n","y = pt_cloud_df['lable'].copy()\n","y"]},{"cell_type":"code","execution_count":50,"metadata":{},"outputs":[{"ename":"ValueError","evalue":"setting an array element with a sequence.","output_type":"error","traceback":["\u001b[1;31m---------------------------------------------------------------------------\u001b[0m","\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[1;31mTypeError\u001b[0m: float() argument must be a string or a real number, not 'list'","\nThe above exception was the direct cause of the following exception:\n","\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)","Cell \u001b[1;32mIn[50], line 12\u001b[0m\n\u001b[0;32m 9\u001b[0m conf_matrix \u001b[39m=\u001b[39m confusion_matrix(y_test, y_preds)\n\u001b[0;32m 10\u001b[0m \u001b[39mprint\u001b[39m(conf_matrix)\n\u001b[1;32m---> 12\u001b[0m SVM_classification(X,y)\n","Cell \u001b[1;32mIn[50], line 4\u001b[0m, in \u001b[0;36mSVM_classification\u001b[1;34m(X, y)\u001b[0m\n\u001b[0;32m 2\u001b[0m X_train, X_test, y_train, y_test \u001b[39m=\u001b[39m train_test_split(X,y,test_size\u001b[39m=\u001b[39m\u001b[39m0.4\u001b[39m)\n\u001b[0;32m 3\u001b[0m clf \u001b[39m=\u001b[39m svm\u001b[39m.\u001b[39mSVC()\n\u001b[1;32m----> 4\u001b[0m clf\u001b[39m.\u001b[39;49mfit(X_train,y_train)\n\u001b[0;32m 5\u001b[0m y_preds \u001b[39m=\u001b[39m clf\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m 6\u001b[0m acc \u001b[39m=\u001b[39m accuracy_score(y_test,y_preds)\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\svm\\_base.py:192\u001b[0m, in \u001b[0;36mBaseLibSVM.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 190\u001b[0m check_consistent_length(X, y)\n\u001b[0;32m 191\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 192\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m 193\u001b[0m X,\n\u001b[0;32m 194\u001b[0m y,\n\u001b[0;32m 195\u001b[0m dtype\u001b[39m=\u001b[39;49mnp\u001b[39m.\u001b[39;49mfloat64,\n\u001b[0;32m 196\u001b[0m order\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mC\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 197\u001b[0m accept_sparse\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcsr\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 198\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 199\u001b[0m )\n\u001b[0;32m 201\u001b[0m y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_targets(y)\n\u001b[0;32m 203\u001b[0m sample_weight \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39masarray(\n\u001b[0;32m 204\u001b[0m [] \u001b[39mif\u001b[39;00m sample_weight \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m sample_weight, dtype\u001b[39m=\u001b[39mnp\u001b[39m.\u001b[39mfloat64\n\u001b[0;32m 205\u001b[0m )\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\base.py:565\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m 563\u001b[0m y \u001b[39m=\u001b[39m check_array(y, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params)\n\u001b[0;32m 564\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 565\u001b[0m X, y \u001b[39m=\u001b[39m check_X_y(X, y, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mcheck_params)\n\u001b[0;32m 566\u001b[0m out \u001b[39m=\u001b[39m X, y\n\u001b[0;32m 568\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m no_val_X \u001b[39mand\u001b[39;00m check_params\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mensure_2d\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m):\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\validation.py:1106\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m 1101\u001b[0m estimator_name \u001b[39m=\u001b[39m _check_estimator_name(estimator)\n\u001b[0;32m 1102\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 1103\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m requires y to be passed, but the target y is None\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 1104\u001b[0m )\n\u001b[1;32m-> 1106\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[0;32m 1107\u001b[0m X,\n\u001b[0;32m 1108\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[0;32m 1109\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49maccept_large_sparse,\n\u001b[0;32m 1110\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[0;32m 1111\u001b[0m order\u001b[39m=\u001b[39;49morder,\n\u001b[0;32m 1112\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[0;32m 1113\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[0;32m 1114\u001b[0m ensure_2d\u001b[39m=\u001b[39;49mensure_2d,\n\u001b[0;32m 1115\u001b[0m allow_nd\u001b[39m=\u001b[39;49mallow_nd,\n\u001b[0;32m 1116\u001b[0m ensure_min_samples\u001b[39m=\u001b[39;49mensure_min_samples,\n\u001b[0;32m 1117\u001b[0m ensure_min_features\u001b[39m=\u001b[39;49mensure_min_features,\n\u001b[0;32m 1118\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[0;32m 1119\u001b[0m input_name\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mX\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 1120\u001b[0m )\n\u001b[0;32m 1122\u001b[0m y \u001b[39m=\u001b[39m _check_y(y, multi_output\u001b[39m=\u001b[39mmulti_output, y_numeric\u001b[39m=\u001b[39my_numeric, estimator\u001b[39m=\u001b[39mestimator)\n\u001b[0;32m 1124\u001b[0m check_consistent_length(X, y)\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\validation.py:879\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 877\u001b[0m array \u001b[39m=\u001b[39m xp\u001b[39m.\u001b[39mastype(array, dtype, copy\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[0;32m 878\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 879\u001b[0m array \u001b[39m=\u001b[39m _asarray_with_order(array, order\u001b[39m=\u001b[39;49morder, dtype\u001b[39m=\u001b[39;49mdtype, xp\u001b[39m=\u001b[39;49mxp)\n\u001b[0;32m 880\u001b[0m \u001b[39mexcept\u001b[39;00m ComplexWarning \u001b[39mas\u001b[39;00m complex_warning:\n\u001b[0;32m 881\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 882\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mComplex data not supported\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{}\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(array)\n\u001b[0;32m 883\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39mcomplex_warning\u001b[39;00m\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\_array_api.py:185\u001b[0m, in \u001b[0;36m_asarray_with_order\u001b[1;34m(array, dtype, order, copy, xp)\u001b[0m\n\u001b[0;32m 182\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(array)\n\u001b[0;32m 183\u001b[0m \u001b[39mif\u001b[39;00m xp\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m \u001b[39min\u001b[39;00m {\u001b[39m\"\u001b[39m\u001b[39mnumpy\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mnumpy.array_api\u001b[39m\u001b[39m\"\u001b[39m}:\n\u001b[0;32m 184\u001b[0m \u001b[39m# Use NumPy API to support order\u001b[39;00m\n\u001b[1;32m--> 185\u001b[0m array \u001b[39m=\u001b[39m numpy\u001b[39m.\u001b[39masarray(array, order\u001b[39m=\u001b[39morder, dtype\u001b[39m=\u001b[39mdtype)\n\u001b[0;32m 186\u001b[0m \u001b[39mreturn\u001b[39;00m xp\u001b[39m.\u001b[39masarray(array, copy\u001b[39m=\u001b[39mcopy)\n\u001b[0;32m 187\u001b[0m \u001b[39melse\u001b[39;00m:\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\pandas\\core\\generic.py:2070\u001b[0m, in \u001b[0;36mNDFrame.__array__\u001b[1;34m(self, dtype)\u001b[0m\n\u001b[0;32m 2069\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__array__\u001b[39m(\u001b[39mself\u001b[39m, dtype: npt\u001b[39m.\u001b[39mDTypeLike \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m np\u001b[39m.\u001b[39mndarray:\n\u001b[1;32m-> 2070\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39;49masarray(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_values, dtype\u001b[39m=\u001b[39;49mdtype)\n","\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":102,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint\n"]},{"cell_type":"code","execution_count":103,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":104,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature 01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature 02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," # feature 03, linearity\n"," cov_mx = np.cov(self.points, rowvar=False)\n"," eigvals, eig_vector = np.linalg.eig(cov_mx)\n"," eigvals = np.sort(eigvals)[::-1]\n"," linearity = (eigvals[0]-eigvals[1]) / eigvals[0]\n"," self.feature.append(linearity)\n","\n"," # feature 04, planarity\n"," planarity = (eigvals[1]-eigvals[2]) / eigvals[0]\n"," self.feature.append(planarity)\n","\n"," # feature 05, sphericity\n"," sphericity = eigvals[2]/eigvals[0]\n"," self.feature.append(sphericity)\n","\n"," # feature 06, omnivariance\n"," omnivariance = np.cbrt((eigvals[0]*eigvals[1]*eigvals[2]))\n"," self.feature.append(omnivariance)\n","\n"," # feature 07, anisotropy\n"," anisotropy = (eigvals[0]-eigvals[2])/eigvals[0]\n"," self.feature.append(anisotropy)\n","\n"," # feature 08, eigenentropy\n"," eigenentropy = -((eigvals[0]*np.log(eigvals[0]))+(eigvals[1]*np.log(eigvals[1]))+(eigvals[2]*np.log(eigvals[2])))\n"," self.feature.append(eigenentropy)\n","\n"," # feature 09, sum of lamdas\n"," sum_of_lamda = eigvals[0]+eigvals[1]+eigvals[2]\n"," self.feature.append(sum_of_lamda)\n","\n"," # feature 10, change of curvature\n"," change_curv = eigvals[2] / sum_of_lamda\n"," self.feature.append(change_curv)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,\n"," 'points':self.points,\n"," 'feature01':self.feature[0],\n"," 'feature02':self.feature[1],\n"," 'feature03':self.feature[2],\n"," 'feature04':self.feature[3],\n"," 'feature05':self.feature[4],\n"," 'feature06':self.feature[5],\n"," 'feature07':self.feature[6],\n"," 'feature08':self.feature[7],\n"," 'feature09':self.feature[8],\n"," 'feature10':self.feature[9],\n"," 'lable':self.label}"]},{"cell_type":"code","execution_count":105,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 feature03 feature04 feature05 feature06 feature07 \\\n","0 0.002764 0.781830 0.186422 0.031748 1.721254 0.968252 \n","1 0.004335 0.640157 0.274124 0.085719 2.295543 0.914281 \n","2 0.001452 0.357268 0.562963 0.079769 8.797317 0.920231 \n","3 0.000973 0.403448 0.534206 0.062347 9.323912 0.937653 \n","4 0.000412 0.444323 0.538384 0.017293 7.279276 0.982707 \n",".. ... ... ... ... ... ... \n","495 0.004746 0.178385 0.258864 0.562751 2.194086 0.437249 \n","496 0.002242 0.242329 0.234446 0.523226 6.097446 0.476774 \n","497 0.011976 0.641592 0.078069 0.280339 1.869604 0.719661 \n","498 0.011564 0.261010 0.643068 0.095922 3.095754 0.904078 \n","499 0.005298 0.322539 0.175942 0.501519 2.325635 0.498481 \n","\n"," feature08 feature09 feature10 lable \n","0 -20.847814 11.286457 0.025400 0 \n","1 -16.828596 10.580983 0.059298 0 \n","2 -117.582880 40.790185 0.046310 0 \n","3 -140.853056 46.337350 0.037583 0 \n","4 -176.716771 53.857375 0.010994 0 \n",".. ... ... ... ... \n","495 -5.679643 6.765476 0.236017 4 \n","496 -35.507294 18.932022 0.229395 4 \n","497 -6.258859 6.590341 0.171069 4 \n","498 -24.265297 13.725520 0.052276 4 \n","499 -6.709776 7.262246 0.230162 4 \n","\n","[500 rows x 13 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":106,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":107,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.94\n","rbf model accuracy: 0.27\n","poly model accuracy: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":108,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.91\n","linear kernel accuracy with c= 1,: 0.94\n","linear kernel accuracy with c= 10,: 0.93\n","linear kernel accuracy with c= 100,: 0.94\n","linear kernel accuracy with c= 1000,: 0.93\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":109,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.74\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.74\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.73\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.73\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.23\n","rbf kernel accuracy with gamma= 1, c=1,: 0.55\n","rbf kernel accuracy with gamma= 1, c=10,: 0.58\n","rbf kernel accuracy with gamma= 1, c=100,: 0.57\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.57\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 10, c=1,: 0.29\n","rbf kernel accuracy with gamma= 10, c=10,: 0.29\n","rbf kernel accuracy with gamma= 10, c=100,: 0.29\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.29\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.17\n","rbf kernel accuracy with gamma= 100, c=1,: 0.20\n","rbf kernel accuracy with gamma= 100, c=10,: 0.20\n","rbf kernel accuracy with gamma= 100, c=100,: 0.20\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":110,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.17\n","poly kernel accuracy with degree=1,: 0.18\n","poly kernel accuracy with degree=2,: 0.20\n","poly kernel accuracy with degree=3,: 0.20\n","poly kernel accuracy with degree=4,: 0.20\n","poly kernel accuracy with degree=5,: 0.20\n","poly kernel accuracy with degree=6,: 0.20\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":111,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.27\n","{'C': 100, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":112,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.94\n","confusion matrix\n","[[33 0 1 3 3]\n"," [ 0 42 0 0 0]\n"," [ 0 1 46 0 1]\n"," [ 0 1 0 30 3]\n"," [ 0 0 0 0 36]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n"," clf = svm.SVC(kernel='linear',C=1)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":113,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":114,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"data":{"text/plain":["{'n_estimators': 1200,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_depth': 20,\n"," 'criterion': 'gini',\n"," 'bootstrap': True}"]},"execution_count":114,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=101,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":115,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 256 candidates, totalling 768 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 40,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 2,\n"," 'min_samples_split': 1,\n"," 'n_estimators': 1800}"]},"execution_count":115,"metadata":{},"output_type":"execute_result"}],"source":["new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [40,50,60,70],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [1,2,3,4],\n"," 'min_samples_split': [1,2,3,4],\n"," 'n_estimators': [1700, 1800, 1900, 2000]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":116,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.96000\n","confusion matrix\n","[[38 0 1 0 1]\n"," [ 0 42 0 0 0]\n"," [ 0 1 46 1 0]\n"," [ 0 0 0 31 3]\n"," [ 1 0 0 0 35]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":117,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.96500\n","confusion matrix\n","[[38 0 2 0 0]\n"," [ 0 42 0 0 0]\n"," [ 0 1 46 1 0]\n"," [ 0 0 0 32 2]\n"," [ 1 0 0 0 35]]\n"]}],"source":["def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=101)\n"," rf = RandomForestClassifier(n_estimators=2000,criterion='entropy',\n"," max_depth=40,min_samples_split=4,\n"," min_samples_leaf=1,max_features='sqrt',\n"," bootstrap=True)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} diff --git a/assignment_2/tutorial.ipynb b/assignment_2/tutorial.ipynb index 732e3b7..9e4bd98 100644 --- a/assignment_2/tutorial.ipynb +++ b/assignment_2/tutorial.ipynb @@ -2,19 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn import svm, datasets\n", "import sklearn.model_selection as model_selection\n", "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import f1_score\n" + "from sklearn.metrics import f1_score\n", + "import pandas as pd" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -187,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -331,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -351,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -372,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ {