diff --git a/assignment_2/ass2_notes.ipynb b/assignment_2/ass2_notes.ipynb index 7b9e343..0b47682 100644 --- a/assignment_2/ass2_notes.ipynb +++ b/assignment_2/ass2_notes.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"code","execution_count":45,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","from sklearn.utils import resample\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","from sklearn.preprocessing import scale\n","from sklearn import svm\n","from sklearn.metrics import confusion_matrix\n","from sklearn.decomposition import PCA\n","from sklearn.neighbors import KDTree\n"]},{"cell_type":"code","execution_count":46,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":47,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'name':self.cloud_name,'points':self.points,'features':self.feature,'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":48,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID name points \\\n","0 0 000 [[20.06999969482422, 499.9599914550781, 17.450... \n","1 1 001 [[373.3099975585938, 404.2200012207031, 7.1300... \n","2 2 002 [[65.91000366210938, 326.8599853515625, 12.710... \n","3 3 003 [[109.5, 391.5599975585938, 12.69999980926514]... \n","4 4 004 [[126.4300003051758, 234.9400024414062, 6.8800... \n",".. ... ... ... \n","495 495 495 [[129.3399963378906, 12.97000026702881, 8.2200... \n","496 496 496 [[440.1499938964844, 35.84999847412109, 6.1199... \n","497 497 497 [[158.1799926757812, 130.6999969482422, 4.9899... \n","498 498 498 [[498.3299865722656, 93.45999908447266, 9.4200... \n","499 499 499 [[86.16000366210938, 132.1300048828125, 7.0399... \n","\n"," features lable \n","0 [19.72999954223633, 0.002763957987838585] 0 \n","1 [8.470000267028809, 0.004335260115606936] 0 \n","2 [15.56999969482422, 0.00145218945487042] 0 \n","3 [16.46999931335449, 0.0009728572818367545] 0 \n","4 [9.75, 0.0004123711340206186] 0 \n",".. ... ... \n","495 [11.53999996185303, 0.004746257758305951] 4 \n","496 [15.17000007629395, 0.002241817366611867] 4 \n","497 [11.10999965667725, 0.011976047904191617] 4 \n","498 [18.90999984741211, 0.011563599798893917] 4 \n","499 [11.89999961853027, 0.005298013245033113] 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":52,"metadata":{},"outputs":[{"data":{"text/plain":["0 0\n","1 0\n","2 0\n","3 0\n","4 0\n"," ..\n","495 4\n","496 4\n","497 4\n","498 4\n","499 4\n","Name: lable, Length: 500, dtype: int64"]},"execution_count":52,"metadata":{},"output_type":"execute_result"}],"source":["X = pt_cloud_df['features']\n","y = pt_cloud_df['lable'].copy()\n","y"]},{"cell_type":"code","execution_count":50,"metadata":{},"outputs":[{"ename":"ValueError","evalue":"setting an array element with a sequence.","output_type":"error","traceback":["\u001b[1;31m---------------------------------------------------------------------------\u001b[0m","\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[1;31mTypeError\u001b[0m: float() argument must be a string or a real number, not 'list'","\nThe above exception was the direct cause of the following exception:\n","\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)","Cell \u001b[1;32mIn[50], line 12\u001b[0m\n\u001b[0;32m 9\u001b[0m conf_matrix \u001b[39m=\u001b[39m confusion_matrix(y_test, y_preds)\n\u001b[0;32m 10\u001b[0m \u001b[39mprint\u001b[39m(conf_matrix)\n\u001b[1;32m---> 12\u001b[0m SVM_classification(X,y)\n","Cell \u001b[1;32mIn[50], line 4\u001b[0m, in \u001b[0;36mSVM_classification\u001b[1;34m(X, y)\u001b[0m\n\u001b[0;32m 2\u001b[0m X_train, X_test, y_train, y_test \u001b[39m=\u001b[39m train_test_split(X,y,test_size\u001b[39m=\u001b[39m\u001b[39m0.4\u001b[39m)\n\u001b[0;32m 3\u001b[0m clf \u001b[39m=\u001b[39m svm\u001b[39m.\u001b[39mSVC()\n\u001b[1;32m----> 4\u001b[0m clf\u001b[39m.\u001b[39;49mfit(X_train,y_train)\n\u001b[0;32m 5\u001b[0m y_preds \u001b[39m=\u001b[39m clf\u001b[39m.\u001b[39mpredict(X_test)\n\u001b[0;32m 6\u001b[0m acc \u001b[39m=\u001b[39m accuracy_score(y_test,y_preds)\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\svm\\_base.py:192\u001b[0m, in \u001b[0;36mBaseLibSVM.fit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 190\u001b[0m check_consistent_length(X, y)\n\u001b[0;32m 191\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 192\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[0;32m 193\u001b[0m X,\n\u001b[0;32m 194\u001b[0m y,\n\u001b[0;32m 195\u001b[0m dtype\u001b[39m=\u001b[39;49mnp\u001b[39m.\u001b[39;49mfloat64,\n\u001b[0;32m 196\u001b[0m order\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mC\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 197\u001b[0m accept_sparse\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcsr\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 198\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m 199\u001b[0m )\n\u001b[0;32m 201\u001b[0m y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_validate_targets(y)\n\u001b[0;32m 203\u001b[0m sample_weight \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39masarray(\n\u001b[0;32m 204\u001b[0m [] \u001b[39mif\u001b[39;00m sample_weight \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m sample_weight, dtype\u001b[39m=\u001b[39mnp\u001b[39m.\u001b[39mfloat64\n\u001b[0;32m 205\u001b[0m )\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\base.py:565\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[1;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[0;32m 563\u001b[0m y \u001b[39m=\u001b[39m check_array(y, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params)\n\u001b[0;32m 564\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 565\u001b[0m X, y \u001b[39m=\u001b[39m check_X_y(X, y, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mcheck_params)\n\u001b[0;32m 566\u001b[0m out \u001b[39m=\u001b[39m X, y\n\u001b[0;32m 568\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m no_val_X \u001b[39mand\u001b[39;00m check_params\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mensure_2d\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m):\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\validation.py:1106\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[0;32m 1101\u001b[0m estimator_name \u001b[39m=\u001b[39m _check_estimator_name(estimator)\n\u001b[0;32m 1102\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 1103\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m requires y to be passed, but the target y is None\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 1104\u001b[0m )\n\u001b[1;32m-> 1106\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[0;32m 1107\u001b[0m X,\n\u001b[0;32m 1108\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[0;32m 1109\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49maccept_large_sparse,\n\u001b[0;32m 1110\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[0;32m 1111\u001b[0m order\u001b[39m=\u001b[39;49morder,\n\u001b[0;32m 1112\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[0;32m 1113\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[0;32m 1114\u001b[0m ensure_2d\u001b[39m=\u001b[39;49mensure_2d,\n\u001b[0;32m 1115\u001b[0m allow_nd\u001b[39m=\u001b[39;49mallow_nd,\n\u001b[0;32m 1116\u001b[0m ensure_min_samples\u001b[39m=\u001b[39;49mensure_min_samples,\n\u001b[0;32m 1117\u001b[0m ensure_min_features\u001b[39m=\u001b[39;49mensure_min_features,\n\u001b[0;32m 1118\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[0;32m 1119\u001b[0m input_name\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mX\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m 1120\u001b[0m )\n\u001b[0;32m 1122\u001b[0m y \u001b[39m=\u001b[39m _check_y(y, multi_output\u001b[39m=\u001b[39mmulti_output, y_numeric\u001b[39m=\u001b[39my_numeric, estimator\u001b[39m=\u001b[39mestimator)\n\u001b[0;32m 1124\u001b[0m check_consistent_length(X, y)\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\validation.py:879\u001b[0m, in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[0;32m 877\u001b[0m array \u001b[39m=\u001b[39m xp\u001b[39m.\u001b[39mastype(array, dtype, copy\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m)\n\u001b[0;32m 878\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m--> 879\u001b[0m array \u001b[39m=\u001b[39m _asarray_with_order(array, order\u001b[39m=\u001b[39;49morder, dtype\u001b[39m=\u001b[39;49mdtype, xp\u001b[39m=\u001b[39;49mxp)\n\u001b[0;32m 880\u001b[0m \u001b[39mexcept\u001b[39;00m ComplexWarning \u001b[39mas\u001b[39;00m complex_warning:\n\u001b[0;32m 881\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 882\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mComplex data not supported\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{}\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(array)\n\u001b[0;32m 883\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39mcomplex_warning\u001b[39;00m\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\utils\\_array_api.py:185\u001b[0m, in \u001b[0;36m_asarray_with_order\u001b[1;34m(array, dtype, order, copy, xp)\u001b[0m\n\u001b[0;32m 182\u001b[0m xp, _ \u001b[39m=\u001b[39m get_namespace(array)\n\u001b[0;32m 183\u001b[0m \u001b[39mif\u001b[39;00m xp\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m \u001b[39min\u001b[39;00m {\u001b[39m\"\u001b[39m\u001b[39mnumpy\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mnumpy.array_api\u001b[39m\u001b[39m\"\u001b[39m}:\n\u001b[0;32m 184\u001b[0m \u001b[39m# Use NumPy API to support order\u001b[39;00m\n\u001b[1;32m--> 185\u001b[0m array \u001b[39m=\u001b[39m numpy\u001b[39m.\u001b[39masarray(array, order\u001b[39m=\u001b[39morder, dtype\u001b[39m=\u001b[39mdtype)\n\u001b[0;32m 186\u001b[0m \u001b[39mreturn\u001b[39;00m xp\u001b[39m.\u001b[39masarray(array, copy\u001b[39m=\u001b[39mcopy)\n\u001b[0;32m 187\u001b[0m \u001b[39melse\u001b[39;00m:\n","File \u001b[1;32mc:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\pandas\\core\\generic.py:2070\u001b[0m, in \u001b[0;36mNDFrame.__array__\u001b[1;34m(self, dtype)\u001b[0m\n\u001b[0;32m 2069\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__array__\u001b[39m(\u001b[39mself\u001b[39m, dtype: npt\u001b[39m.\u001b[39mDTypeLike \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m np\u001b[39m.\u001b[39mndarray:\n\u001b[1;32m-> 2070\u001b[0m \u001b[39mreturn\u001b[39;00m np\u001b[39m.\u001b[39;49masarray(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_values, dtype\u001b[39m=\u001b[39;49mdtype)\n","\u001b[1;31mValueError\u001b[0m: setting an array element with a sequence."]}],"source":["def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC()\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} +{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["%matplotlib inline\n","import math\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import matplotlib.colors as colors\n","\n","from sklearn.model_selection import train_test_split\n","from sklearn.metrics import accuracy_score\n","\n","from sklearn import svm\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.metrics import confusion_matrix\n","\n","from sklearn.neighbors import KDTree\n","from sklearn.model_selection import GridSearchCV\n","from sklearn.model_selection import RandomizedSearchCV\n","from pprint import pprint"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["def read_xyz(filenm):\n"," pt_clouds = np.genfromtxt(filenm, delimiter=' ')\n"," return pt_clouds\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["class urban_object:\n"," def __init__(self, filenm):\n"," self.cloud_name = filenm.split('/')[-1][-7:-4] #original code:self.cloud_name = filenm.split('/\\\\')[-1][-7:-4]\n"," self.cloud_ID = int(self.cloud_name)\n"," self.label = math.floor(1.0*self.cloud_ID/100)\n"," self.points = read_xyz(filenm)\n"," self.feature = []\n","\n"," def compute_features(self):\n"," # feature01, height\n"," height = np.amax(self.points[:,2])\n"," self.feature.append(height)\n","\n"," # feature02, root point planar density\n"," root = self.points[[np.argmin(self.points[:,2])]]\n"," top = self.points[[np.argmax(self.points[:,2])]]\n"," kd_tree_2d = KDTree(self.points[:,:2], leaf_size=5)\n"," kd_tree_3d = KDTree(self.points, leaf_size=5)\n","\n"," radius_root = 0.2\n"," count = kd_tree_2d.query_radius(root[:,:2], r=radius_root,count_only=True)\n"," root_density = 1.0*count[0] / len(self.points)\n"," self.feature.append(root_density)\n","\n"," def as_dict(self):\n"," return {'ID':self.cloud_ID,'points':self.points,'feature01':self.feature[0],'feature02':self.feature[1],'lable':self.label}\n"," "]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" ID points feature01 \\\n","0 0 [[20.06999969482422, 499.9599914550781, 17.450... 19.730000 \n","1 1 [[373.3099975585938, 404.2200012207031, 7.1300... 8.470000 \n","2 2 [[65.91000366210938, 326.8599853515625, 12.710... 15.570000 \n","3 3 [[109.5, 391.5599975585938, 12.69999980926514]... 16.469999 \n","4 4 [[126.4300003051758, 234.9400024414062, 6.8800... 9.750000 \n",".. ... ... ... \n","495 495 [[129.3399963378906, 12.97000026702881, 8.2200... 11.540000 \n","496 496 [[440.1499938964844, 35.84999847412109, 6.1199... 15.170000 \n","497 497 [[158.1799926757812, 130.6999969482422, 4.9899... 11.110000 \n","498 498 [[498.3299865722656, 93.45999908447266, 9.4200... 18.910000 \n","499 499 [[86.16000366210938, 132.1300048828125, 7.0399... 11.900000 \n","\n"," feature02 lable \n","0 0.002764 0 \n","1 0.004335 0 \n","2 0.001452 0 \n","3 0.000973 0 \n","4 0.000412 0 \n",".. ... ... \n","495 0.004746 4 \n","496 0.002242 4 \n","497 0.011976 4 \n","498 0.011564 4 \n","499 0.005298 4 \n","\n","[500 rows x 5 columns]\n"]}],"source":["path = \"C:/Users/zhuor/OneDrive - Delft University of Technology/Academic Research/electives/GEO 5017 machine learning/assignments/assignment_2/GEO5017-A2-Classification/GEO5017-A2-Classification/data/pointclouds/\"\n","filenms = [path+\"{:03d}.xyz\".format(n) for n in range(500)]\n","# print(filenms)\n","def get_pt_cloud_dtset(filenms):\n"," pt_cloud_dataset = []\n"," for pt_filenm in filenms:\n"," pt_cloud = urban_object(pt_filenm)\n"," pt_cloud.compute_features()\n"," pt_cloud_dataset.append(pt_cloud)\n","\n"," \n"," df = pd.DataFrame([x.as_dict() for x in pt_cloud_dataset])\n"," return df\n","\n","pt_cloud_df = get_pt_cloud_dtset(filenms)\n","\n","print(pt_cloud_df)\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["X = pt_cloud_df.drop(['ID','points','lable'],axis=1).copy()\n","y = pt_cloud_df['lable'].copy()"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear model accuracy: 0.45\n","rbf model accuracy: 0.42\n","poly model accuracy: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","\n","kernels = ['linear','rbf','poly']\n","for kernel in kernels:\n"," svc = svm.SVC(kernel=kernel)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"{} model accuracy: {:.2f}\".format(kernel,acc))"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["linear kernel accuracy with c= 0.1,: 0.45\n","linear kernel accuracy with c= 1,: 0.45\n","linear kernel accuracy with c= 10,: 0.47\n","linear kernel accuracy with c= 100,: 0.57\n","linear kernel accuracy with c= 1000,: 0.65\n"]}],"source":["# Hyperparams tuning for svm\n","Cs = [0.1,1,10,100,1000]\n","for c in Cs:\n"," svc = svm.SVC(kernel='linear',C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"linear kernel accuracy with c= {},: {:.2f}\".format(c,acc))"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["rbf kernel accuracy with gamma= 0.1, c=0.1,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 0.1, c=10,: 0.43\n","rbf kernel accuracy with gamma= 0.1, c=100,: 0.47\n","rbf kernel accuracy with gamma= 0.1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 1, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 1, c=1,: 0.45\n","rbf kernel accuracy with gamma= 1, c=10,: 0.47\n","rbf kernel accuracy with gamma= 1, c=100,: 0.49\n","rbf kernel accuracy with gamma= 1, c=1000,: 0.53\n","rbf kernel accuracy with gamma= 10, c=0.1,: 0.34\n","rbf kernel accuracy with gamma= 10, c=1,: 0.41\n","rbf kernel accuracy with gamma= 10, c=10,: 0.43\n","rbf kernel accuracy with gamma= 10, c=100,: 0.46\n","rbf kernel accuracy with gamma= 10, c=1000,: 0.45\n","rbf kernel accuracy with gamma= 100, c=0.1,: 0.37\n","rbf kernel accuracy with gamma= 100, c=1,: 0.42\n","rbf kernel accuracy with gamma= 100, c=10,: 0.42\n","rbf kernel accuracy with gamma= 100, c=100,: 0.44\n","rbf kernel accuracy with gamma= 100, c=1000,: 0.45\n"]}],"source":["# Hyperparams tuning for svm\n","gammas = [0.1,1,10,100]\n","for gamma in gammas:\n"," for c in Cs:\n"," svc = svm.SVC(kernel='rbf', gamma=gamma, C=c)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"rbf kernel accuracy with gamma= {}, c={},: {:.2f}\".format(gamma,c,acc))"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["poly kernel accuracy with degree=0,: 0.19\n","poly kernel accuracy with degree=1,: 0.44\n","poly kernel accuracy with degree=2,: 0.43\n","poly kernel accuracy with degree=3,: 0.41\n","poly kernel accuracy with degree=4,: 0.41\n","poly kernel accuracy with degree=5,: 0.41\n","poly kernel accuracy with degree=6,: 0.41\n"]}],"source":["# Hyperparams tuning for svm\n","degrees = [0,1,2,3,4,5,6]\n","for degree in degrees:\n"," svc = svm.SVC(kernel='poly', degree=degree)\n"," svc.fit(X_train,y_train)\n"," y_preds = svc.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"poly kernel accuracy with degree={},: {:.2f}\".format(degree,acc))"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.48\n","{'C': 1000, 'gamma': 0.1, 'kernel': 'linear'}\n"]}],"source":["# double check the optimal svm hyperparams with GridSearchCV\n","X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n","clf_svm = svm.SVC()\n","clf_svm.fit(X_train,y_train)\n","y_preds = clf_svm.predict(X_test)\n","acc = accuracy_score(y_test,y_preds)\n","print(\"SVM accuracy: {:.2f}\".format(acc))\n","\n","param_grid_01 = [{'C':[0.1,1,10,100,1000],'gamma':[0.1,1,10,100],'kernel':['rbf','linear']}]\n","optimal_params_01 = GridSearchCV(svm.SVC(),param_grid_01,cv=5,scoring='accuracy',verbose=0)\n","optimal_params_01.fit(X_train, y_train)\n","print(optimal_params_01.best_params_)\n","\n"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["SVM accuracy: 0.62\n","confusion matrix\n","[[33 2 3 0 3]\n"," [ 0 5 28 0 6]\n"," [ 2 1 34 0 0]\n"," [ 0 1 0 32 4]\n"," [15 8 1 1 21]]\n"]}],"source":["# 改函数里的 hyperparameters\n","def SVM_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," clf = svm.SVC(kernel='linear',C=1000)\n"," clf.fit(X_train,y_train)\n"," y_preds = clf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"SVM accuracy: {:.2f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","SVM_classification(X,y)"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'bootstrap': [True, False],\n"," 'criterion': ['gini', 'entropy'],\n"," 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],\n"," 'max_features': ['auto', 'sqrt'],\n"," 'min_samples_leaf': [1, 2, 4],\n"," 'min_samples_split': [2, 5, 10],\n"," 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n"]}],"source":["# hyperparameter settings for rf\n","# number of trees in rf\n","n_estimators = [int(x) for x in np.linspace(start=200,stop=2000,num=10)]\n","\n","# number of features to consider at every split\n","max_features = ['auto','sqrt']\n","\n","# maximum number of levels in tree\n","max_depth = [int(x) for x in np.linspace(10,110, num=11)]\n","max_depth.append(None)\n","\n","# criterion\n","criterion = ['gini','entropy']\n","\n","# minimum number of samples required to split a node\n","min_samples_split = [2,5,10]\n","\n","# minimum number of samples required at each leaf node\n","min_samples_leaf = [1,2,4]\n","\n","# method of selecting samples for training each tree\n","bootstrap = [True,False]\n","\n","# create the random grid\n","random_grid = {'n_estimators': n_estimators,\n"," 'criterion':criterion,\n"," 'max_features': max_features,\n"," 'max_depth': max_depth,\n"," 'min_samples_split': min_samples_split,\n"," 'min_samples_leaf': min_samples_leaf,\n"," 'bootstrap': bootstrap}\n","pprint(random_grid)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 100 candidates, totalling 300 fits\n"]},{"name":"stderr","output_type":"stream","text":["c:\\Users\\zhuor\\anaconda3\\envs\\GEO5017ML\\Lib\\site-packages\\sklearn\\ensemble\\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.\n"," warn(\n"]},{"data":{"text/plain":["{'n_estimators': 600,\n"," 'min_samples_split': 5,\n"," 'min_samples_leaf': 1,\n"," 'max_features': 'auto',\n"," 'max_depth': 40,\n"," 'criterion': 'gini',\n"," 'bootstrap': False}"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["rf = RandomForestClassifier()\n","rf_random = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=42,n_jobs=-1)\n","rf_random.fit(X_train,y_train)\n","rf_random.best_params_"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Fitting 3 folds for each of 180 candidates, totalling 540 fits\n"]},{"data":{"text/plain":["{'bootstrap': True,\n"," 'max_depth': 20,\n"," 'max_features': 'sqrt',\n"," 'min_samples_leaf': 3,\n"," 'min_samples_split': 8,\n"," 'n_estimators': 1000}"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["# 重新改列表里的数值\n","new_param_grid = {\n"," 'bootstrap': [True],\n"," 'max_depth': [10,20,40,80,90],\n"," 'max_features': ['sqrt'],\n"," 'min_samples_leaf': [3, 4, 5],\n"," 'min_samples_split': [8, 10, 12],\n"," 'n_estimators': [1000, 1100, 1200, 1300]\n","}\n","\n","rf = RandomForestClassifier()\n","grid_search = GridSearchCV(estimator=rf,param_grid=new_param_grid,cv=3,n_jobs=-1,verbose=2)\n","grid_search.fit(X_train,y_train)\n","grid_search.best_params_"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Random Forest accuracy: 0.65500\n","confusion matrix\n","[[40 0 1 0 5]\n"," [ 0 18 16 0 12]\n"," [ 2 14 17 1 1]\n"," [ 0 2 1 31 4]\n"," [ 1 5 0 4 25]]\n"]}],"source":["def RF_classification_initial(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier()\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification_initial(X,y)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["imporved Random Forest accuracy: 0.71500\n","confusion matrix\n","[[31 1 6 0 7]\n"," [ 0 25 13 0 1]\n"," [ 0 12 22 0 1]\n"," [ 0 1 0 35 1]\n"," [ 1 8 2 3 30]]\n"]}],"source":["# 改 hyperparameters\n","def RF_classification(X,y):\n"," X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4)\n"," rf = RandomForestClassifier(n_estimators=1100,criterion='gini',max_depth=10,min_samples_split=12,min_samples_leaf=3,max_features='sqrt',bootstrap=True,)\n"," rf.fit(X_train,y_train)\n"," y_preds = rf.predict(X_test)\n"," acc = accuracy_score(y_test,y_preds)\n"," print(\"imporved Random Forest accuracy: {:.5f}\".format(acc))\n"," print(\"confusion matrix\")\n"," conf_matrix = confusion_matrix(y_test, y_preds)\n"," print(conf_matrix)\n","\n","RF_classification(X,y)"]}],"metadata":{"kernelspec":{"display_name":"GEO5017ML","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.0"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2} diff --git a/assignment_2/tutorial.ipynb b/assignment_2/tutorial.ipynb index 732e3b7..9e4bd98 100644 --- a/assignment_2/tutorial.ipynb +++ b/assignment_2/tutorial.ipynb @@ -2,19 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn import svm, datasets\n", "import sklearn.model_selection as model_selection\n", "from sklearn.metrics import accuracy_score\n", - "from sklearn.metrics import f1_score\n" + "from sklearn.metrics import f1_score\n", + "import pandas as pd" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -187,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -331,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -351,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -372,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ {