diff --git a/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb b/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb
index e2eca4841..72448bc45 100644
--- a/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb
+++ b/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb
@@ -2905,7 +2905,7 @@
"\n",
"Let's figure it out.\n",
"\n",
- "First, we'll import the `car-sales-extended.csv` dataset."
+ "First, we'll import the [`car-sales-extended.csv`](https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended.csv) dataset."
]
},
{
@@ -3059,7 +3059,8 @@
],
"source": [
"# Import car-sales-extended.csv\n",
- "car_sales = pd.read_csv(\"../data/car-sales-extended.csv\")\n",
+ "# car_sales = pd.read_csv(\"../data/car-sales-extended.csv\") # load data from local directory \n",
+ "car_sales = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended.csv\") # load data directly from raw URL (source: https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended.csv)\n",
"car_sales"
]
},
@@ -3136,14 +3137,14 @@
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m/var/folders/c4/qj4gdk190td18bqvjjh0p3p00000gn/T/ipykernel_30502/1044518071.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Try to predict with random forest on price column (doesn't work)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mensemble\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRandomForestRegressor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRandomForestRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1148\u001b[0m skip_parameter_validation=(\n\u001b[1;32m 1149\u001b[0m \u001b[0mprefer_skip_nested_validation\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mglobal_skip_validation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1150\u001b[0m )\n\u001b[1;32m 1151\u001b[0m ):\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfit_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/ensemble/_forest.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 344\u001b[0m \"\"\"\n\u001b[1;32m 345\u001b[0m \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0missparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"sparse multilabel-indicator for y is not supported.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 348\u001b[0;31m X, y = self._validate_data(\n\u001b[0m\u001b[1;32m 349\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"csc\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDTYPE\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 350\u001b[0m )\n\u001b[1;32m 351\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msample_weight\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 618\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 621\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 622\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 623\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0mno_val_X\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ensure_2d\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1142\u001b[0m raise ValueError(\n\u001b[1;32m 1143\u001b[0m \u001b[0;34mf\"{estimator_name} requires y to be passed, but the target y is None\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1144\u001b[0m )\n\u001b[1;32m 1145\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1146\u001b[0;31m X = check_array(\n\u001b[0m\u001b[1;32m 1147\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1148\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1149\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 912\u001b[0m )\n\u001b[1;32m 913\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 914\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 916\u001b[0;31m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 917\u001b[0m raise ValueError(\n\u001b[1;32m 918\u001b[0m \u001b[0;34m\"Complex data not supported\\n{}\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m ) from complex_warning\n",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/_array_api.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, dtype, order, copy, xp)\u001b[0m\n\u001b[1;32m 376\u001b[0m \u001b[0;31m# Use NumPy API to support order\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 380\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 381\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m \u001b[0;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 383\u001b[0m \u001b[0;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, dtype)\u001b[0m\n\u001b[1;32m 2082\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2083\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2084\u001b[0;31m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2085\u001b[0m if (\n\u001b[1;32m 2086\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2087\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0musing_copy_on_write\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/var/folders/c4/qj4gdk190td18bqvjjh0p3p00000gn/T/ipykernel_23180/1044518071.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Try to predict with random forest on price column (doesn't work)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mensemble\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRandomForestRegressor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mRandomForestRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1469\u001b[0m skip_parameter_validation=(\n\u001b[1;32m 1470\u001b[0m \u001b[0mprefer_skip_nested_validation\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mglobal_skip_validation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1471\u001b[0m )\n\u001b[1;32m 1472\u001b[0m ):\n\u001b[0;32m-> 1473\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfit_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/ensemble/_forest.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;31m# Validate or convert input data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0missparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"sparse multilabel-indicator for y is not supported.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 363\u001b[0;31m X, y = self._validate_data(\n\u001b[0m\u001b[1;32m 364\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 365\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcheck_y_params\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mcheck_y_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mdefault_check_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 648\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"y\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 650\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 651\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 652\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m 
\u001b[0mno_val_X\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mcheck_params\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ensure_2d\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1297\u001b[0m raise ValueError(\n\u001b[1;32m 1298\u001b[0m \u001b[0;34mf\"{estimator_name} requires y to be passed, but the target y is None\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1299\u001b[0m )\n\u001b[1;32m 1300\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1301\u001b[0;31m X = check_array(\n\u001b[0m\u001b[1;32m 1302\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1303\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1304\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 1009\u001b[0m )\n\u001b[1;32m 1010\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1011\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1012\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1013\u001b[0;31m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1014\u001b[0m raise ValueError(\n\u001b[1;32m 1015\u001b[0m \u001b[0;34m\"Complex data not supported\\n{}\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1016\u001b[0m ) from complex_warning\n",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/_array_api.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[1;32m 747\u001b[0m \u001b[0;31m# Use NumPy API to support order\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 748\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcopy\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 749\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 751\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 752\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 753\u001b[0m \u001b[0;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[0;31m# container that is consistent with the input's namespace.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 2149\u001b[0m def __array__(\n\u001b[1;32m 2150\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnpt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDTypeLike\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool_t\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2151\u001b[0m ) -> np.ndarray:\n\u001b[1;32m 2152\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2153\u001b[0;31m \u001b[0marr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2154\u001b[0m if (\n\u001b[1;32m 2155\u001b[0m \u001b[0mastype_is_view\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2156\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0musing_copy_on_write\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'Honda'"
]
}
@@ -3161,13 +3162,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Oops... this doesn't work, we'll have to convert the non-numerical features into numbers first.\n",
+ "Oh no! We get another `ValueError` (some of the data is in string format rather than numerical format).\n",
+ "\n",
+ "```\n",
+ "ValueError: could not convert string to float: 'Honda'\n",
+ "```\n",
+ "\n",
+ "Machine learning models prefer to work with numbers rather than text.\n",
+ "\n",
+ "So we'll have to convert the non-numerical features into numbers first.\n",
"\n",
"The process of turning categorical features into numbers is often referred to as **encoding**.\n",
"\n",
"Scikit-Learn has a fantastic in-depth guide on [*Encoding categorical features*](https://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features).\n",
"\n",
- "But let's look at one of the most straightforward ways to turn categorical features into numbers, [one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).\n",
+ "But let's look at one of the most straightforward ways to turn categorical features into numbers, one-hot encoding.\n",
"\n",
"In machine learning, [one-hot encoding](https://en.wikipedia.org/wiki/One-hot#Machine_learning_and_statistics) gives a value of `1` to the target value and a value of `0` to the other values.\n",
"\n",
@@ -4123,12 +4132,12 @@
"\n",
"> **Note:** Dealing with missing values differs from problem to problem, meaning there's no 100% best way to fill missing values across datasets and problem types. It will often take careful experimentation and practice to figure out the best way to deal with missing values in your own datasets.\n",
"\n",
- "To practice dealing with missing values, let's import a version of the `car_sales` dataset with several missing values."
+ "To practice dealing with missing values, let's import a version of the `car_sales` dataset with several missing values (namely [`car-sales-extended-missing-data.csv`](https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended-missing-data.csv))."
]
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 47,
"metadata": {},
"outputs": [
{
@@ -4201,91 +4210,82 @@
"
14043.0 \n",
" \n",
" \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
+ " 5 \n",
+ " Honda \n",
+ " Red \n",
+ " 42652.0 \n",
+ " 4.0 \n",
+ " 23883.0 \n",
" \n",
" \n",
- " 995 \n",
+ " 6 \n",
" Toyota \n",
- " Black \n",
- " 35820.0 \n",
+ " Blue \n",
+ " 163453.0 \n",
" 4.0 \n",
- " 32042.0 \n",
+ " 8473.0 \n",
" \n",
" \n",
- " 996 \n",
- " NaN \n",
+ " 7 \n",
+ " Honda \n",
" White \n",
- " 155144.0 \n",
- " 3.0 \n",
- " 5716.0 \n",
- " \n",
- " \n",
- " 997 \n",
- " Nissan \n",
- " Blue \n",
- " 66604.0 \n",
+ " NaN \n",
" 4.0 \n",
- " 31570.0 \n",
+ " 20306.0 \n",
" \n",
" \n",
- " 998 \n",
- " Honda \n",
+ " 8 \n",
+ " NaN \n",
" White \n",
- " 215883.0 \n",
+ " 130538.0 \n",
" 4.0 \n",
- " 4001.0 \n",
+ " 9374.0 \n",
" \n",
" \n",
- " 999 \n",
- " Toyota \n",
+ " 9 \n",
+ " Honda \n",
" Blue \n",
- " 248360.0 \n",
+ " 51029.0 \n",
" 4.0 \n",
- " 12732.0 \n",
+ " 26683.0 \n",
" \n",
" \n",
"\n",
- "1000 rows × 5 columns
\n",
""
],
"text/plain": [
- " Make Colour Odometer (KM) Doors Price\n",
- "0 Honda White 35431.0 4.0 15323.0\n",
- "1 BMW Blue 192714.0 5.0 19943.0\n",
- "2 Honda White 84714.0 4.0 28343.0\n",
- "3 Toyota White 154365.0 4.0 13434.0\n",
- "4 Nissan Blue 181577.0 3.0 14043.0\n",
- ".. ... ... ... ... ...\n",
- "995 Toyota Black 35820.0 4.0 32042.0\n",
- "996 NaN White 155144.0 3.0 5716.0\n",
- "997 Nissan Blue 66604.0 4.0 31570.0\n",
- "998 Honda White 215883.0 4.0 4001.0\n",
- "999 Toyota Blue 248360.0 4.0 12732.0\n",
- "\n",
- "[1000 rows x 5 columns]"
+ " Make Colour Odometer (KM) Doors Price\n",
+ "0 Honda White 35431.0 4.0 15323.0\n",
+ "1 BMW Blue 192714.0 5.0 19943.0\n",
+ "2 Honda White 84714.0 4.0 28343.0\n",
+ "3 Toyota White 154365.0 4.0 13434.0\n",
+ "4 Nissan Blue 181577.0 3.0 14043.0\n",
+ "5 Honda Red 42652.0 4.0 23883.0\n",
+ "6 Toyota Blue 163453.0 4.0 8473.0\n",
+ "7 Honda White NaN 4.0 20306.0\n",
+ "8 NaN White 130538.0 4.0 9374.0\n",
+ "9 Honda Blue 51029.0 4.0 26683.0"
]
},
- "execution_count": 45,
+ "execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Import car sales dataframe with missing values\n",
- "car_sales_missing = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\")\n",
- "car_sales_missing"
+ "# car_sales_missing = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\") # load from local directory\n",
+ "car_sales_missing = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv\") # read directly from URL (source: https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended-missing-data.csv)\n",
+ "car_sales_missing.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "If you're dataset is large, it's likely you aren't going to go through it sample by sample to find the missing values.\n",
+ "Notice the `NaN` value in row 7 for the `Odometer (KM)` column. This means pandas has detected a missing value there.\n",
+ "\n",
+ "However, if your dataset is large, it's likely you aren't going to go through it sample by sample to find the missing values.\n",
"\n",
"Luckily, pandas has a method called [`pd.DataFrame.isna()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html) which is able to detect missing values.\n",
"\n",
@@ -4294,7 +4294,7 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 48,
"metadata": {},
"outputs": [
{
@@ -4308,7 +4308,7 @@
"dtype: int64"
]
},
- "execution_count": 46,
+ "execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
@@ -4329,7 +4329,7 @@
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 49,
"metadata": {},
"outputs": [
{
@@ -4353,7 +4353,7 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 50,
"metadata": {},
"outputs": [
{
@@ -4379,7 +4379,7 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 51,
"metadata": {},
"outputs": [
{
@@ -4400,7 +4400,7 @@
" 0.00000e+00, 2.48360e+05]])"
]
},
- "execution_count": 49,
+ "execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@@ -4434,25 +4434,26 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 52,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
- "evalue": "Input X contains NaN.\nRandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values",
+ "evalue": "Input y contains NaN.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m/Users/daniel/code/zero-to-mastery-ml/section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb Cell 96\u001b[0m line \u001b[0;36m8\n\u001b[1;32m 6 \u001b[0m \u001b[39m# Fit and score a model\u001b[39;00m\n\u001b[1;32m 7 \u001b[0m model \u001b[39m=\u001b[39m RandomForestRegressor()\n\u001b[0;32m----> 8 \u001b[0m model\u001b[39m.\u001b[39;49mfit(X_train, y_train)\n\u001b[1;32m 9 \u001b[0m model\u001b[39m.\u001b[39mscore(X_test, y_test)\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/base.py:1152\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1145\u001b[0m estimator\u001b[39m.\u001b[39m_validate_params()\n\u001b[1;32m 1147\u001b[0m \u001b[39mwith\u001b[39;00m config_context(\n\u001b[1;32m 1148\u001b[0m skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[1;32m 1149\u001b[0m prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1150\u001b[0m )\n\u001b[1;32m 1151\u001b[0m ):\n\u001b[0;32m-> 1152\u001b[0m \u001b[39mreturn\u001b[39;00m fit_method(estimator, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:348\u001b[0m, in \u001b[0;36mBaseForest.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[39mif\u001b[39;00m issparse(y):\n\u001b[1;32m 347\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39msparse multilabel-indicator for y is not supported.\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[0;32m--> 348\u001b[0m X, y \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_validate_data(\n\u001b[1;32m 349\u001b[0m X, y, multi_output\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, accept_sparse\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcsc\u001b[39;49m\u001b[39m\"\u001b[39;49m, dtype\u001b[39m=\u001b[39;49mDTYPE\n\u001b[1;32m 350\u001b[0m )\n\u001b[1;32m 351\u001b[0m \u001b[39mif\u001b[39;00m sample_weight \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 352\u001b[0m sample_weight \u001b[39m=\u001b[39m _check_sample_weight(sample_weight, X)\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/base.py:622\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 620\u001b[0m y \u001b[39m=\u001b[39m check_array(y, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mcheck_y_params)\n\u001b[1;32m 621\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 622\u001b[0m X, y \u001b[39m=\u001b[39m check_X_y(X, y, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mcheck_params)\n\u001b[1;32m 623\u001b[0m out \u001b[39m=\u001b[39m X, y\n\u001b[1;32m 625\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m no_val_X \u001b[39mand\u001b[39;00m check_params\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mensure_2d\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mTrue\u001b[39;00m):\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/validation.py:1146\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1141\u001b[0m estimator_name \u001b[39m=\u001b[39m _check_estimator_name(estimator)\n\u001b[1;32m 1142\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 1143\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m requires y to be passed, but the target y is None\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 1144\u001b[0m )\n\u001b[0;32m-> 1146\u001b[0m X \u001b[39m=\u001b[39m check_array(\n\u001b[1;32m 1147\u001b[0m X,\n\u001b[1;32m 1148\u001b[0m accept_sparse\u001b[39m=\u001b[39;49maccept_sparse,\n\u001b[1;32m 1149\u001b[0m accept_large_sparse\u001b[39m=\u001b[39;49maccept_large_sparse,\n\u001b[1;32m 1150\u001b[0m dtype\u001b[39m=\u001b[39;49mdtype,\n\u001b[1;32m 1151\u001b[0m order\u001b[39m=\u001b[39;49morder,\n\u001b[1;32m 1152\u001b[0m copy\u001b[39m=\u001b[39;49mcopy,\n\u001b[1;32m 1153\u001b[0m force_all_finite\u001b[39m=\u001b[39;49mforce_all_finite,\n\u001b[1;32m 1154\u001b[0m ensure_2d\u001b[39m=\u001b[39;49mensure_2d,\n\u001b[1;32m 1155\u001b[0m allow_nd\u001b[39m=\u001b[39;49mallow_nd,\n\u001b[1;32m 1156\u001b[0m ensure_min_samples\u001b[39m=\u001b[39;49mensure_min_samples,\n\u001b[1;32m 1157\u001b[0m ensure_min_features\u001b[39m=\u001b[39;49mensure_min_features,\n\u001b[1;32m 1158\u001b[0m estimator\u001b[39m=\u001b[39;49mestimator,\n\u001b[1;32m 1159\u001b[0m input_name\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mX\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 1160\u001b[0m )\n\u001b[1;32m 1162\u001b[0m y \u001b[39m=\u001b[39m _check_y(y, 
multi_output\u001b[39m=\u001b[39mmulti_output, y_numeric\u001b[39m=\u001b[39my_numeric, estimator\u001b[39m=\u001b[39mestimator)\n\u001b[1;32m 1164\u001b[0m check_consistent_length(X, y)\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/validation.py:957\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 951\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 952\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFound array with dim \u001b[39m\u001b[39m%d\u001b[39;00m\u001b[39m. \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m expected <= 2.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 953\u001b[0m \u001b[39m%\u001b[39m (array\u001b[39m.\u001b[39mndim, estimator_name)\n\u001b[1;32m 954\u001b[0m )\n\u001b[1;32m 956\u001b[0m \u001b[39mif\u001b[39;00m force_all_finite:\n\u001b[0;32m--> 957\u001b[0m _assert_all_finite(\n\u001b[1;32m 958\u001b[0m array,\n\u001b[1;32m 959\u001b[0m input_name\u001b[39m=\u001b[39;49minput_name,\n\u001b[1;32m 960\u001b[0m estimator_name\u001b[39m=\u001b[39;49mestimator_name,\n\u001b[1;32m 961\u001b[0m allow_nan\u001b[39m=\u001b[39;49mforce_all_finite \u001b[39m==\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39mallow-nan\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 962\u001b[0m )\n\u001b[1;32m 964\u001b[0m \u001b[39mif\u001b[39;00m ensure_min_samples \u001b[39m>\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[1;32m 965\u001b[0m n_samples \u001b[39m=\u001b[39m _num_samples(array)\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/validation.py:122\u001b[0m, in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[39mif\u001b[39;00m first_pass_isfinite:\n\u001b[1;32m 120\u001b[0m \u001b[39mreturn\u001b[39;00m\n\u001b[0;32m--> 122\u001b[0m _assert_all_finite_element_wise(\n\u001b[1;32m 123\u001b[0m X,\n\u001b[1;32m 124\u001b[0m xp\u001b[39m=\u001b[39;49mxp,\n\u001b[1;32m 125\u001b[0m allow_nan\u001b[39m=\u001b[39;49mallow_nan,\n\u001b[1;32m 126\u001b[0m msg_dtype\u001b[39m=\u001b[39;49mmsg_dtype,\n\u001b[1;32m 127\u001b[0m estimator_name\u001b[39m=\u001b[39;49mestimator_name,\n\u001b[1;32m 128\u001b[0m input_name\u001b[39m=\u001b[39;49minput_name,\n\u001b[1;32m 129\u001b[0m )\n",
- "File \u001b[0;32m~/code/zero-to-mastery-ml/env/lib/python3.10/site-packages/sklearn/utils/validation.py:171\u001b[0m, in \u001b[0;36m_assert_all_finite_element_wise\u001b[0;34m(X, xp, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 154\u001b[0m \u001b[39mif\u001b[39;00m estimator_name \u001b[39mand\u001b[39;00m input_name \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mX\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m has_nan_error:\n\u001b[1;32m 155\u001b[0m \u001b[39m# Improve the error message on how to handle missing values in\u001b[39;00m\n\u001b[1;32m 156\u001b[0m \u001b[39m# scikit-learn.\u001b[39;00m\n\u001b[1;32m 157\u001b[0m msg_err \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\n\u001b[1;32m 158\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m{\u001b[39;00mestimator_name\u001b[39m}\u001b[39;00m\u001b[39m does not accept missing values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 159\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m encoded as NaN natively. For supervised learning, you might want\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m#estimators-that-handle-nan-values\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 170\u001b[0m )\n\u001b[0;32m--> 171\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(msg_err)\n",
- "\u001b[0;31mValueError\u001b[0m: Input X contains NaN.\nRandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values"
+ "Cell \u001b[0;32mIn[52], line 8\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Fit and score a model\u001b[39;00m\n\u001b[1;32m 7\u001b[0m model \u001b[38;5;241m=\u001b[39m RandomForestRegressor()\n\u001b[0;32m----> 8\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m model\u001b[38;5;241m.\u001b[39mscore(X_test, y_test)\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/base.py:1473\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1466\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1468\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1469\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1470\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1471\u001b[0m )\n\u001b[1;32m 1472\u001b[0m ):\n\u001b[0;32m-> 1473\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/ensemble/_forest.py:363\u001b[0m, in \u001b[0;36mBaseForest.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m issparse(y):\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msparse multilabel-indicator for y is not supported.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 363\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 364\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 365\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 366\u001b[0m \u001b[43m \u001b[49m\u001b[43mmulti_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 367\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsc\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 368\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mDTYPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 369\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[38;5;66;03m# _compute_missing_values_in_feature_mask checks if X has missing values and\u001b[39;00m\n\u001b[1;32m 372\u001b[0m \u001b[38;5;66;03m# will raise an error if the underlying tree base estimator can't handle missing\u001b[39;00m\n\u001b[1;32m 373\u001b[0m 
\u001b[38;5;66;03m# values. Only the criterion is required to determine if the tree supports\u001b[39;00m\n\u001b[1;32m 374\u001b[0m \u001b[38;5;66;03m# missing values.\u001b[39;00m\n\u001b[1;32m 375\u001b[0m estimator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mestimator)(criterion\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcriterion)\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/base.py:650\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 648\u001b[0m y \u001b[38;5;241m=\u001b[39m check_array(y, input_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcheck_y_params)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 650\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_X_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 651\u001b[0m out \u001b[38;5;241m=\u001b[39m X, y\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m check_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mensure_2d\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py:1318\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1297\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1298\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mestimator_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m requires y to be passed, but the target y is None\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1299\u001b[0m )\n\u001b[1;32m 1301\u001b[0m X \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[1;32m 1302\u001b[0m X,\n\u001b[1;32m 1303\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1315\u001b[0m input_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1316\u001b[0m )\n\u001b[0;32m-> 1318\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[43m_check_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmulti_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmulti_output\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_numeric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_numeric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1320\u001b[0m check_consistent_length(X, y)\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X, y\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py:1328\u001b[0m, in \u001b[0;36m_check_y\u001b[0;34m(y, multi_output, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Isolated part of check_X_y dedicated to y validation\"\"\"\u001b[39;00m\n\u001b[1;32m 1327\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m multi_output:\n\u001b[0;32m-> 1328\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_array\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1329\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1330\u001b[0m \u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcsr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1331\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_all_finite\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1332\u001b[0m \u001b[43m \u001b[49m\u001b[43mensure_2d\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1333\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1334\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43my\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1335\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1336\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1337\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1338\u001b[0m estimator_name \u001b[38;5;241m=\u001b[39m 
_check_estimator_name(estimator)\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py:1064\u001b[0m, in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 1058\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1059\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFound array with dim \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m. \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m expected <= 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1060\u001b[0m \u001b[38;5;241m%\u001b[39m (array\u001b[38;5;241m.\u001b[39mndim, estimator_name)\n\u001b[1;32m 1061\u001b[0m )\n\u001b[1;32m 1063\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m force_all_finite:\n\u001b[0;32m-> 1064\u001b[0m \u001b[43m_assert_all_finite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1065\u001b[0m \u001b[43m \u001b[49m\u001b[43marray\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1066\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1067\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1068\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_nan\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce_all_finite\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mallow-nan\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1069\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1071\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m copy:\n\u001b[1;32m 1072\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m _is_numpy_namespace(xp):\n\u001b[1;32m 1073\u001b[0m \u001b[38;5;66;03m# only make a copy if `array` and `array_orig` may share memory`\u001b[39;00m\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py:123\u001b[0m, in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m first_pass_isfinite:\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[0;32m--> 123\u001b[0m \u001b[43m_assert_all_finite_element_wise\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 124\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 125\u001b[0m \u001b[43m \u001b[49m\u001b[43mxp\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mxp\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_nan\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallow_nan\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 127\u001b[0m \u001b[43m \u001b[49m\u001b[43mmsg_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmsg_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 128\u001b[0m \u001b[43m \u001b[49m\u001b[43mestimator_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 129\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 130\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m~/miniforge3/envs/ai/lib/python3.11/site-packages/sklearn/utils/validation.py:172\u001b[0m, in \u001b[0;36m_assert_all_finite_element_wise\u001b[0;34m(X, xp, allow_nan, msg_dtype, estimator_name, input_name)\u001b[0m\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m estimator_name \u001b[38;5;129;01mand\u001b[39;00m input_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m has_nan_error:\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# Improve the error message on how to handle missing values in\u001b[39;00m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# scikit-learn.\u001b[39;00m\n\u001b[1;32m 158\u001b[0m msg_err \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 159\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mestimator_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not accept missing values\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m encoded as NaN natively. For supervised learning, you might want\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m#estimators-that-handle-nan-values\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 171\u001b[0m )\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg_err)\n",
+ "\u001b[0;31mValueError\u001b[0m: Input y contains NaN."
]
}
],
@@ -4472,9 +4473,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Ahh... dam! Looks like the model we're trying to use doesn't work with missing values.\n",
+ "Ahh... dam! Another `ValueError` (our input data contains missing values).\n",
+ "\n",
+ "`ValueError: Input y contains NaN.`\n",
"\n",
- "When we try to fit it on a dataset with missing samples, Scikit-Learn produces the error:\n",
+ "Looks like the model we're trying to use doesn't work with missing values.\n",
+ "\n",
+ "When we try to fit it on a dataset with missing values, Scikit-Learn produces an error similar to:\n",
"\n",
"`ValueError: Input X contains NaN. RandomForestRegressor does not accept missing values encoded as NaN natively...`\n",
"\n",
@@ -4504,7 +4509,7 @@
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 53,
"metadata": {},
"outputs": [
{
@@ -4518,7 +4523,7 @@
"dtype: int64"
]
},
- "execution_count": 51,
+ "execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
@@ -4562,6 +4567,7 @@
"| `Odometer (KM)` | mean of `Odometer (KM)` | \n",
"| `Price` (target) | NA, remove samples missing `Price` |\n",
"\n",
+ "\n",
"> **Note:** The practice of filling missing data with given or calculated values is called [**imputation**](https://scikit-learn.org/stable/modules/impute.html). And it's important to remember there's no perfect way to fill missing data (unless it's with data that should've actually been there in the first place). The methods we're using are only one of many. The techniques you use will depend heavily on your dataset. A good place to look would be searching for \"data imputation techniques\".\n",
"\n",
"Let's start with the `Make` column.\n",
@@ -4571,12 +4577,15 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"# Fill the missing values in the Make column\n",
- "car_sales_missing[\"Make\"].fillna(value=\"missing\", inplace=True)"
+ "# Note: In previous versions of pandas, inplace=True was possible here; however, this behaviour will change in a future version, so use reassignment instead.\n",
+ "# car_sales_missing[\"Make\"].fillna(value=\"missing\", inplace=True)\n",
+ "\n",
+ "car_sales_missing[\"Make\"] = car_sales_missing[\"Make\"].fillna(value=\"missing\")"
]
},
{
@@ -4588,12 +4597,15 @@
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
+ "# Note: In previous versions of pandas, inplace=True was possible here; however, this behaviour will change in a future version, so use reassignment instead.\n",
+ "# car_sales_missing[\"Colour\"].fillna(value=\"missing\", inplace=True)\n",
+ "\n",
"# Fill the Colour column\n",
- "car_sales_missing[\"Colour\"].fillna(value=\"missing\", inplace=True)"
+ "car_sales_missing[\"Colour\"] = car_sales_missing[\"Colour\"].fillna(value=\"missing\")"
]
},
{
@@ -4605,7 +4617,7 @@
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 57,
"metadata": {},
"outputs": [
{
@@ -4619,7 +4631,7 @@
"dtype: int64"
]
},
- "execution_count": 54,
+ "execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
@@ -4639,7 +4651,7 @@
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 58,
"metadata": {},
"outputs": [
{
@@ -4652,7 +4664,7 @@
"Name: count, dtype: int64"
]
},
- "execution_count": 55,
+ "execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
@@ -4664,12 +4676,12 @@
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# Fill the Doors column with the most common value\n",
- "car_sales_missing[\"Doors\"].fillna(value=4, inplace=True)"
+ "car_sales_missing[\"Doors\"] = car_sales_missing[\"Doors\"].fillna(value=4)"
]
},
{
@@ -4681,12 +4693,14 @@
},
{
"cell_type": "code",
- "execution_count": 57,
+ "execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Fill the Odometer (KM) column\n",
- "car_sales_missing[\"Odometer (KM)\"].fillna(value=car_sales_missing[\"Odometer (KM)\"].mean(), inplace=True)"
+ "# Old: car_sales_missing[\"Odometer (KM)\"].fillna(value=car_sales_missing[\"Odometer (KM)\"].mean(), inplace=True)\n",
+ "\n",
+ "car_sales_missing[\"Odometer (KM)\"] = car_sales_missing[\"Odometer (KM)\"].fillna(value=car_sales_missing[\"Odometer (KM)\"].mean())"
]
},
{
@@ -4698,7 +4712,7 @@
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": 61,
"metadata": {},
"outputs": [
{
@@ -4712,7 +4726,7 @@
"dtype: int64"
]
},
- "execution_count": 58,
+ "execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
@@ -4730,12 +4744,14 @@
"\n",
"Finally, we can remove the rows which are missing the target value `Price`.\n",
"\n",
- "> **Note:** Another option would be to impute the `Price` value with the mean or median or some other calculated value (such as by using similar cars to estimate the price), however, to keep things simple and prevent introducing too many fake labels to the data, we'll remove the samples missing a `Price` value. "
+ "> **Note:** Another option would be to impute the `Price` value with the mean or median or some other calculated value (such as by using similar cars to estimate the price), however, to keep things simple and prevent introducing too many fake labels to the data, we'll remove the samples missing a `Price` value. \n",
+ "\n",
+ "We can remove rows with missing values in place from a pandas DataFrame with the [`pandas.DataFrame.dropna(inplace=True)`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html) method."
]
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@@ -4752,7 +4768,7 @@
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": 63,
"metadata": {},
"outputs": [
{
@@ -4766,7 +4782,7 @@
"dtype: int64"
]
},
- "execution_count": 60,
+ "execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
@@ -4785,7 +4801,7 @@
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": 64,
"metadata": {},
"outputs": [
{
@@ -4794,7 +4810,7 @@
"950"
]
},
- "execution_count": 61,
+ "execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
@@ -4823,7 +4839,7 @@
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": 65,
"metadata": {},
"outputs": [
{
@@ -4852,7 +4868,7 @@
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 66,
"metadata": {},
"outputs": [
{
@@ -4873,7 +4889,7 @@
" 0.00000e+00, 2.48360e+05]])"
]
},
- "execution_count": 63,
+ "execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
@@ -4898,7 +4914,7 @@
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 67,
"metadata": {},
"outputs": [
{
@@ -4907,7 +4923,7 @@
"0.22011714008302485"
]
},
- "execution_count": 64,
+ "execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
@@ -4955,7 +4971,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 68,
"metadata": {},
"outputs": [
{
@@ -4969,7 +4985,7 @@
"dtype: int64"
]
},
- "execution_count": 65,
+ "execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
@@ -4987,7 +5003,7 @@
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 69,
"metadata": {},
"outputs": [
{
@@ -5001,14 +5017,15 @@
"dtype: int64"
]
},
- "execution_count": 66,
+ "execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# Reimport the DataFrame\n",
- "car_sales_missing = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\")\n",
+ "# Reimport the DataFrame (so that all the missing values are back)\n",
+ "# car_sales_missing = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\") # read from local directory\n",
+ "car_sales_missing = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv\") # read directly from URL (source: https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended-missing-data.csv)\n",
"car_sales_missing.isna().sum()"
]
},
@@ -5021,7 +5038,7 @@
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@@ -5038,7 +5055,7 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 71,
"metadata": {},
"outputs": [
{
@@ -5052,7 +5069,7 @@
"dtype: int64"
]
},
- "execution_count": 68,
+ "execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
@@ -5072,7 +5089,7 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@@ -5106,7 +5123,7 @@
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -5137,7 +5154,7 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@@ -5181,7 +5198,7 @@
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@@ -5220,7 +5237,7 @@
},
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 76,
"metadata": {},
"outputs": [
{
@@ -5235,7 +5252,7 @@
" ['Honda', 'missing', 4.0, 150582.0]], dtype=object)"
]
},
- "execution_count": 73,
+ "execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
@@ -5262,7 +5279,7 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 77,
"metadata": {},
"outputs": [
{
@@ -5275,7 +5292,7 @@
"dtype: int64"
]
},
- "execution_count": 74,
+ "execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
@@ -5301,7 +5318,7 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 78,
"metadata": {},
"outputs": [
{
@@ -5314,7 +5331,7 @@
"dtype: int64"
]
},
- "execution_count": 75,
+ "execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
@@ -5333,7 +5350,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 79,
"metadata": {},
"outputs": [
{
@@ -5347,7 +5364,7 @@
"dtype: int64"
]
},
- "execution_count": 76,
+ "execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
@@ -5372,7 +5389,7 @@
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 80,
"metadata": {},
"outputs": [
{
@@ -5451,7 +5468,7 @@
"4 Honda Blue 4.0 219217.0"
]
},
- "execution_count": 77,
+ "execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
@@ -5471,7 +5488,7 @@
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 81,
"metadata": {},
"outputs": [
{
@@ -5486,7 +5503,7 @@
" [0.0, 1.0, 0.0, ..., 1.0, 0.0, 150582.0]], dtype=object)"
]
},
- "execution_count": 78,
+ "execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
@@ -5529,7 +5546,7 @@
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 82,
"metadata": {},
"outputs": [
{
@@ -5538,7 +5555,7 @@
"0.21229043336119102"
]
},
- "execution_count": 79,
+ "execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
@@ -5623,7 +5640,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
@@ -5642,7 +5659,7 @@
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 84,
"metadata": {},
"outputs": [
{
@@ -5758,7 +5775,7 @@
"4 -122.25 3.422 "
]
},
- "execution_count": 81,
+ "execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
@@ -5771,7 +5788,7 @@
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 85,
"metadata": {},
"outputs": [
{
@@ -5780,7 +5797,7 @@
"20640"
]
},
- "execution_count": 82,
+ "execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
@@ -5817,16 +5834,16 @@
},
{
"cell_type": "code",
- "execution_count": 83,
+ "execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.5758549611440122"
+ "0.5758549611440125"
]
},
- "execution_count": 83,
+ "execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
@@ -5879,7 +5896,7 @@
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 87,
"metadata": {},
"outputs": [
{
@@ -5888,7 +5905,7 @@
"0.8059809073051385"
]
},
- "execution_count": 84,
+ "execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
@@ -5941,12 +5958,12 @@
"\n",
"Say you were trying to predict whether or not a patient had heart disease based on their medical records.\n",
"\n",
- "The dataset in `../data/heart-disease.csv` contains data for just that problem."
+ "The dataset in `../data/heart-disease.csv` (also available online as [`heart-disease.csv`](https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/heart-disease.csv)) contains data for just that problem."
]
},
{
"cell_type": "code",
- "execution_count": 85,
+ "execution_count": 90,
"metadata": {},
"outputs": [
{
@@ -6092,19 +6109,20 @@
"4 0 2 1 "
]
},
- "execution_count": 85,
+ "execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "heart_disease = pd.read_csv(\"../data/heart-disease.csv\")\n",
+ "# heart_disease = pd.read_csv(\"../data/heart-disease.csv\") # load from local directory\n",
+ "heart_disease = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv\") # load directly from URL (source: https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/heart-disease.csv)\n",
"heart_disease.head()"
]
},
{
"cell_type": "code",
- "execution_count": 86,
+ "execution_count": 91,
"metadata": {},
"outputs": [
{
@@ -6113,7 +6131,7 @@
"303"
]
},
- "execution_count": 86,
+ "execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
@@ -6144,7 +6162,7 @@
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 92,
"metadata": {},
"outputs": [
{
@@ -6153,7 +6171,7 @@
"0.8688524590163934"
]
},
- "execution_count": 87,
+ "execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
@@ -6206,7 +6224,7 @@
},
{
"cell_type": "code",
- "execution_count": 88,
+ "execution_count": 93,
"metadata": {},
"outputs": [
{
@@ -6215,7 +6233,7 @@
"0.8524590163934426"
]
},
- "execution_count": 88,
+ "execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
@@ -6334,7 +6352,7 @@
},
{
"cell_type": "code",
- "execution_count": 89,
+ "execution_count": 94,
"metadata": {},
"outputs": [
{
@@ -6343,7 +6361,7 @@
"0.8524590163934426"
]
},
- "execution_count": 89,
+ "execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
@@ -6385,7 +6403,7 @@
},
{
"cell_type": "code",
- "execution_count": 90,
+ "execution_count": 95,
"metadata": {},
"outputs": [
{
@@ -6525,7 +6543,7 @@
"4 0 2 "
]
},
- "execution_count": 90,
+ "execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
@@ -6543,7 +6561,7 @@
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 96,
"metadata": {},
"outputs": [
{
@@ -6557,7 +6575,7 @@
"Name: target, dtype: int64"
]
},
- "execution_count": 91,
+ "execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
@@ -6607,7 +6625,7 @@
},
{
"cell_type": "code",
- "execution_count": 92,
+ "execution_count": 97,
"metadata": {},
"outputs": [
{
@@ -6618,7 +6636,7 @@
" 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])"
]
},
- "execution_count": 92,
+ "execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
@@ -6641,16 +6659,16 @@
},
{
"cell_type": "code",
- "execution_count": 93,
+ "execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.8524590163934426"
+ "np.float64(0.8524590163934426)"
]
},
- "execution_count": 93,
+ "execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
@@ -6672,7 +6690,7 @@
},
{
"cell_type": "code",
- "execution_count": 94,
+ "execution_count": 100,
"metadata": {},
"outputs": [
{
@@ -6681,7 +6699,7 @@
"0.8524590163934426"
]
},
- "execution_count": 94,
+ "execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
@@ -6700,7 +6718,7 @@
},
{
"cell_type": "code",
- "execution_count": 95,
+ "execution_count": 101,
"metadata": {},
"outputs": [
{
@@ -6713,7 +6731,7 @@
" [0.18, 0.82]])"
]
},
- "execution_count": 95,
+ "execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
@@ -6732,7 +6750,7 @@
},
{
"cell_type": "code",
- "execution_count": 96,
+ "execution_count": 102,
"metadata": {},
"outputs": [
{
@@ -6741,7 +6759,7 @@
"array([0, 1, 1, 0, 1])"
]
},
- "execution_count": 96,
+ "execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
@@ -6762,7 +6780,7 @@
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 103,
"metadata": {},
"outputs": [
{
@@ -6771,7 +6789,7 @@
"array([[0.89, 0.11]])"
]
},
- "execution_count": 97,
+ "execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
@@ -6792,7 +6810,7 @@
},
{
"cell_type": "code",
- "execution_count": 98,
+ "execution_count": 104,
"metadata": {},
"outputs": [
{
@@ -6801,7 +6819,7 @@
"array([0])"
]
},
- "execution_count": 98,
+ "execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
@@ -6826,7 +6844,7 @@
},
{
"cell_type": "code",
- "execution_count": 99,
+ "execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
@@ -6860,16 +6878,16 @@
},
{
"cell_type": "code",
- "execution_count": 100,
+ "execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.3270458119670544"
+ "np.float64(0.3270458119670544)"
]
},
- "execution_count": 100,
+ "execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
@@ -6923,7 +6941,7 @@
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
@@ -6956,7 +6974,7 @@
},
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 108,
"metadata": {},
"outputs": [
{
@@ -6965,7 +6983,7 @@
"0.8524590163934426"
]
},
- "execution_count": 102,
+ "execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
@@ -7001,7 +7019,7 @@
},
{
"cell_type": "code",
- "execution_count": 103,
+ "execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
@@ -7032,7 +7050,7 @@
},
{
"cell_type": "code",
- "execution_count": 104,
+ "execution_count": 110,
"metadata": {},
"outputs": [
{
@@ -7041,7 +7059,7 @@
"0.8059809073051385"
]
},
- "execution_count": 104,
+ "execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
@@ -7085,7 +7103,7 @@
},
{
"cell_type": "code",
- "execution_count": 105,
+ "execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
@@ -7123,7 +7141,7 @@
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 112,
"metadata": {},
"outputs": [
{
@@ -7132,7 +7150,7 @@
"0.8524590163934426"
]
},
- "execution_count": 106,
+ "execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
@@ -7144,7 +7162,7 @@
},
{
"cell_type": "code",
- "execution_count": 107,
+ "execution_count": 113,
"metadata": {},
"outputs": [
{
@@ -7153,7 +7171,7 @@
"array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])"
]
},
- "execution_count": 107,
+ "execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
@@ -7211,7 +7229,7 @@
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 114,
"metadata": {},
"outputs": [
{
@@ -7220,7 +7238,7 @@
"array([0.83606557, 0.8852459 , 0.7704918 , 0.8 , 0.8 ])"
]
},
- "execution_count": 108,
+ "execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
@@ -7241,16 +7259,16 @@
},
{
"cell_type": "code",
- "execution_count": 109,
+ "execution_count": 115,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(0.8524590163934426, 0.8248087431693989)"
+ "(0.8524590163934426, np.float64(0.8248087431693989))"
]
},
- "execution_count": 109,
+ "execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
@@ -7284,7 +7302,7 @@
},
{
"cell_type": "code",
- "execution_count": 110,
+ "execution_count": 116,
"metadata": {},
"outputs": [
{
@@ -7293,7 +7311,7 @@
"array([0.78688525, 0.86885246, 0.80327869, 0.78333333, 0.76666667])"
]
},
- "execution_count": 110,
+ "execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
@@ -7337,7 +7355,7 @@
},
{
"cell_type": "code",
- "execution_count": 111,
+ "execution_count": 117,
"metadata": {},
"outputs": [
{
@@ -7346,7 +7364,7 @@
"0.8524590163934426"
]
},
- "execution_count": 111,
+ "execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
@@ -7382,7 +7400,7 @@
},
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": 118,
"metadata": {},
"outputs": [
{
@@ -7422,7 +7440,7 @@
},
{
"cell_type": "code",
- "execution_count": 113,
+ "execution_count": 119,
"metadata": {},
"outputs": [
{
@@ -7435,7 +7453,7 @@
" 0.65517241, 0.72413793, 0.72413793, 0.82758621, 1. ])"
]
},
- "execution_count": 113,
+ "execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
@@ -7469,12 +7487,12 @@
},
{
"cell_type": "code",
- "execution_count": 114,
+ "execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -7522,16 +7540,16 @@
},
{
"cell_type": "code",
- "execution_count": 115,
+ "execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.9304956896551724"
+ "np.float64(0.9304956896551724)"
]
},
- "execution_count": 115,
+ "execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
@@ -7556,12 +7574,12 @@
},
{
"cell_type": "code",
- "execution_count": 116,
+ "execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -7590,12 +7608,12 @@
},
{
"cell_type": "code",
- "execution_count": 117,
+ "execution_count": 124,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -7612,16 +7630,16 @@
},
{
"cell_type": "code",
- "execution_count": 118,
+ "execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "1.0"
+ "np.float64(1.0)"
]
},
- "execution_count": 118,
+ "execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
@@ -7653,7 +7671,7 @@
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": 126,
"metadata": {},
"outputs": [
{
@@ -7663,7 +7681,7 @@
" [ 4, 28]])"
]
},
- "execution_count": 119,
+ "execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
@@ -7687,7 +7705,7 @@
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 127,
"metadata": {},
"outputs": [
{
@@ -7742,7 +7760,7 @@
"1 4 28"
]
},
- "execution_count": 120,
+ "execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
@@ -7775,12 +7793,12 @@
},
{
"cell_type": "code",
- "execution_count": 121,
+ "execution_count": 128,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -7797,12 +7815,12 @@
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": 129,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -7834,7 +7852,7 @@
},
{
"cell_type": "code",
- "execution_count": 123,
+ "execution_count": 130,
"metadata": {},
"outputs": [
{
@@ -7890,7 +7908,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 131,
"metadata": {},
"outputs": [
{
@@ -7966,7 +7984,7 @@
"support 9999.00000 1.0 0.9999 10000.000000 10000.00000"
]
},
- "execution_count": 124,
+ "execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
@@ -8020,7 +8038,7 @@
},
{
"cell_type": "code",
- "execution_count": 125,
+ "execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
@@ -8053,7 +8071,7 @@
},
{
"cell_type": "code",
- "execution_count": 126,
+ "execution_count": 133,
"metadata": {},
"outputs": [
{
@@ -8062,7 +8080,7 @@
"0.8059809073051385"
]
},
- "execution_count": 126,
+ "execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
@@ -8083,7 +8101,7 @@
},
{
"cell_type": "code",
- "execution_count": 127,
+ "execution_count": 134,
"metadata": {},
"outputs": [
{
@@ -8092,7 +8110,7 @@
"0.0"
]
},
- "execution_count": 127,
+ "execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
@@ -8115,7 +8133,7 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 135,
"metadata": {},
"outputs": [
{
@@ -8124,7 +8142,7 @@
"1.0"
]
},
- "execution_count": 128,
+ "execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
@@ -8151,16 +8169,16 @@
},
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.3270458119670544"
+ "np.float64(0.3270458119670544)"
]
},
- "execution_count": 129,
+ "execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
@@ -8187,7 +8205,7 @@
},
{
"cell_type": "code",
- "execution_count": 130,
+ "execution_count": 137,
"metadata": {},
"outputs": [
{
@@ -8293,7 +8311,7 @@
"[4128 rows x 2 columns]"
]
},
- "execution_count": 130,
+ "execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
@@ -8316,12 +8334,12 @@
},
{
"cell_type": "code",
- "execution_count": 131,
+ "execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -8351,16 +8369,16 @@
},
{
"cell_type": "code",
- "execution_count": 132,
+ "execution_count": 139,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.2542443610174998"
+ "np.float64(0.2542443610174998)"
]
},
- "execution_count": 132,
+ "execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
@@ -8407,7 +8425,7 @@
},
{
"cell_type": "code",
- "execution_count": 133,
+ "execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
@@ -8431,7 +8449,7 @@
},
{
"cell_type": "code",
- "execution_count": 134,
+ "execution_count": 141,
"metadata": {},
"outputs": [
{
@@ -8440,7 +8458,7 @@
"array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])"
]
},
- "execution_count": 134,
+ "execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
@@ -8462,7 +8480,7 @@
},
{
"cell_type": "code",
- "execution_count": 135,
+ "execution_count": 142,
"metadata": {},
"outputs": [
{
@@ -8487,7 +8505,7 @@
},
{
"cell_type": "code",
- "execution_count": 136,
+ "execution_count": 143,
"metadata": {},
"outputs": [
{
@@ -8515,7 +8533,7 @@
},
{
"cell_type": "code",
- "execution_count": 137,
+ "execution_count": 144,
"metadata": {},
"outputs": [
{
@@ -8541,7 +8559,7 @@
},
{
"cell_type": "code",
- "execution_count": 138,
+ "execution_count": 145,
"metadata": {},
"outputs": [
{
@@ -8567,7 +8585,7 @@
},
{
"cell_type": "code",
- "execution_count": 139,
+ "execution_count": 146,
"metadata": {},
"outputs": [
{
@@ -8595,7 +8613,7 @@
},
{
"cell_type": "code",
- "execution_count": 140,
+ "execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
@@ -8614,23 +8632,28 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The default is `\"r2\"`."
+ "The default is `\"r2\"`.\n",
+ "\n",
+ "> **Note:** We can time how long a single cell of code takes to run using the [`%%time` magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-time)."
]
},
{
"cell_type": "code",
- "execution_count": 141,
+ "execution_count": 150,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "The cross-validated R^2 score is: 0.65\n"
+ "The cross-validated R^2 score is: 0.65\n",
+ "CPU times: user 40.5 s, sys: 286 ms, total: 40.8 s\n",
+ "Wall time: 41.6 s\n"
]
}
],
"source": [
+ "%%time \n",
"np.random.seed(42)\n",
"cv_r2 = cross_val_score(model, X, y, cv=5, scoring=\"r2\")\n",
"print(f\"The cross-validated R^2 score is: {np.mean(cv_r2):.2f}\")"
@@ -8640,23 +8663,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "But we can use `\"neg_mean_absolute_error\"` for MAE (mean absolute error)."
+ "But we can use `\"neg_mean_absolute_error\"` for MAE (mean absolute error).\n",
+ "\n"
]
},
{
"cell_type": "code",
- "execution_count": 142,
+ "execution_count": 151,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "The cross-validated MAE score is: -0.47\n"
+ "The cross-validated MAE score is: -0.47\n",
+ "CPU times: user 40.4 s, sys: 246 ms, total: 40.7 s\n",
+ "Wall time: 41.6 s\n"
]
}
],
"source": [
+ "%%time\n",
"np.random.seed(42)\n",
"cv_mae = cross_val_score(model, X, y, cv=5, scoring=\"neg_mean_absolute_error\")\n",
"print(f\"The cross-validated MAE score is: {np.mean(cv_mae):.2f}\")"
@@ -8678,7 +8705,7 @@
},
{
"cell_type": "code",
- "execution_count": 143,
+ "execution_count": 159,
"metadata": {},
"outputs": [
{
@@ -8730,7 +8757,7 @@
},
{
"cell_type": "code",
- "execution_count": 144,
+ "execution_count": 160,
"metadata": {},
"outputs": [
{
@@ -8787,7 +8814,7 @@
},
{
"cell_type": "code",
- "execution_count": 145,
+ "execution_count": 161,
"metadata": {},
"outputs": [
{
@@ -8875,7 +8902,7 @@
"* Could we improve our data? This could mean filling in misisng values or finding a better encoding (turning data into numbers) strategy.\n",
"\n",
"From a model perspective asks:\n",
- "* Is there a better model we could use? If you've started out with a simple model, could you use a more complex one? (we saw an example of this when looking at the [Scikit-Learn machine learning map](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html), ensemble methods are generally considered more complex models)\n",
+ "* Is there a better model we could use? If you've started out with a simple model, could you use a more complex one? (we saw an example of this when looking at the [Scikit-Learn machine learning map](https://scikit-learn.org/stable/machine_learning_map.html), ensemble methods are generally considered more complex models)\n",
"* Could we improve the current model? If the model you're using performs well straight out of the box, can the hyperparameters be tuned to make it even better?\n",
"\n",
"> **Note:** Patterns in data are also often referred to as data parameters. The difference between *parameters* and *hyperparameters* is a machine learning model seeks to find parameters in data on its own, where as, hyperparameters are settings on a model which a person (you) can adjust.\n",
@@ -8893,7 +8920,7 @@
},
{
"cell_type": "code",
- "execution_count": 146,
+ "execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
@@ -8913,7 +8940,7 @@
},
{
"cell_type": "code",
- "execution_count": 147,
+ "execution_count": 163,
"metadata": {},
"outputs": [
{
@@ -8931,6 +8958,7 @@
" 'min_samples_leaf': 1,\n",
" 'min_samples_split': 2,\n",
" 'min_weight_fraction_leaf': 0.0,\n",
+ " 'monotonic_cst': None,\n",
" 'n_estimators': 100,\n",
" 'n_jobs': None,\n",
" 'oob_score': False,\n",
@@ -8939,7 +8967,7 @@
" 'warm_start': False}"
]
},
- "execution_count": 147,
+ "execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
@@ -9024,7 +9052,7 @@
},
{
"cell_type": "code",
- "execution_count": 148,
+ "execution_count": 164,
"metadata": {},
"outputs": [
{
@@ -9042,6 +9070,7 @@
" 'min_samples_leaf': 1,\n",
" 'min_samples_split': 2,\n",
" 'min_weight_fraction_leaf': 0.0,\n",
+ " 'monotonic_cst': None,\n",
" 'n_estimators': 100,\n",
" 'n_jobs': None,\n",
" 'oob_score': False,\n",
@@ -9050,7 +9079,7 @@
" 'warm_start': False}"
]
},
- "execution_count": 148,
+ "execution_count": 164,
"metadata": {},
"output_type": "execute_result"
}
@@ -9081,7 +9110,7 @@
},
{
"cell_type": "code",
- "execution_count": 149,
+ "execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
@@ -9119,7 +9148,7 @@
},
{
"cell_type": "code",
- "execution_count": 150,
+ "execution_count": 166,
"metadata": {},
"outputs": [
{
@@ -9135,10 +9164,13 @@
{
"data": {
"text/plain": [
- "{'accuracy': 0.8, 'precision': 0.78, 'recall': 0.88, 'f1': 0.82}"
+ "{'accuracy': 0.8,\n",
+ " 'precision': np.float64(0.78),\n",
+ " 'recall': np.float64(0.88),\n",
+ " 'f1': np.float64(0.82)}"
]
},
- "execution_count": 150,
+ "execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
@@ -9152,7 +9184,8 @@
"np.random.seed(42)\n",
"\n",
"# Read in the data\n",
- "heart_disease = pd.read_csv(\"../data/heart-disease.csv\")\n",
+ "# heart_disease = pd.read_csv(\"../data/heart-disease.csv\") # load in from local directory\n",
+ "heart_disease = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv\") # load directly from URL (source: https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/heart-disease.csv)\n",
"\n",
"# Split into X (features) & y (labels)\n",
"X = heart_disease.drop(\"target\", axis=1)\n",
@@ -9177,7 +9210,7 @@
},
{
"cell_type": "code",
- "execution_count": 151,
+ "execution_count": 167,
"metadata": {},
"outputs": [
{
@@ -9208,7 +9241,7 @@
},
{
"cell_type": "code",
- "execution_count": 152,
+ "execution_count": 168,
"metadata": {},
"outputs": [
{
@@ -9278,7 +9311,7 @@
},
{
"cell_type": "code",
- "execution_count": 153,
+ "execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
@@ -9313,7 +9346,7 @@
},
{
"cell_type": "code",
- "execution_count": 154,
+ "execution_count": 170,
"metadata": {},
"outputs": [
{
@@ -9355,7 +9388,7 @@
},
{
"cell_type": "code",
- "execution_count": 155,
+ "execution_count": 171,
"metadata": {},
"outputs": [
{
@@ -9363,16 +9396,16 @@
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 30 candidates, totalling 150 fits\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=10, max_features=None, min_samples_leaf=8, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
"[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
@@ -9383,21 +9416,21 @@
"[CV] END max_depth=5, max_features=log2, min_samples_leaf=2, min_samples_split=8, n_estimators=100; total time= 0.1s\n",
"[CV] END max_depth=5, max_features=log2, min_samples_leaf=2, min_samples_split=8, n_estimators=100; total time= 0.1s\n",
"[CV] END max_depth=5, max_features=log2, min_samples_leaf=2, min_samples_split=8, n_estimators=100; total time= 0.1s\n",
+ "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=10, max_features=log2, min_samples_leaf=8, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 0.8s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=8, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
"[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
@@ -9413,36 +9446,36 @@
"[CV] END max_depth=5, max_features=log2, min_samples_leaf=8, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=log2, min_samples_leaf=8, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=log2, min_samples_leaf=8, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 0.8s\n",
+ "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 0.8s\n",
+ "[CV] END max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=4, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
"[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 1.1s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 1.0s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 1.1s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 0.8s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 0.8s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=1200; total time= 0.7s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.7s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=8, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
"[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time= 0.0s\n",
@@ -9453,41 +9486,41 @@
"[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time= 0.1s\n",
"[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time= 0.1s\n",
"[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time= 0.1s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.5s\n",
- "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=4, min_samples_split=8, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=6, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
@@ -9498,10 +9531,10 @@
"[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.1s\n",
"[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.1s\n",
"[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=100; total time= 0.1s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
- "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
+ "[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.3s\n",
"[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time= 0.4s\n",
"[CV] END max_depth=None, max_features=None, min_samples_leaf=8, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=None, max_features=None, min_samples_leaf=8, min_samples_split=4, n_estimators=10; total time= 0.0s\n",
@@ -9513,7 +9546,7 @@
"[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
"[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=10; total time= 0.0s\n",
- "[INFO] Total time taken for 30 random combinations of hyperparameters: 49.50 seconds.\n"
+ "[INFO] Total time taken for 30 random combinations of hyperparameters: 36.02 seconds.\n"
]
}
],
@@ -9561,7 +9594,7 @@
},
{
"cell_type": "code",
- "execution_count": 156,
+ "execution_count": 174,
"metadata": {},
"outputs": [
{
@@ -9574,7 +9607,7 @@
" 'max_depth': 30}"
]
},
- "execution_count": 156,
+ "execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
@@ -9593,7 +9626,7 @@
},
{
"cell_type": "code",
- "execution_count": 157,
+ "execution_count": 175,
"metadata": {},
"outputs": [
{
@@ -9643,7 +9676,7 @@
},
{
"cell_type": "code",
- "execution_count": 158,
+ "execution_count": 176,
"metadata": {},
"outputs": [
{
@@ -9656,7 +9689,7 @@
" 'min_samples_leaf': [1, 2, 4, 8]}"
]
},
- "execution_count": 158,
+ "execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
@@ -9693,7 +9726,7 @@
},
{
"cell_type": "code",
- "execution_count": 159,
+ "execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
@@ -9714,7 +9747,7 @@
},
{
"cell_type": "code",
- "execution_count": 160,
+ "execution_count": 178,
"metadata": {},
"outputs": [
{
@@ -9745,7 +9778,7 @@
},
{
"cell_type": "code",
- "execution_count": 161,
+ "execution_count": 179,
"metadata": {},
"outputs": [
{
@@ -9754,125 +9787,125 @@
"text": [
"Fitting 5 folds for each of 24 candidates, totalling 120 fits\n",
"[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.7s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.7s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=40, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=4, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.1s\n",
"[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.2s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.9s\n",
- "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.8s\n"
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=6, n_estimators=1000; total time= 0.7s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time= 0.1s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.6s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.5s\n",
+ "[CV] END max_depth=50, max_features=log2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time= 0.5s\n"
]
}
],
@@ -9910,14 +9943,14 @@
},
{
"cell_type": "code",
- "execution_count": 162,
+ "execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "[INFO] The total running time for running GridSearchCV was 61.45 seconds.\n"
+ "[INFO] The total running time for running GridSearchCV was 41.95 seconds.\n"
]
}
],
@@ -9936,7 +9969,7 @@
},
{
"cell_type": "code",
- "execution_count": 163,
+ "execution_count": 182,
"metadata": {},
"outputs": [
{
@@ -9949,7 +9982,7 @@
" 'n_estimators': 200}"
]
},
- "execution_count": 163,
+ "execution_count": 182,
"metadata": {},
"output_type": "execute_result"
}
@@ -9968,7 +10001,7 @@
},
{
"cell_type": "code",
- "execution_count": 164,
+ "execution_count": 183,
"metadata": {},
"outputs": [
{
@@ -9984,10 +10017,13 @@
{
"data": {
"text/plain": [
- "{'accuracy': 0.89, 'precision': 0.88, 'recall': 0.91, 'f1': 0.89}"
+ "{'accuracy': 0.89,\n",
+ " 'precision': np.float64(0.88),\n",
+ " 'recall': np.float64(0.91),\n",
+ " 'f1': np.float64(0.89)}"
]
},
- "execution_count": 164,
+ "execution_count": 183,
"metadata": {},
"output_type": "execute_result"
}
@@ -10010,12 +10046,12 @@
},
{
"cell_type": "code",
- "execution_count": 165,
+ "execution_count": 184,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -10074,7 +10110,7 @@
},
{
"cell_type": "code",
- "execution_count": 166,
+ "execution_count": 185,
"metadata": {},
"outputs": [],
"source": [
@@ -10094,7 +10130,7 @@
},
{
"cell_type": "code",
- "execution_count": 167,
+ "execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
@@ -10111,7 +10147,7 @@
},
{
"cell_type": "code",
- "execution_count": 168,
+ "execution_count": 187,
"metadata": {},
"outputs": [
{
@@ -10127,10 +10163,13 @@
{
"data": {
"text/plain": [
- "{'accuracy': 0.89, 'precision': 0.88, 'recall': 0.91, 'f1': 0.89}"
+ "{'accuracy': 0.89,\n",
+ " 'precision': np.float64(0.88),\n",
+ " 'recall': np.float64(0.91),\n",
+ " 'f1': np.float64(0.89)}"
]
},
- "execution_count": 168,
+ "execution_count": 187,
"metadata": {},
"output_type": "execute_result"
}
@@ -10151,7 +10190,7 @@
},
{
"cell_type": "code",
- "execution_count": 169,
+ "execution_count": 188,
"metadata": {},
"outputs": [
{
@@ -10160,7 +10199,7 @@
"True"
]
},
- "execution_count": 169,
+ "execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
@@ -10182,7 +10221,7 @@
},
{
"cell_type": "code",
- "execution_count": 170,
+ "execution_count": 189,
"metadata": {},
"outputs": [
{
@@ -10191,7 +10230,7 @@
"['gs_random_forest_model_1.joblib']"
]
},
- "execution_count": 170,
+ "execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
@@ -10213,7 +10252,7 @@
},
{
"cell_type": "code",
- "execution_count": 171,
+ "execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
@@ -10230,7 +10269,7 @@
},
{
"cell_type": "code",
- "execution_count": 172,
+ "execution_count": 191,
"metadata": {},
"outputs": [
{
@@ -10246,10 +10285,13 @@
{
"data": {
"text/plain": [
- "{'accuracy': 0.89, 'precision': 0.88, 'recall': 0.91, 'f1': 0.89}"
+ "{'accuracy': 0.89,\n",
+ " 'precision': np.float64(0.88),\n",
+ " 'recall': np.float64(0.91),\n",
+ " 'f1': np.float64(0.89)}"
]
},
- "execution_count": 172,
+ "execution_count": 191,
"metadata": {},
"output_type": "execute_result"
}
@@ -10270,7 +10312,7 @@
},
{
"cell_type": "code",
- "execution_count": 173,
+ "execution_count": 192,
"metadata": {},
"outputs": [
{
@@ -10279,7 +10321,7 @@
"True"
]
},
- "execution_count": 173,
+ "execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
@@ -10329,12 +10371,12 @@
"\n",
"Good news is, `Pipeline` can help us clean it up.\n",
"\n",
- "Let's remind ourselves what the data looks like."
+ "Let's remind ourselves what our [`car-sales-extended-missing-data.csv`](https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended-missing-data.csv) dataset looks like as a DataFrame."
]
},
{
"cell_type": "code",
- "execution_count": 174,
+ "execution_count": 193,
"metadata": {},
"outputs": [
{
@@ -10419,19 +10461,20 @@
"4 Nissan Blue 181577.0 3.0 14043.0"
]
},
- "execution_count": 174,
+ "execution_count": 193,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "data = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\")\n",
+ "# data = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\") # load from local directory\n",
+ "data = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv\") # load directly from URL\n",
"data.head()"
]
},
{
"cell_type": "code",
- "execution_count": 175,
+ "execution_count": 194,
"metadata": {},
"outputs": [
{
@@ -10445,7 +10488,7 @@
"dtype: object"
]
},
- "execution_count": 175,
+ "execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
@@ -10456,7 +10499,7 @@
},
{
"cell_type": "code",
- "execution_count": 176,
+ "execution_count": 195,
"metadata": {},
"outputs": [
{
@@ -10470,7 +10513,7 @@
"dtype: int64"
]
},
- "execution_count": 176,
+ "execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
@@ -10501,7 +10544,7 @@
},
{
"cell_type": "code",
- "execution_count": 177,
+ "execution_count": 199,
"metadata": {},
"outputs": [
{
@@ -10510,7 +10553,7 @@
"0.22188417408787875"
]
},
- "execution_count": 177,
+ "execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
@@ -10532,7 +10575,7 @@
"np.random.seed(42)\n",
"\n",
"# Import data and drop the rows with missing labels\n",
- "data = pd.read_csv(\"../data/car-sales-extended-missing-data.csv\")\n",
+ "data = pd.read_csv(\"https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv\")\n",
"data.dropna(subset=[\"Price\"], inplace=True)\n",
"\n",
"# Define different features and transformer pipelines\n",
@@ -10594,7 +10637,7 @@
},
{
"cell_type": "code",
- "execution_count": 178,
+ "execution_count": 200,
"metadata": {},
"outputs": [
{
@@ -10612,16 +10655,16 @@
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.9s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
@@ -10632,16 +10675,16 @@
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.9s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
- "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
@@ -10652,16 +10695,16 @@
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.7s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.8s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time= 0.1s\n",
@@ -10672,22 +10715,426 @@
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
"[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time= 0.1s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.8s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.7s\n",
- "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.7s\n"
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.6s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.6s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n",
+ "[CV] END model__max_depth=5, model__max_features=sqrt, model__min_samples_split=4, model__n_estimators=1000, preprocessor__num__imputer__strategy=median; total time= 0.5s\n"
]
},
{
"data": {
"text/html": [
- "GridSearchCV(cv=5,\n",
+ "GridSearchCV(cv=5,\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('imputer',\n",
@@ -10715,7 +11162,7 @@
" 'model__n_estimators': [100, 1000],\n",
" 'preprocessor__num__imputer__strategy': ['mean',\n",
" 'median']},\n",
- " verbose=2) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. GridSearchCV GridSearchCV(cv=5,\n",
+ " verbose=2) In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. GridSearchCV?Documentation for GridSearchCV iFitted GridSearchCV(cv=5,\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('imputer',\n",
@@ -10743,7 +11190,7 @@
" 'model__n_estimators': [100, 1000],\n",
" 'preprocessor__num__imputer__strategy': ['mean',\n",
" 'median']},\n",
- " verbose=2) best_estimator_: Pipeline Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value='missing',\n",
@@ -10760,7 +11207,9 @@
" Pipeline(steps=[('imputer',\n",
" SimpleImputer())]),\n",
" ['Odometer (KM)'])])),\n",
- " ('model', RandomForestRegressor(n_jobs=-1))]) preprocessor: ColumnTransformer?Documentation for preprocessor: ColumnTransformer ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value='missing',\n",
" strategy='constant')),\n",
@@ -10774,7 +11223,7 @@
" ['Doors']),\n",
" ('num',\n",
" Pipeline(steps=[('imputer', SimpleImputer())]),\n",
- " ['Odometer (KM)'])]) "
+ " ['Odometer (KM)'])])
"
],
"text/plain": [
"GridSearchCV(cv=5,\n",
@@ -10808,12 +11257,14 @@
" verbose=2)"
]
},
- "execution_count": 178,
+ "execution_count": 200,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "%%time\n",
+ "\n",
"# Using grid search with pipeline\n",
"pipe_grid = {\n",
" \"preprocessor__num__imputer__strategy\": [\"mean\", \"median\"], # note the double underscore after each prefix \"preprocessor__\"\n",
@@ -10836,7 +11287,7 @@
},
{
"cell_type": "code",
- "execution_count": 179,
+ "execution_count": 201,
"metadata": {},
"outputs": [
{
@@ -10845,7 +11296,7 @@
"0.2848784564026805"
]
},
- "execution_count": 179,
+ "execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}