diff --git a/.github/workflows/make_docs.yml b/.github/workflows/make_docs.yml index 6d37da348..11da63ed5 100644 --- a/.github/workflows/make_docs.yml +++ b/.github/workflows/make_docs.yml @@ -34,6 +34,7 @@ jobs: - run: cp section-2-data-science-and-ml-tools/introduction-to-scikit-learn.ipynb docs/ - run: cp communicating-your-work.md docs/ - run: cp section-4-unstructured-data-projects/end-to-end-dog-vision-v2.ipynb docs/ + - run: cp section-3-structured-data-projects/end-to-end-heart-disease-classification.ipynb docs/ - run: cp images/* docs/images/ # Last step to deploy docs diff --git a/mkdocs.yml b/mkdocs.yml index d4d8e1020..43fd57b9d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -11,6 +11,8 @@ nav: - Introduction to pandas: "introduction-to-pandas.ipynb" - Introduction to Matplotlib: "introduction-to-matplotlib.ipynb" - Introduction to Scikit-Learn: "introduction-to-scikit-learn.ipynb" + - Milestone Projects: + - End-to-End Heart Disease Classification: "end-to-end-heart-disease-classification.ipynb" - Introduction to TensorFlow, Deep Learning and Transfer Learning: "end-to-end-dog-vision-v2.ipynb" - Communicating your work: "communicating-your-work.md" diff --git a/section-3-structured-data-projects/end-to-end-heart-disease-classification.ipynb b/section-3-structured-data-projects/end-to-end-heart-disease-classification.ipynb index b43dbb30d..6acec808c 100644 --- a/section-3-structured-data-projects/end-to-end-heart-disease-classification.ipynb +++ b/section-3-structured-data-projects/end-to-end-heart-disease-classification.ipynb @@ -176,14 +176,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Notebook last updated: 2024-09-13 11:09:57.323291\n", + "Notebook last updated: 2024-09-24 13:29:26.771285\n", "\n", "NumPy version: 2.1.1\n", "pandas version: 2.2.2\n", @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 3, + 
"execution_count": 2, "metadata": {}, "outputs": [ { @@ -260,7 +260,7 @@ "(303, 14)" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -419,7 +419,7 @@ "4 0 2 1 " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -475,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -621,7 +621,7 @@ "4 0 2 1 " ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -633,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -874,7 +874,7 @@ "9 0 2 1 " ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -893,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -905,7 +905,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -930,7 +930,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -942,7 +942,7 @@ "Name: proportion, dtype: float64" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -961,7 +961,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -991,7 +991,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -1037,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -1250,7 +1250,7 @@ "max 3.000000 1.000000 " ] }, - "execution_count": 11, + 
"execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -1278,7 +1278,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1290,7 +1290,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1310,7 +1310,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1365,7 +1365,7 @@ "1 72 93" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1421,7 +1421,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1455,7 +1455,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1494,7 +1494,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1546,7 +1546,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1589,7 +1589,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1656,7 +1656,7 @@ "3 7 16" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1667,7 +1667,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1741,7 +1741,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -2074,7 +2074,7 @@ "target -0.344029 1.000000 " ] }, - "execution_count": 21, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -2094,7 +2094,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -2179,7 +2179,7 @@ }, { "cell_type": "code", - 
"execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -2325,7 +2325,7 @@ "4 0 2 1 " ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2350,7 +2350,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -2370,7 +2370,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -2510,7 +2510,7 @@ "4 0 2 " ] }, - "execution_count": 26, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -2522,7 +2522,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -2545,7 +2545,7 @@ " numpy.ndarray)" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2591,7 +2591,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -2626,7 +2626,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -2766,7 +2766,7 @@ "176 2 2 3 " ] }, - "execution_count": 31, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2777,7 +2777,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -2797,7 +2797,7 @@ " 242)" ] }, - "execution_count": 32, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -2817,7 +2817,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -2957,7 +2957,7 @@ "60 2 1 2 " ] }, - "execution_count": 33, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -2968,7 +2968,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -2980,7 +2980,7 @@ " 61)" ] 
}, - "execution_count": 34, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -3060,7 +3060,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -3103,7 +3103,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -3128,7 +3128,7 @@ " 'Random Forest': 0.8360655737704918}" ] }, - "execution_count": 36, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -3164,7 +3164,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -3256,29 +3256,53 @@ "source": [ "## 6. Hyperparameter tuning and cross-validation\n", "\n", - "UPTOHERE - going through hyperparameter tuning + cross-validation\n", + "To cook your favourite dish, you know to set the oven to 180 degrees and turn the grill on. \n", "\n", - "To cook your favourite dish, you know to set the oven to 180 degrees and turn the grill on. But when your roommate cooks their favourite dish, they set use 200 degrees and the fan-forced mode. Same oven, different settings, different outcomes.\n", + "But when your roommate cooks their favourite dish, they use 200 degrees and the fan-forced mode. \n", + "\n", + "Same oven, different settings, different outcomes.\n", "\n", "The same can be done for machine learning algorithms. You can use the same algorithms but change the settings (hyperparameters) and get different results.\n", "\n", - "But just like turning the oven up too high can burn your food, the same can happen for machine learning algorithms. You change the settings and it works so well, it **overfits** (does too well) the data.\n", + "But just like turning the oven up too high can burn your food, the same can happen for machine learning algorithms. 
\n", + "\n", + "You change the settings and it works so well, it **overfits** (does too well) the data.\n", + "\n", + "We're looking for the [Goldilocks model](https://en.wikipedia.org/wiki/Goldilocks_principle). \n", + "\n", + "One which does well on our training dataset but also on unseen examples like in the testing dataset/real world.\n", + "\n", + "To test different hyperparameters, you could use a **validation set** but since we don't have much data, we'll use [**cross-validation**](https://scikit-learn.org/stable/modules/cross_validation.html).\n", + "\n", + "> **Note:** A validation set is a third player in the training/test split game. It's designed to be used in between a training and test set. You can think of it as the practice exam before the final exam. As in, the training set is the course material to learn on, the validation set is the practice exam to practice and tweak your skills on and the test set is the final exam to push your skills. In machine learning, the model learns patterns on the training set and then you can tweak hyperparameters to improve results on the validation set before finally testing your model on the testing set. All samples in the training, validation and test sets should be kept exclusive of each other.\n", + "\n", + "The most common type of cross-validation is *k-fold*. \n", "\n", - "We're looking for the goldilocks model. One which does well on our dataset but also does well on unseen examples.\n", + "It involves splitting your data into *k-fold's* or *k-different splits of the data* and then training and testing a model on each split.\n", "\n", - "To test different hyperparameters, you could use a **validation set** but since we don't have much data, we'll use **cross-validation**.\n", + "For example, let's say we had 5 folds (k = 5). \n", "\n", - "The most common type of cross-validation is *k-fold*. It involves splitting your data into *k-fold's* and then testing a model on each. 
For example, let's say we had 5 folds (k = 5). This what it might look like.\n", + "This is what it might look like.\n", "\n", "| | \n", "|:--:| \n", "| Normal train and test split versus 5-fold cross-validation |\n", "\n", - "We'll be using this setup to tune the hyperparameters of some of our models and then evaluate them. We'll also get a few more metrics like **precision**, **recall**, **F1-score** and **ROC** at the same time.\n", + "You have 5 different versions of train and test splits.\n", "\n", - "Here's the game plan:\n", + "This means you'll be able to train and test 5 different versions of your model on different data splits and calculate the average performance.\n", "\n", - "1. Tune model hyperparameters, see which performs best\n", + "Why do this?\n", + "\n", + "This prevents us from focusing too much on the metrics from one data split (imagine the data split we do contains all the easy samples and the performance metrics we use say that the model performs better than it does).\n", + "\n", + "We'll be using this setup to tune the hyperparameters of some of our models and then evaluate them. \n", + "\n", + "We'll also get a few more metrics like **precision**, **recall**, **F1-score** and **ROC** at the same time.\n", + "\n", + "Here's the plan:\n", + "\n", + "1. Tune model hyperparameters, and see which performs best\n", "2. Perform cross-validation\n", "3. Plot ROC curves\n", "4. Make a confusion matrix\n", @@ -3292,20 +3316,28 @@ "source": [ "### Tune KNeighborsClassifier (K-Nearest Neighbors or KNN) by hand\n", "\n", - "There's one main hyperparameter we can tune for the K-Nearest Neighbors (KNN) algorithm, and that is number of neighbours. 
The default is 5 (`n_neigbors=5`).\n", + "There are several hyperparameters we can tune for the K-Nearest Neighbors (KNN) algorithm (or [`sklearn.neighbors.KNeighborsClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)).\n", + "\n", + "But for now, let's start with one, the number of neighbors.\n", + "\n", + "The default is 5 (`n_neighbors=5`).\n", "\n", "What are neighbours?\n", "\n", - "Imagine all our different samples on one graph like the scatter graph we have above. KNN works by assuming dots which are closer together belong to the same class. If `n_neighbors=5` then it assume a dot with the 5 closest dots around it are in the same class.\n", + "Well, imagine all our different samples on one graph like the scatter graph several cells above. \n", "\n", - "We've left out some details here like what defines close or how distance is calculated but I encourage you to research them.\n", + "KNN works by assuming dots which are closer together belong to the same class. \n", "\n", - "For now, let's try a few different values of `n_neighbors`." + "If `n_neighbors=5` then it assumes a dot with the 5 closest dots around it are in the same class.\n", + "\n", + "We've left out some details here like what defines close or how distance is calculated but I encourage you to research them by going through the documentation.\n", + "\n", + "For now, let's try a few different values of `n_neighbors` and test how the results go." ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -3339,12 +3371,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's look at KNN's train scores." + "That was quick!\n", + "\n", + "Now let's look at KNN's train scores."
] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -3372,7 +3406,7 @@ " 0.6694214876033058]" ] }, - "execution_count": 37, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -3385,12 +3419,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "These are hard to understand, let's plot them." + "Ok, these are a bit hard to understand, so let's follow the data explorer's motto and *visualize, visualize, visualize!* In other words, let's plot them." ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -3434,20 +3468,27 @@ "\n", "We've tuned `KNN` by hand but let's see how we can `LogisticsRegression` and `RandomForestClassifier` using [`RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html).\n", "\n", - "Instead of us having to manually try different hyperparameters by hand, `RandomizedSearchCV` tries a number of different combinations, evaluates them and saves the best.\n", - "\n", + "Instead of us having to manually try different hyperparameters by hand, `RandomizedSearchCV` tries a number of different combinations, evaluates them and saves the best." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### Tuning models with with [`RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)\n", "\n", "Reading the Scikit-Learn documentation for [`LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV), we find there's a number of different hyperparameters we can tune.\n", "\n", "The same for [`RandomForestClassifier`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html).\n", "\n", - "Let's create a hyperparameter grid (a dictionary of different hyperparameters) for each and then test them out." + "Let's create a hyperparameter grid (a dictionary of different hyperparameters) for each and then test them out.\n", + "\n", + "> **Note:** Be careful when creating a hyperparameter dictionary for tuning: if there are typos in the keys of the dictionary, your hyperparameter tuning code will produce errors." ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -3468,23 +3509,27 @@ "source": [ "Now let's use [`sklearn.model_selection.RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) to try and tune our `LogisticRegression` model.\n", "\n", - "We'll pass it the different hyperparameters from `log_reg_grid` as well as set `n_iter = 20`. This means, `RandomizedSearchCV` will try 20 different combinations of hyperparameters from `log_reg_grid` and save the best ones." + "We'll pass it the different hyperparameters from `log_reg_grid` as well as set `n_iter=20`. This means, `RandomizedSearchCV` will try 20 different combinations of hyperparameters from `log_reg_grid` and save the best ones."
] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n" + "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n", + "CPU times: user 160 ms, sys: 7.51 ms, total: 168 ms\n", + "Wall time: 193 ms\n" ] } ], "source": [ + "%%time\n", + "\n", "# Setup random seed\n", "np.random.seed(42)\n", "\n", @@ -3501,7 +3546,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -3510,7 +3555,7 @@ "{'solver': 'liblinear', 'C': np.float64(0.23357214690901212)}" ] }, - "execution_count": 47, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -3521,7 +3566,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -3530,7 +3575,7 @@ "0.8852459016393442" ] }, - "execution_count": 48, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -3543,23 +3588,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "Nice! That seems on par with the result we got before *without* any hyperparameter tuning.\n", + "\n", + "> **Note:** Many of the algorithms in Scikit-Learn have pretty good default hyperparameter values so don't be surprised if they perform pretty well on your data straight out of the box. But don't take this as being true all the time. Just because the default hyperparameters perform pretty well on your data doesn't mean there isn't a better set of hyperparameter values out there.\n", + "\n", "Now we've tuned `LogisticRegression` using `RandomizedSearchCV`, we'll do the same for `RandomForestClassifier`."
] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n" + "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n", + "CPU times: user 21.6 s, sys: 144 ms, total: 21.8 s\n", + "Wall time: 22.1 s\n" ] } ], "source": [ + "%%time \n", + "\n", "# Setup random seed\n", "np.random.seed(42)\n", "\n", @@ -3600,7 +3653,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -3609,7 +3662,7 @@ "0.8688524590163934" ] }, - "execution_count": 51, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -3627,33 +3680,43 @@ "\n", "This is akin to tuning the settings on your oven and getting it to cook your favourite dish just right.\n", "\n", - "But since `LogisticRegression` is pulling out in front, we'll try tuning it further with [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html).\n", - "\n", + "But since `LogisticRegression` is pulling out in front, we'll try tuning it further with [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### Tuning a model with [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)\n", "\n", - "The difference between `RandomizedSearchCV` and `GridSearchCV` is where `RandomizedSearchCV` searches over a grid of hyperparameters performing `n_iter` combinations, `GridSearchCV` will test every single possible combination.\n", + "The difference between `RandomizedSearchCV` and `GridSearchCV` is: \n", "\n", - "In short:\n", - "* [`sklearn.model_selection.RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) - tries `n_iter` combinations of hyperparameters and saves the best.\n", - "* [`sklearn.model_selection.GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) - tries every single combination of hyperparameters and saves the best.\n", + "* [`sklearn.model_selection.RandomizedSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) searches over a grid of hyperparameters performing `n_iter` combinations (e.g. will explore random combinations of the hyperparameters for a defined number of iterations).\n", + "* [`sklearn.model_selection.GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) will test every single possible combination of hyperparameters in the grid (this is a thorough test but can take quite a long time).\n", + "\n", + "Each class will save the best model at the end of testing.\n", "\n", "Let's see it in action." 
] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n" + "Fitting 5 folds for each of 20 candidates, totalling 100 fits\n", + "CPU times: user 161 ms, sys: 2.41 ms, total: 163 ms\n", + "Wall time: 212 ms\n" ] } ], "source": [ + "%%time\n", + "\n", "# Different LogisticRegression hyperparameters\n", "log_reg_grid = {\"C\": np.logspace(-4, 4, 20),\n", " \"solver\": [\"liblinear\"]}\n", @@ -3670,7 +3733,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -3679,7 +3742,7 @@ "{'C': np.float64(0.23357214690901212), 'solver': 'liblinear'}" ] }, - "execution_count": 56, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -3691,7 +3754,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -3700,7 +3763,7 @@ "0.8852459016393442" ] }, - "execution_count": 57, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -3729,24 +3792,34 @@ "Now we've got a tuned model, let's get some of the metrics we discussed before.\n", "\n", "We want:\n", - "* ROC curve and AUC score - [`sklearn.metrics.RocCurveDisplay()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html) \n", - " * **Note:** This was previously `sklearn.metrics.plot_roc_curve()`, as of Scikit-Learn version 1.2+, it is `sklearn.metrics.RocCurveDisplay()`.\n", - "* Confusion matrix - [`sklearn.metrics.confusion_matrix()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)\n", - "* Classification report - [`sklearn.metrics.classification_report()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)\n", - "* Precision - 
[`sklearn.metrics.precision_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html)\n", - "* Recall - [`sklearn.metrics.recall_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html)\n", - "* F1-score - [`sklearn.metrics.f1_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)\n", + "\n", + "| Metric/Evaluation Technique | Scikit-Learn method/documentation |\n", + "| ----- | ----- |\n", + "| ROC curve and AUC score | [`sklearn.metrics.RocCurveDisplay()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html), **Note:** This was previously `sklearn.metrics.plot_roc_curve()`, as of Scikit-Learn version 1.2+, it is `sklearn.metrics.RocCurveDisplay()`. |\n", + "| Confusion matrix | [`sklearn.metrics.confusion_matrix()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) |\n", + "| Classification report | [`sklearn.metrics.classification_report()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) |\n", + "| Precision | [`sklearn.metrics.precision_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html) |\n", + "| Recall | [`sklearn.metrics.recall_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html) |\n", + "| F1-score | [`sklearn.metrics.f1_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) | \n", "\n", "Luckily, Scikit-Learn has these all built-in.\n", "\n", - "To access them, we'll have to use our model to make predictions on the test set. 
You can make predictions by calling `predict()` on a trained model and passing it the data you'd like to predict on.\n", + "What many evaluation metrics have in common is that they compare model predictions to ground truth data.\n", + "\n", + "So we'll need some model predictions!\n", "\n", - "We'll make predictions on the test data." + "To access them, we'll have to use our model to make predictions on the test set. \n", + "\n", + "We can make predictions by calling `predict()` on a trained model and passing it the data you'd like to predict on.\n", + "\n", + "We'll make predictions on the test data.\n", + "\n", + "> **Note:** When making predictions with a trained model, the data you're trying to predict on must be in the same format your model was trained on. For example, if a model was trained with data formatted in a certain way, it's important to make future predictions on data formatted in that same way. " ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -3763,7 +3836,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -3774,7 +3847,7 @@ " 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])" ] }, - "execution_count": 62, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -3792,7 +3865,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -3803,7 +3876,7 @@ " 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])" ] }, - "execution_count": 63, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -3813,14 +3886,18 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Since we've got our prediction values we can find the metrics we want.\n", "\n", - "Let's start with the ROC curve and AUC scores.\n", - "\n", + "Let's start with the ROC curve and AUC scores." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "### ROC Curve and AUC Scores\n", "\n", "What's a ROC curve?\n", @@ -3831,7 +3908,7 @@ "\n", "> To get an appropriate example in a real-world problem, consider a diagnostic test that seeks to determine whether a person has a certain disease. A false positive in this case occurs when the person tests positive, but does not actually have the disease. A false negative, on the other hand, occurs when the person tests negative, suggesting they are healthy, when they actually do have the disease.\n", "\n", - "Scikit-Learn implements a function `RocCurveDisplay` (previously called `plot_roc_curve` in Scikit-Learn versions > 1.2) which can help us create a ROC curve as well as calculate the area under the curve (AUC) metric.\n", + "Scikit-Learn implements a function `RocCurveDisplay` (previously called `plot_roc_curve` in Scikit-Learn versions < 1.2) which can help us create a ROC curve as well as calculate the area under the curve (AUC) metric.\n", "\n", "Reading the documentation on the [`RocCurveDisplay`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html) function we can see it has a class method called [`from_estimator(estimator, X, y)`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.RocCurveDisplay.html#sklearn.metrics.RocCurveDisplay.from_estimator) as inputs. \n", "\n", @@ -3842,7 +3919,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -3874,20 +3951,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is great, our model does far better than guessing which would be a line going from the bottom left corner to the top right corner, AUC = 0.5. 
But a perfect model would achieve an AUC score of 1.0, so there's still room for improvement.\n", + "This is great, our model does far better than guessing which would be a line going from the bottom left corner to the top right corner, AUC = 0.5. \n", "\n", - "Let's move onto the next evaluation request, a confusion matrix.\n", + "But a perfect model would achieve an AUC score of 1.0, so there's still room for improvement.\n", "\n", - "### Confusion matrix \n", + "Let's move on to the next evaluation request, a confusion matrix." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating a confusion matrix \n", "\n", "A confusion matrix is a visual way to show where your model made the right predictions and where it made the wrong predictions (or in other words, got confused).\n", "\n", - "Scikit-Learn allows us to create a confusion matrix using [`confusion_matrix()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) and passing it the true labels and predicted labels." + "Scikit-Learn allows us to create a confusion matrix using [`sklearn.metrics.confusion_matrix()`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) and passing it the true labels and predicted labels." ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -3915,7 +3999,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -3954,7 +4038,11 @@ "source": [ "Beautiful! That looks much better. \n", "\n", - "You can see the model gets confused (predicts the wrong label) relatively the same across both classes. In essence, there are 4 occasaions where the model predicted 0 when it should've been 1 (false negative) and 3 occasions where the model predicted 1 instead of 0 (false positive)." 
+ "You can see the model gets confused (predicts the wrong label) relatively the same across both classes. \n", + "\n", + "In essence, there are 4 occasions where the model predicted 0 when it should've been 1 (false negative) and 3 occasions where the model predicted 1 instead of 0 (false positive).\n", + "\n", + "As further evaluation, we could look into these samples and see why this may be the case." ] }, { @@ -3963,6 +4051,8 @@ "source": [ "### Classification report\n", "\n", + "A classification report is a collection of different metrics and other details.\n", + "\n", "We can make a classification report using [`sklearn.metrics.classification_report(y_true, y_pred)`](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) and passing it the true labels as well as our models predicted labels. \n", "\n", "A classification report will also give us information on the precision and recall of our model for each class." @@ -4000,17 +4090,21 @@ "source": [ "What's going on here?\n", "\n", - "Let's get a refresh.\n", + "Let's refresh ourselves on each of the above metrics.\n", "\n", - "* **Precision** - Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0.\n", - "* **Recall** - Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0.\n", - "* **F1 score** - A combination of precision and recall. A perfect model achieves an F1 score of 1.0.\n", - "* **Support** - The number of samples each metric was calculated on.\n", - "* **Accuracy** - The accuracy of the model in decimal form. Perfect accuracy is equal to 1.0.\n", - "* **Macro avg** - Short for macro average, the average precision, recall and F1 score between classes. 
Macro avg doesn’t class imbalance into effort, so if you do have class imbalances, pay attention to this metric.\n", - "* **Weighted avg** - Short for weighted average, the weighted average precision, recall and F1 score between classes. Weighted means each metric is calculated with respect to how many samples there are in each class. This metric will favour the majority class (e.g. will give a high value when one class out performs another due to having more samples).\n", + "| **Metric/metadata** | **Explanation** | \n", + "| ----- | ----- | \n", + "| **Precision** | Indicates the proportion of positive identifications (model predicted class 1) which were actually correct. A model which produces no false positives has a precision of 1.0. |\n", + "| **Recall** | Indicates the proportion of actual positives which were correctly classified. A model which produces no false negatives has a recall of 1.0. |\n", + "| **F1 score** | A combination of precision and recall. A perfect model achieves an F1 score of 1.0. |\n", + "| **Support** | The number of samples each metric was calculated on. |\n", + "| **Accuracy** | The accuracy of the model in decimal form. Perfect accuracy is equal to 1.0. |\n", + "| **Macro avg** | Short for macro average, the average precision, recall and F1 score between classes. Macro avg doesn’t take class imbalance into account, so if you do have class imbalances, pay attention to this metric. |\n", + "| **Weighted avg** | Short for weighted average, the weighted average precision, recall and F1 score between classes. Weighted means each metric is calculated with respect to how many samples there are in each class. This metric will favour the majority class (e.g. will give a high value when one class outperforms another due to having more samples). |\n", "\n", - "Ok, now we've got a few deeper insights on our model. But these were all calculated using a single training and test set.\n", + "Ok, now we've got a few deeper insights on our model. 
\n", + "\n", + "But these were all calculated using a single training and test set.\n", "\n", "What we'll do to make them more solid is calculate them using cross-validation.\n", "\n", @@ -4018,14 +4112,16 @@ "\n", "We'll take the best model along with the best hyperparameters and use [`cross_val_score()`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html) along with various `scoring` parameter values.\n", "\n", - "`cross_val_score()` works by taking an estimator (machine learning model) along with data and labels. It then evaluates the machine learning model on the data and labels using cross-validation and a defined `scoring` parameter.\n", + "`cross_val_score()` works by taking an estimator (machine learning model) along with data and labels. \n", + "\n", + "It then evaluates the machine learning model on the data and labels using cross-validation across `cv=5` splits (the default number of splits) and a defined `scoring` parameter.\n", "\n", "Let's remind ourselves of the best hyperparameters and then see them in action." 
] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -4034,7 +4130,7 @@ "{'C': np.float64(0.23357214690901212), 'solver': 'liblinear'}" ] }, - "execution_count": 69, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -4046,7 +4142,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -4067,26 +4163,36 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 54, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 9.91 ms, sys: 1.35 ms, total: 11.3 ms\n", + "Wall time: 9.75 ms\n" + ] + }, { "data": { "text/plain": [ "array([0.81967213, 0.90163934, 0.8852459 , 0.88333333, 0.75 ])" ] }, - "execution_count": 71, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "%%time\n", + "\n", "# Cross-validated accuracy score\n", "cv_acc = cross_val_score(clf,\n", " X,\n", " y,\n", - " cv=5, # 5-fold cross-validation\n", + " cv=5, # 5-fold cross-validation, this is the default\n", " scoring=\"accuracy\") # accuracy as scoring\n", "cv_acc" ] @@ -4095,12 +4201,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "Woah!\n", + "\n", + "The output from `cross_val_score()` shows 5 different metrics across different splits of the data.\n", + "\n", + "This goes to show the power of cross-validation.\n", + "\n", + "If we had only chosen to go with the results of one data split, we might be thinking our model is underperforming or overperforming.\n", + "\n", "Since there are 5 metrics here, we'll take the average." 
] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 55, "metadata": {}, "outputs": [ { @@ -4109,7 +4223,7 @@ "np.float64(0.8479781420765027)" ] }, - "execution_count": 72, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -4128,7 +4242,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -4137,7 +4251,7 @@ "np.float64(0.8215873015873015)" ] }, - "execution_count": 73, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -4154,7 +4268,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -4163,7 +4277,7 @@ "np.float64(0.9272727272727274)" ] }, - "execution_count": 74, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -4180,7 +4294,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 58, "metadata": {}, "outputs": [ { @@ -4189,7 +4303,7 @@ "np.float64(0.8705403543192143)" ] }, - "execution_count": 75, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -4215,7 +4329,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -4247,17 +4361,24 @@ "\n", "What now?\n", "\n", - "The final thing to check off the list of our model evaluation techniques is feature importance.\n", - "\n", + "The final thing to check off the list of our model evaluation techniques is feature importance." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "## Feature importance\n", "\n", - "Feature importance is another way of asking, \"which features contributing most to the outcomes of the model?\"\n", + "Feature importance is another way of asking, \"Which features contribute most to the outcomes of the model?\"\n", "\n", - "Or for our problem, trying to predict heart disease using a patient's medical characterisitcs, which charateristics contribute most to a model predicting whether someone has heart disease or not?\n", + "For our problem, trying to predict heart disease using a patient's medical characteristics, getting the feature importance is like asking \"Which characteristics contribute most to a model predicting whether someone has heart disease or not?\"\n", "\n", - "Unlike some of the other functions we've seen, because how each model finds patterns in data is slightly different, how a model judges how important those patterns are is different as well. This means for each model, there's a slightly different way of finding which features were most important.\n", + "Because how each model finds patterns in data is slightly different, how a model judges how important those patterns are is different as well. 
\n", "\n", - "You can usually find an example via the Scikit-Learn documentation or via searching for something like \"[MODEL TYPE] feature importance\", such as, \"random forest feature importance\".\n", + "This means for each model, there's a slightly different way of finding which features were most important and in turn, the feature importance of one model won't necessarily reflect the feature importance of another.\n", + "\n", + "You can usually find an example via the Scikit-Learn documentation or via searching for something like \"MODEL TYPE feature importance\", such as, \"random forest feature importance\".\n", "\n", "Since we're using `LogisticRegression`, we'll look at one way we can calculate feature importance for it.\n", "\n", @@ -4268,7 +4389,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -4278,7 +4399,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -4289,7 +4410,7 @@ " 0.47095153, -0.65165346, -0.69984212]])" ] }, - "execution_count": 78, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -4310,7 +4431,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -4331,7 +4452,7 @@ " 'thal': np.float64(-0.6998421177365038)}" ] }, - "execution_count": 79, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -4351,7 +4472,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -4559,7 +4680,9 @@ "\n", "What can you do with this information?\n", "\n", - "This is something you might want to talk to a subject matter expert about. They may be interested in seeing where machine learning model is finding the most patterns (highest correlation) as well as where it's not (lowest correlation). 
\n", + "This is something you might want to talk to a subject matter expert about. \n", + "\n", + "They may be interested in seeing where the machine learning model is finding the most patterns (highest correlation) as well as where it's not (lowest correlation). \n", "\n", "Doing this has a few benefits:\n", "1. **Finding out more** - If some of the correlations and feature importances are confusing, a subject matter expert may be able to shed some light on the situation and help you figure out more.\n", @@ -4573,7 +4696,9 @@ "source": [ "## 6. Experimentation\n", "\n", - "Well we've completed all the metrics your boss requested. You should be able to put together a great report containing a confusion matrix, a handful of cross-valdated metrics such as precision, recall and F1 as well as which features contribute most to the model making a decision.\n", + "We've completed all the metrics your boss requested!\n", + "\n", + "You should be able to put together a great report containing a confusion matrix and a handful of cross-validated metrics such as precision, recall and F1-score, and you can even include which features contribute most to the model making a decision.\n", "\n", "But after all this you might be wondering where step 6 in the framework is, experimentation.\n", "\n", @@ -4589,10 +4714,17 @@ "\n", "Remember we defined one in step 3.\n", "\n", - "> If we can reach 95% accuracy at predicting whether or not a patient has heart disease during the proof of concept, we'll pursure this project.\n", + "> If we can reach 95% accuracy at predicting whether or not a patient has heart disease during the proof of concept, we'll pursue this project.\n", "\n", - "In this case, we didn't. The highest accuracy our model achieved was below 90%.\n", + "In this case, we didn't. \n", "\n", + "The highest accuracy our model achieved was below 90%." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "#### What next?\n", "\n", "You might be wondering, what happens when the evaluation metric doesn't get hit?\n", @@ -4601,22 +4733,26 @@ "\n", "No.\n", "\n", - "It means we know what doesn't work. In this case, we know the current model we're using (a tuned version of `LogisticRegression`) along with our specific data set doesn't hit the target we set ourselves.\n", + "It means we know what doesn't work. \n", + "\n", + "In this case, we know the current model we're using (a tuned version of [`sklearn.linear_model.LogisticRegression`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)) along with our specific data set doesn't hit the target we set ourselves.\n", "\n", "This is where step 6 comes into its own.\n", "\n", "A good next step would be to discuss with your team or research on your own different options of going forward.\n", "\n", - "* Could you collect more data?\n", - "\n", + "* Could you collect more data? Across more patients with more features? This may take a while but in machine learning, more data is generally better.\n", "* Could you try a better model? If you're working with structured data, you might want to look into [CatBoost](https://catboost.ai/) or [XGBoost](https://xgboost.ai/).\n", - "\n", "* Could you improve the current models (beyond what we've done so far)?\n", "* If your model is good enough, how would you export it and share it with others? (Hint: check out [Scikit-Learn's documentation on model persistance](https://scikit-learn.org/stable/modules/model_persistence.html))\n", "\n", - "The key here is to remember, your biggest restriction will be time. Hence, why it's paramount to minimise your times between experiments.\n", + "The key here is to remember, your biggest restriction will be time. 
\n", + "\n", + "Hence why it's paramount to minimise your time between experiments (if you can).\n", + "\n", + "The more things you try, the more you figure out what doesn't work, the more you'll start to get the hang of what does.\n", "\n", - "The more you try, the more you figure out what doesn't work, the more you'll start to get a hang of what does." + "And that's the whole nature of machine learning." ] } ],