diff --git a/hands-on-housing_exercise.ipynb b/hands-on-housing_exercise.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..716687b898234679e0d3d9b9414599df3dc92372 --- /dev/null +++ b/hands-on-housing_exercise.ipynb @@ -0,0 +1,2032 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hands-on housing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*This Notebook describes an example end-to-end Machine Learning project. The project is completely fictitious and should only serve as an illustrative example to present the main steps of a Machine Learning project. It is based on the Notebook [End-to-end Machine Learning project](https://github.com/ageron/handson-ml/blob/master/02_end_to_end_machine_learning_project.ipynb) by Aurélien Géron.*\n", + "\n", + "The following seven main steps can be used as a checklist for nearly any Machine Learning project:\n", + "1. Look at the big picture\n", + "2. Get the data\n", + "3. Discover and visualize the data to gain insights\n", + "4. Prepare the data for Machine Learning algorithms\n", + "5. Select and train a model\n", + "6. Fine-tune your model\n", + "7. Launch, monitor and maintain your system" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, set up this Notebook by importing all needed libraries and setting the default properties for all figure creation routines." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import time\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.image as mpimg\n", + "import shutil\n", + "import tarfile\n", + "import hashlib\n", + "from pandas.plotting import scatter_matrix\n", + "from scipy.stats import expon, reciprocal\n", + "from six.moves import urllib\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.pipeline import FeatureUnion\n", + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.preprocessing import Imputer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.model_selection import StratifiedShuffleSplit\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.model_selection import KFold\n", + "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.svm import SVR\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.externals import joblib\n", + "\n", + "\n", + "plt.rcParams['text.usetex'] = False\n", + "plt.rcParams['text.latex.unicode'] = False\n", + "\n", + "SMALL_SIZE = 12\n", + "MEDIUM_SIZE = 14\n", + "BIGGER_SIZE = 16\n", + "\n", + "plt.rc('font', size=SMALL_SIZE) # controls default text sizes\n", + "plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title\n", + "plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels\n", + "plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels\n", +
"plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels\n", + "plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize\n", + "plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# determine path to save figures\n", + "PROJECT_ROOT_DIR = '.'\n", + "NOTEBOOK_ID = 'hands-on-housing'\n", + "IMAGES_DIR = os.path.join(PROJECT_ROOT_DIR, 'images', NOTEBOOK_ID)\n", + "\n", + "def save_fig(fig_name, fig_extension='png', resolution=400):\n", + " if not os.path.exists(IMAGES_DIR):\n", + " os.mkdir(IMAGES_DIR)\n", + " path = os.path.join(IMAGES_DIR, fig_name + '.' + fig_extension)\n", + " print('Saving figure:', fig_name)\n", + " plt.savefig(path, format=fig_extension, dpi=resolution)\n", + " \n", + "\n", + "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n", + "DATASET_PATH = os.path.join(\"data\", \"datasets\", \"housing\", \"housing.tgz\")\n", + "DATASET_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n", + "PROJECT_ROOT_DIR = \".\"\n", + "\n", + "def fetch_dataset(src_url=DATASET_URL, dest_path=DATASET_PATH):\n", + " dest_dirname = os.path.dirname(dest_path)\n", + " if not os.path.isdir(dest_dirname):\n", + " os.makedirs(dest_dirname)\n", + " \n", + " urllib.request.urlretrieve(src_url, dest_path)\n", + "\n", + "def extract_dataset(dest_path=DATASET_PATH):\n", + " dest_dirname = os.path.dirname(dest_path)\n", + " if not os.path.isdir(dest_dirname):\n", + " os.makedirs(dest_dirname)\n", + " \n", + " dataset_tgz = tarfile.open(dest_path)\n", + " dataset_tgz.extractall(path=dest_dirname)\n", + " dataset_tgz.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Look at the big picture" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See PowerPoint slides." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "\n", + "## 2. Get the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1. Fetch the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Already done by the supervisor (see [data/datasets/housing/housing.csv](data/datasets/housing/housing.csv))." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2. Load the data using Pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pandas is a core Python package for data science. It offers powerful and flexible data structures that make it easy to manipulate and analyze data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_dataset(dataset_path):\n", + " return pd.read_csv(dataset_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset = load_dataset(\"data/datasets/housing/housing.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3. Take a quick look at the data structure" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out the first five entries of the `pandas.DataFrame` to get a first insight into its columns and values."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The shape of the dataset is defined by the number of instances (rows) and the number of attributes (columns)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `info()` method is a very useful function to get a quick description of the data.\n", + "It can be used e.g. to identify each attribute's datatype and the number of non-null values in each column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "housing_dataset.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All attributes are of the type `float64`, except the attribute *ocean_proximity*.\n", + "Its type is `object` and can thus be any kind of Python object.\n", + "Here, as already seen in the first five entries of the dataset, *ocean_proximity* is a text attribute.\n", + "More precisely, it is a so-called **categorical attribute**.\n", + "To get the different categories, one can use the `value_counts()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset[\"ocean_proximity\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `describe()` method of the `pandas.DataFrame` can be used to generate descriptive statistics that summarize the numerical attributes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "housing_dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another useful function to get a first insight into the data is the `hist()` method.\n", + "It calculates and visualizes the histograms for the different attributes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "housing_dataset.hist(bins=5, figsize=(20, 15))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 2.1: Determine a good value for bins. What do you observe in the histograms?***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Good value for bins: ...\n", + "\n", + "Observations:\n", + "- ...\n", + "- ...\n", + "- ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4. Create a test set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To prevent the algorithm and even you as the developer from overfitting to the seen data, one generally splits the data into a training and a test set.\n", + "Typically, 20% of the instances, picked at random, are used as the test set." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 2.2: Write a function which splits the dataset into random training and test subsets. The parameter test_ratio should represent the proportion of the dataset to include in the test split. 
Hint: You can use the [np.random.permutation](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.permutation.html) function.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# to make this notebook's output identical at every run\n", + "np.random.seed(41)\n", + "\n", + "def split_train_test(dataset, test_ratio):\n", + " # ...\n", + " \n", + " train_subset = dataset.iloc[0:15000] # To be modified\n", + " test_subset = dataset.iloc[15000:] # To be modified\n", + " \n", + " return train_subset, test_subset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set, test_set = split_train_test(housing_dataset, 0.2)\n", + "print(\"Splitted dataset into\", len(train_set), \"train and\", len(test_set), \"test instances.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you fetch an updated dataset, this solution will generate a test set which contains parts of the previous train set. Solution: Use a unique and immutable identifier for each instance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test_set_check(identifier, test_ratio, hash):\n", + " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n", + "\n", + "def split_train_test_by_id(dataset, test_ratio, id_column, hash=hashlib.md5):\n", + " ids = dataset[id_column]\n", + " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n", + " return dataset.loc[~in_test_set], dataset.loc[in_test_set]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset_with_id = housing_dataset.reset_index() # Adds an 'index' column\n", + "train_set, test_set = split_train_test_by_id(housing_dataset_with_id, 0.2, \"index\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset_with_id[\"id\"] = housing_dataset[\"longitude\"] * 1000 + housing_dataset[\"latitude\"]\n", + "train_set, test_set = split_train_test_by_id(housing_dataset_with_id, 0.2, \"id\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Scikit-Learn provides a few functions for splitting the dataset. The `train_test_split()` function does pretty much the same as the `split_train_test()` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set, test_set = train_test_split(housing_dataset, test_size=0.2, random_state=41)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Purely random sampling methods (see above) can be used if the dataset is large enough. If it is too small, a significant sampling bias may be introduced. Solution: Stratified Sampling (splitting into more representative subsets)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "housing_dataset[\"median_income\"].hist(bins=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 2.3: Create an income category (income_cat) attribute, which can be used to divide the dataset into homogeneous subgroups (strata). 
There should be fewer than 8 categories.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset[\"income_cat\"] = np.round(housing_dataset[\"median_income\"])\n", + "\n", + "#..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=41)\n", + "for train_index, test_index in split.split(housing_dataset, housing_dataset[\"income_cat\"]):\n", + " strat_train_set = housing_dataset.loc[train_index]\n", + " strat_test_set = housing_dataset.loc[test_index]\n", + "\n", + "strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "housing_dataset[\"income_cat\"].value_counts() / len(housing_dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now compare the *income_cat* proportions in:\n", + "- the purely randomly sampled test set,\n", + "- the stratified sampled test set, and\n", + "- the total dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def income_cat_proportions(dataset):\n", + " return dataset[\"income_cat\"].value_counts() / len(dataset)\n", + "\n", + "train_set, test_set = train_test_split(housing_dataset, test_size=0.2, random_state=41)\n", + "\n", + "compare_props = pd.DataFrame({\n", + " \"Overall\": income_cat_proportions(housing_dataset),\n", + " \"Stratified\": income_cat_proportions(strat_test_set),\n", + " \"Random\": income_cat_proportions(test_set),\n", + "}).sort_index()\n", + "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n", + "compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100\n", + "\n", + "compare_props" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for set_ in (strat_train_set, strat_test_set):\n", + " set_.drop(\"income_cat\", axis=1, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the following, the training and test sets are determined using the stratified sampling method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set = strat_train_set\n", + "test_set = strat_test_set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "\n", + "## 3. Discover and visualize the data to gain insights" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1. Visualize geographical data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The *longitude* and *latitude* attributes are geographical information.\n", + "It could therefore be a good idea to visualize the data based on these attributes."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4,\n", + " s=housing_dataset[\"population\"]/100, label=\"population\", figsize=(10,7),\n", + " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True)\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using an image of the California map as a background, we can further improve the readability of the figure." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "california_img=mpimg.imread('images/hands-on-housing/california.png')\n", + "ax = housing_dataset.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n", + " s=housing_dataset['population']/100, label=\"Population\",\n", + " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", + " colorbar=False, alpha=0.4,\n", + " )\n", + "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,\n", + " cmap=plt.get_cmap(\"jet\"))\n", + "plt.ylabel(\"Latitude\", fontsize=14)\n", + "plt.xlabel(\"Longitude\", fontsize=14)\n", + "\n", + "prices = housing_dataset[\"median_house_value\"]\n", + "tick_values = np.linspace(prices.min(), prices.max(), 11)\n", + "cbar = plt.colorbar()\n", + "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n", + "cbar.set_label('Median House Value', fontsize=16)\n", + "\n", + "plt.legend(fontsize=16)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 3.1: Play around with the visualization parameters. What relations can be seen in this figure?***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Observations:\n", + "- ...\n", + "- ...\n", + "- ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2. Look for correlations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A correlation coefficient between two attributes describes how the corresponding data depend linearly on each other.\n", + "Some exemplary correlation coefficients can be found in the following figure (source: [Wikipedia](https://en.wikipedia.org/wiki/Correlation_and_dependence)).\n", + "\n", + "<img src=\"images/hands-on-housing/correlations.png\"/>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's now take a look at our housing data again and calculate the correlations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\", alpha=0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 3.2: Search for correlations between the numerical attributes using the plot function.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.3. 
Experimenting with attribute combinations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Often, a combination of different attributes can lead to even better attributes.\n", + "For example, one could calculate the *population_per_household* by dividing the *population* by the *households*.\n", + "This new attribute could be a much better indicator for a high housing price while even reducing the amount of data by a factor of 2." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset[\"population_per_household\"] = housing_dataset[\"population\"] / housing_dataset[\"households\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "corr_matrix = housing_dataset.corr()\n", + "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset.plot(kind=\"scatter\", x=\"population_per_household\", y=\"median_house_value\", alpha=0.2)\n", + "plt.axis([0, 5, 0, 520000])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 3.3: Try out various attribute combinations. Use the `corr()` and `plot()` function to identify good combinations.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "housing_dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "\n", + "## 4. Prepare the data for Machine Learning algorithms" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are some preparation steps done in almost any Machine Learning project.\n", + "It is not recommended to do these preparation steps manually.\n", + "Instead one should use transformation functions which can be applied several times on e.g. different data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = train_set.drop(\"median_house_value\", axis=1) # Data/features (drop labels)\n", + "y_train = train_set[\"median_house_value\"].copy() # Labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.1. Clean the data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First of all, lets clean the data.\n", + "This means e.g. removing missing fields (i.e. `NaN`/null values) in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_incomplete_rows = X_train[X_train.isnull().any(axis=1)].head()\n", + "sample_incomplete_rows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 4.1: How to fix the missing values in the total_bedrooms column? Name at least two options and think about their advantages/disadvantages.***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Options:\n", + "- ...\n", + "- ...\n", + "- ..." 
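+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For reference, here is a minimal sketch of the usual pandas options (each one works on a copy, so `X_train` stays untouched; the `option1`-`option3` names are purely illustrative):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch only: three common ways to handle the missing total_bedrooms values\n", + "option1 = X_train.dropna(subset=[\"total_bedrooms\"]) # 1. drop the affected rows\n", + "option2 = X_train.drop(\"total_bedrooms\", axis=1) # 2. drop the whole attribute\n", + "median = X_train[\"total_bedrooms\"].median() # 3. fill with the training set median\n", + "option3 = X_train.copy()\n", + "option3[\"total_bedrooms\"] = option3[\"total_bedrooms\"].fillna(median)"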
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "imputer = Imputer(strategy=\"median\") # Imputer saves the filled in values\n", + "X_train_num = X_train.drop(\"ocean_proximity\", axis=1) # Median can only be calculated for numerical attributes\n", + "imputer.fit(X_train_num)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "imputer.statistics_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_num.median().values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_num.loc[sample_incomplete_rows.index.values]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = imputer.transform(X_train_num) # results a numpy array containing the transformed features\n", + "\n", + "# put it back into a pandas dataframe\n", + "X_train_num_tr = pd.DataFrame(X, columns=X_train_num.columns, index=list(X_train_num.index.values))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_num_tr.loc[sample_incomplete_rows.index.values]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "X_train_num_tr.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2. Handling text and categorical attributes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most Machine Learning algorithms can only be applied on numerical attributes.\n", + "Therefore the text and categorical attributes need to be transformed to a numerical representation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_cat = X_train[\"ocean_proximity\"]\n", + "X_train_cat.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_cat_encoded, X_train_categories = X_train_cat.factorize()\n", + "X_train_cat_encoded[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_categories" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Machine Learning algorithms will assume that two nearby values are more similar than two distant values. Solution: One-Hot Encoding." 
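+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a small aside (a sketch that is not used further below), pandas can also produce one-hot columns directly, which is handy for a quick look at the categories:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Aside: one-hot encoding with plain pandas, just for a quick inspection\n", + "pd.get_dummies(X_train[\"ocean_proximity\"]).head(10)"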
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "encoder = OneHotEncoder()\n", + "X_train_cat_1hot = encoder.fit_transform(X_train_cat_encoded.reshape(-1,1)) # Returns a sparse matrix\n", + "X_train_cat_1hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pd.DataFrame(X_train_cat_1hot.toarray(), columns=X_train_categories).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from src.CategoricalEncoder import CategoricalEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder = CategoricalEncoder()\n", + "X_train_cat_reshaped = X_train_cat.values.reshape(-1,1)\n", + "X_train_cat_1hot = cat_encoder.fit_transform(X_train_cat_reshaped)\n", + "X_train_cat_1hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder = CategoricalEncoder(encoding=\"onehot-dense\")\n", + "X_train_cat_1hot = cat_encoder.fit_transform(X_train_cat_reshaped)\n", + "pd.DataFrame(X_train_cat_1hot, columns=cat_encoder.categories_).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder.categories_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.3. Create custom transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Besides using the many useful scikit-learn transformers one can also write own transformers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# indices\n", + "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n", + "\n", + "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n", + " def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs\n", + " self.add_bedrooms_per_room = add_bedrooms_per_room\n", + " def fit(self, X, y=None):\n", + " return self # nothing else to do\n", + " def transform(self, X, y=None):\n", + " rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n", + " population_per_household = X[:, population_ix] / X[:, household_ix]\n", + " if self.add_bedrooms_per_room:\n", + " bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n", + " return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n", + " else:\n", + " return np.c_[X, rooms_per_household, population_per_household]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "add_bedrooms_per_room = False\n", + "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=add_bedrooms_per_room)\n", + "X_extra_attribs = attr_adder.transform(X_train.values)\n", + "\n", + "new_columns = list(X_train.columns)+['rooms_per_household', 'population_per_household']\n", + "if add_bedrooms_per_room:\n", + " new_columns = list(new_columns)+['bedrooms_per_room']\n", + "\n", + "# put it back into a pandas dataframe\n", + "X_train_extra_attribs = pd.DataFrame(X_extra_attribs, columns=new_columns, index=list(X_train.index.values))\n", + "\n", + "X_train_extra_attribs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "attributes = [\"median_house_value\", \"median_income\", 
\"total_rooms\", \"housing_median_age\", \"households\"]\n", + "scatter_matrix(train_set[attributes], figsize=(12,8), alpha=0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 4.2: Create a transformer, which removes outliers in the dataset.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RemoveOutliers(BaseEstimator, TransformerMixin):\n", + " def fit (self, X, y=None):\n", + " return self\n", + "\n", + " def transform(self, X, y=None):\n", + " X=X[(X['median_house_value']!=500001) | (X['median_income']>=2)].reset_index(drop=True)\n", + " # ...\n", + " \n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remOutliers = RemoveOutliers()\n", + "X_train_num_tmp = remOutliers.transform(train_set)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X_train_num_tmp.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\", \"households\"]\n", + "scatter_matrix(X_train_num_tmp[attributes], figsize=(12,8), alpha=0.1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.4. Feature scaling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One of the most important transformations is the so called **feature scaling**.\n", + "This is necessary as the most Machine Learning algorithms do not perform well on numerical **input** attributes that have very different scales.\n", + "The output attributes are generally not scaled.\n", + "Two common used scaling operations are:\n", + "- the standard scaling generating a distribution with zero mean and unit variance,\n", + "- the min-max scaling generating a distribution ranging from 0 to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "minmaxScaler = MinMaxScaler()\n", + "stdScaler = StandardScaler()\n", + "\n", + "X = imputer.transform(X_train_num) # fill in missing values\n", + "\n", + "X_standard = stdScaler.fit_transform(X) # standard scaling\n", + "X_minmax = minmaxScaler.fit_transform(X) # min-max scaling\n", + "\n", + "# put it back into a pandas dataframe\n", + "X_train_num_standard = pd.DataFrame(X_standard, columns=X_train_num.columns, index=list(X_train_num.index.values))\n", + "X_train_num_minmax = pd.DataFrame(X_minmax, columns=X_train_num.columns, index=list(X_train_num.index.values))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X_train_num_standard.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_num_minmax.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.5. Transformation pipelines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The several transformation steps can be summarized in a transformation pipeline where the transformation steps are just concatenated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "add_bedrooms_per_room = False\n", + "\n", + "num_pipeline = Pipeline([\n", + " ('imputer', Imputer(strategy='median')),\n", + " ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room)),\n", + " ('std_scaler', StandardScaler()),\n", + "])\n", + "\n", + "new_columns = list(X_train_num.columns)+['rooms_per_household', 'population_per_household']\n", + "if add_bedrooms_per_room:\n", + " new_columns = list(new_columns)+['bedrooms_per_room']\n", + "\n", + "X = num_pipeline.fit_transform(X_train_num) # returns a numpy.array\n", + "\n", + "# put it back into a pandas dataframe\n", + "X_train_num_tr = pd.DataFrame(X, columns=new_columns, index=list(X_train_num.index.values))\n", + "X_train_num_tr.shape\n", + "X_train_num_tr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", + " def __init__(self, attribute_names):\n", + " self.attribute_names = attribute_names\n", + " def fit(self, X, y=None):\n", + " return self\n", + " def transform(self, X):\n", + " return X[self.attribute_names].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_attribs = list(X_train_num)\n", + "cat_attribs = [\"ocean_proximity\"]\n", + "\n", + "num_pipeline = Pipeline([\n", + " ('selector', DataFrameSelector(num_attribs)),\n", + " ('imputer', Imputer(strategy=\"median\")),\n", + " ('attribs_adder', CombinedAttributesAdder()),\n", + " ('std_scaler', StandardScaler()),\n", + "])\n", + "\n", + "cat_pipeline = Pipeline([\n", + " ('selector', DataFrameSelector(cat_attribs)),\n", + " ('cat_encoder', CategoricalEncoder(encoding=\"onehot-dense\")),\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_pipeline = FeatureUnion(transformer_list=[\n", + " (\"num_pipeline\", num_pipeline),\n", + " (\"cat_pipeline\", cat_pipeline),\n", + "])\n", + "\n", + "X = full_pipeline.fit_transform(X_train)\n", + "\n", + "new_columns = list(X_train.columns)+['rooms_per_household', 'population_per_household']+list(cat_pipeline.named_steps['cat_encoder'].categories_[0])\n", + "\n", + "\n", + "# put it back into a pandas dataframe\n", + "X_train_tr = pd.DataFrame(X, columns=new_columns, index=list(X_train.index.values))\n", + "\n", + "# rename columns for tensorflow\n", + "rename_dict = [(' ', '_'), ('<', 'less')]\n", + "renamed_columns = list(X_train_tr.columns)\n", + "for old, new in rename_dict:\n", + " renamed_columns = [w.replace(old, new) for w in renamed_columns]\n", + "\n", + "X_train_tr.columns = renamed_columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "X_train_tr.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "\n", + "## 5. Select and train a model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After exploring and preparing the data, one can now start to select a model and train it.\n", + "This training is typically only done using the training set.\n", + "Afterwards the test set is used to validate the trained model.\n", + "In this chapter several different Machine Learning algorithms are trained and evaluated." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.1. Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.1.1. LinearRegression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lin_reg = LinearRegression()\n", + "lin_reg.fit(X_train_tr, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.1.2. DecisionTreeRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "tree_reg = DecisionTreeRegressor()\n", + "tree_reg.fit(X_train_tr, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.1.3. DNNRegressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Construction phase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf.logging.set_verbosity(tf.logging.INFO)\n", + "\n", + "def training_input_fn(x=X_train_tr, y=y_train, batch_size=1):\n", + " return tf.estimator.inputs.pandas_input_fn(\n", + " x=x,\n", + " y=y,\n", + " batch_size=batch_size,\n", + " num_epochs=None,\n", + " shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dnn_reg = tf.estimator.DNNRegressor(\n", + " hidden_units=[1000, 1000],\n", + " model_dir=\"models/dnn_reg_model\",\n", + " activation_fn=tf.nn.relu,\n", + " feature_columns=[tf.feature_column.numeric_column(x, shape=(1,)) for x in X_train_tr.columns])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Execution phase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dnn_reg.train(input_fn=training_input_fn(batch_size=50), steps=5000)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2. Evaluate on the training set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For simplicity, here we just use the training set to evaluate our models which is generally **not** recommended." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's try the full pipeline on a few training instances\n", + "some_data = X_train.iloc[:8]\n", + "some_labels = y_train.iloc[:8]\n", + "X = full_pipeline.transform(some_data)\n", + "\n", + "# put it back into a pandas dataframe\n", + "some_data_tr = pd.DataFrame(X, columns=renamed_columns, index=list(some_data.index.values))\n", + "some_data_tr.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.2.1. LinearRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Predictions:\", lin_reg.predict(some_data_tr))\n", + "print(\"Labels:\", list(some_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lin_reg_predictions = lin_reg.predict(X_train_tr)\n", + "lin_mse = mean_squared_error(y_train, lin_reg_predictions)\n", + "lin_rmse = np.sqrt(lin_mse)\n", + "lin_rmse" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.2.2. 
DecisionTreeRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Predictions:\", tree_reg.predict(some_data_tr))\n", + "print(\"Labels:\", list(some_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tree_reg_predictions = tree_reg.predict(X_train_tr)\n", + "tree_mse = mean_squared_error(y_train, tree_reg_predictions)\n", + "tree_rmse = np.sqrt(tree_mse)\n", + "tree_rmse" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.2.3. DNNRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test_input_fn(x=X_train_tr, y=y_train):\n", + " return tf.estimator.inputs.pandas_input_fn(\n", + " x=x,\n", + " y=y,\n", + " num_epochs=1,\n", + " shuffle=False)\n", + "\n", + "dnn_reg_predictions_gen_expr = dnn_reg.predict(input_fn=test_input_fn(x=some_data_tr, y=some_labels))\n", + "dnn_reg_predictions = [x['predictions'][0] for x in dnn_reg_predictions_gen_expr]\n", + "\n", + "print(\"\\nPredictions:\", list(dnn_reg_predictions))\n", + "print(\"Labels:\", list(some_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "dnn_reg_predictions_gen_expr = dnn_reg.predict(input_fn=test_input_fn())\n", + "dnn_reg_predictions = [x['predictions'][0] for x in dnn_reg_predictions_gen_expr]\n", + "\n", + "dnn_mse = mean_squared_error(y_train, dnn_reg_predictions)\n", + "dnn_rmse = np.sqrt(dnn_mse)\n", + "dnn_rmse" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.3. Better evaluation using cross-validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One way to evaluate the models would be to split the training set into a smaller training set and a validation set.\n", + "This validation set could then be used for evaluation during the training procedure.\n", + "Another way is to use the scikit-learn cross-validation function which randomly splits the training set into 10 distinct subsets, then it trains and evaluates the model 10 times, picking a different subset for evaluation every time and training on the other 9 subsets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def display_scores(scores):\n", + " print(\"Scores:\", scores)\n", + " print(\"Mean:\", scores.mean())\n", + " print(\"Standard deviation:\", scores.std())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.3.1. LinearRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lin_scores = cross_val_score(lin_reg, X_train_tr, y_train, scoring=\"neg_mean_squared_error\", cv=10)\n", + "lin_rmse_scores = np.sqrt(-lin_scores)\n", + "display_scores(lin_rmse_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.3.2. DecisionTreeRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tree_scores = cross_val_score(tree_reg, X_train_tr, y_train, scoring=\"neg_mean_squared_error\", cv=10)\n", + "tree_rmse_scores = np.sqrt(-tree_scores)\n", + "display_scores(tree_rmse_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.3.3. 
DNNRegressor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def cross_val_score_dnn_regressor(dnn_reg_estim, X, y, batch_size=50, steps=5000, cv=5):\n", + " scores = list()\n", + " k_fold = KFold(n_splits=cv)\n", + " for train_indices, validation_indices in k_fold.split(X): \n", + " # clear model dir\n", + " shutil.rmtree(dnn_reg_estim.model_dir, ignore_errors=True)\n", + "\n", + " print('\\nTrain: %s | Validation: %s' % (train_indices, validation_indices))\n", + "\n", + " # train\n", + " dnn_reg_estim.train(input_fn=training_input_fn(x=X.iloc[train_indices], y=y.iloc[train_indices], batch_size=batch_size), steps=steps)\n", + "\n", + " # predict\n", + " dnn_reg_predictions_gen_expr = dnn_reg_estim.predict(input_fn=test_input_fn(x=X.iloc[validation_indices], y=y.iloc[validation_indices]))\n", + " dnn_reg_predictions = [x['predictions'][0] for x in dnn_reg_predictions_gen_expr]\n", + "\n", + " # save score (use the passed-in y, not a global variable)\n", + " mse = mean_squared_error(y.iloc[validation_indices], dnn_reg_predictions)\n", + " scores.append(mse)\n", + " \n", + " return scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dnn_scores = cross_val_score_dnn_regressor(dnn_reg, X_train_tr, y_train, 50, 3000, cv=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dnn_rmse_scores = np.sqrt(np.asarray(dnn_scores))\n", + "display_scores(dnn_rmse_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 5.1: Add more promising models (at least 5 different models in total). Hint: Take a look at the [Supervised Learning Models](http://scikit-learn.org/stable/supervised_learning.html) of Scikit-Learn.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "\n", + "## 6. Fine-tune your model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After specifying a list of promising models, one can now start to fine-tune them.\n", + "This means that we try to find good hyperparameter values.\n", + "Doing this manually would cost a lot of time and would be very tedious work.\n", + "Thus it is better to do this hyperparameter search using e.g. the so-called grid search or randomized search." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.1. 
Grid search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "param_grid = [\n", + " # try 12 (3×4) combinations of hyperparameters\n", + " {'max_depth': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n", + " # then try 6 (2×3) combinations with presort set as True\n", + " {'presort': [True], 'max_depth': [3, 10], 'max_features': [2, 3, 4]},\n", + "]\n", + "\n", + "tree_reg = DecisionTreeRegressor(random_state=42)\n", + "\n", + "# Train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n", + "grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)\n", + "grid_search.fit(X_train_tr, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grid_search.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "cvres = grid_search.cv_results_\n", + "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n", + " print(np.sqrt(-mean_score), params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.DataFrame(grid_search.cv_results_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.2. Randomized search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 6.1: Add [RandomizedSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) by replacing the GridSearchCV.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.3. Analyze the best models and their errors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will often gain good insights on the problem by inspecting the best models.\n", + "For example, one could take a look at the importances of the features for the best estimator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "feature_importances = grid_search.best_estimator_.feature_importances_\n", + "feature_importances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n", + "cat_encoder = cat_pipeline.named_steps[\"cat_encoder\"]\n", + "cat_one_hot_attribs = list(cat_encoder.categories_[0])\n", + "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n", + "sorted(zip(feature_importances, attributes), reverse=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.4. 
Evaluate your system on the test set" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As already mentioned, up to this point we should only consider the training set and not look at the test set.\n", + "Now that we have trained and fine-tuned our model, we can evaluate the final model on the test set.\n", + "The performance achieved on the test set indicates how well our model generalizes to new (unseen) data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_model = grid_search.best_estimator_\n", + "\n", + "X_test = test_set.drop(\"median_house_value\", axis=1)\n", + "y_test = test_set[\"median_house_value\"].copy()\n", + "\n", + "X_test_tr = full_pipeline.transform(X_test)\n", + "final_predictions = final_model.predict(X_test_tr)\n", + "\n", + "final_mse = mean_squared_error(y_test, final_predictions)\n", + "final_rmse = np.sqrt(final_mse)\n", + "final_rmse" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# evaluation on train set\n", + "final_scores = cross_val_score(final_model, X_train_tr, y_train, scoring=\"neg_mean_squared_error\", cv=10)\n", + "final_rmse_scores = np.sqrt(-final_scores)\n", + "display_scores(final_rmse_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Exercise 6.2: Try to find the best model-parameter combination.***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After finding a good model, one can easily save it to disk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_reg = final_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save it\n", + "joblib.dump(best_reg, \"models/my_best_model.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load it\n", + "best_reg = joblib.load(\"models/my_best_model.pkl\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "<br>\n", + "\n", + "## 7. Launch, monitor and maintain your system" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See PowerPoint slides." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}