diff --git a/hands-on-housing_exercise.ipynb b/hands-on-housing_exercise.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..716687b898234679e0d3d9b9414599df3dc92372
--- /dev/null
+++ b/hands-on-housing_exercise.ipynb
@@ -0,0 +1,2032 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Hands-on housing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "*This Notebook describes an example end-to-end Machine Learning project. The project is completely fictitious and should only serve as an illustrative example to present the main steps of a Machine Learning project. It is based on the Notebook [End-to-end Machine Learning project](https://github.com/ageron/handson-ml/blob/master/02_end_to_end_machine_learning_project.ipynb) by Aurélien Géron.*\n",
+    "\n",
+    "The following seven main steps can be used as a checklist for nearly any Machine Learning project:\n",
+    "1. Look at the big picture\n",
+    "2. Get the data\n",
+    "3. Discover and visualize the data to gain insights\n",
+    "4. Prepare the data for Machine Learning algorithms\n",
+    "5. Select and train a model\n",
+    "6. Fine-tune your model\n",
+    "7. Launch, monitor and maintain your system"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, set up this Notebook by importing all required libraries and setting the default properties for all figure-creation routines."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "\n",
+    "import os\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import tensorflow as tf\n",
+    "import time\n",
+    "import matplotlib\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.image as mpimg\n",
+    "import shutil\n",
+    "import tarfile\n",
+    "import hashlib\n",
+    "from pandas.plotting import scatter_matrix\n",
+    "from scipy.stats import expon, reciprocal\n",
+    "from six.moves import urllib\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.pipeline import FeatureUnion\n",
+    "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.preprocessing import Imputer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.model_selection import StratifiedShuffleSplit\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "from sklearn.model_selection import KFold\n",
+    "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "from sklearn.tree import DecisionTreeRegressor\n",
+    "from sklearn.svm import SVR\n",
+    "from sklearn.base import BaseEstimator, TransformerMixin\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.externals import joblib\n",
+    "\n",
+    "\n",
+    "plt.rcParams['text.usetex'] = False\n",
+    "plt.rcParams['text.latex.unicode'] = False\n",
+    "\n",
+    "SMALL_SIZE = 12\n",
+    "MEDIUM_SIZE = 14\n",
+    "BIGGER_SIZE = 16\n",
+    "\n",
+    "plt.rc('font', size=SMALL_SIZE)          # controls default text sizes\n",
+    "plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title\n",
+    "plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels\n",
+    "plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels\n",
+    "plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels\n",
+    "plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize\n",
+    "plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# determine path to save figures\n",
+    "PROJECT_ROOT_DIR = '.'\n",
+    "NOTEBOOK_ID = 'hands-on-housing'\n",
+    "IMAGES_DIR = os.path.join(PROJECT_ROOT_DIR, 'images', NOTEBOOK_ID)\n",
+    "\n",
+    "def save_fig(fig_name, fig_extension='png', resolution=400):\n",
+    "    if not os.path.exists(IMAGES_DIR):\n",
+    "        os.mkdir(IMAGES_DIR)\n",
+    "    path = os.path.join(IMAGES_DIR, fig_name + '.' + fig_extension)\n",
+    "    print('Saving figure:', fig_name)\n",
+    "    plt.savefig(path, format=fig_extension, dpi=resolution)\n",
+    "    \n",
+    "\n",
+    "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n",
+    "DATASET_PATH = os.path.join(\"data\", \"datasets\", \"housing\", \"housing.tgz\")\n",
+    "DATASET_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n",
+    "PROJECT_ROOT_DIR = \".\"\n",
+    "\n",
+    "def fetch_dataset(src_url=DATASET_URL, dest_path=DATASET_PATH):\n",
+    "    dest_dirname = os.path.dirname(dest_path)\n",
+    "    if not os.path.isdir(dest_dirname):\n",
+    "        os.makedirs(dest_dirname)\n",
+    "    \n",
+    "    urllib.request.urlretrieve(src_url, dest_path)\n",
+    "\n",
+    "def extract_dataset(dest_path=DATASET_PATH):\n",
+    "    dest_dirname = os.path.dirname(dest_path)\n",
+    "    if not os.path.isdir(dest_dirname):\n",
+    "        os.makedirs(dest_dirname)\n",
+    "    \n",
+    "    dataset_tgz = tarfile.open(dest_path)\n",
+    "    dataset_tgz.extractall(path=dest_dirname)\n",
+    "    dataset_tgz.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Look at the big picture"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "See powerpoint slides."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "## 2. Get the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.1. Fetch the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Already done by the supervisor (See: [data/datasets/housing/housing.csv](data/datasets/housing/housing.csv))."
+   ]
+  },
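+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If the file were not already present, it could be fetched and extracted with the helper functions defined in the setup cell. The following cell is only a minimal sketch of that (it assumes the download URL defined above is still reachable) and does nothing when the CSV already exists."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: download and unpack the dataset only if the CSV is not already there\n",
+    "csv_path = os.path.join(\"data\", \"datasets\", \"housing\", \"housing.csv\")\n",
+    "if not os.path.exists(csv_path):\n",
+    "    fetch_dataset()    # downloads housing.tgz to DATASET_PATH\n",
+    "    extract_dataset()  # extracts housing.csv next to it"
+   ]
+  },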
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.2. Load the data using Pandas"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Pandas is a core Python package for data science. It offers powerful and flexible data structures that make it easy to manipulate and analyze data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_dataset(dataset_path):\n",
+    "    return pd.read_csv(dataset_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset = load_dataset(\"data/datasets/housing/housing.csv\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3. Take a quick look at the data structure"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print out the first five entries of the `pandas.DataFrame` to get a first impression of its columns and values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset.head(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The shape of the dataset is defined by the number of instances (rows) and the number of attributes (columns)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `info()` method is a very useful function to get a quick description of the data.\n",
+    "It can be used, for example, to identify each attribute's data type and the number of non-null values in each column."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "housing_dataset.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All attributes are of the type `float64`, except the attribute *ocean_proximity*.\n",
+    "Its type is `object` and can thus be any kind of Python object.\n",
+    "Here, as already seen in the first five entries of the dataset, *ocean_proximity* is a text attribute.\n",
+    "More precisely, it is a so-called **categorical attribute**.\n",
+    "To get the different categories, one can use the `value_counts()` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset[\"ocean_proximity\"].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `describe()` method of the `pandas.DataFrame` can be used to generate descriptive statistics that summarize the numerical attributes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "housing_dataset.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Another useful method to get a first insight into the data is `hist()`.\n",
+    "It calculates and visualizes a histogram for each numerical attribute."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "housing_dataset.hist(bins=5, figsize=(20, 15))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 2.1: Determine a good value for bins. What do you observe in the histograms?***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Good value for bins: ...\n",
+    "\n",
+    "Observations:\n",
+    "- ...\n",
+    "- ...\n",
+    "- ..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.4. Create a test set"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To prevent the algorithm (and even you as the developer) from overfitting to the data it has seen, one generally splits the data into a training set and a test set.\n",
+    "Typically, 20% of the instances of the total dataset, picked at random, are used as the test set."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 2.2: Write a function, which splits the dataset into random training and test subsets. The parameter test_ratio should represent the proportion of the dataset to include in the test split. Hint: You can use the [np.random.permutation](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.permutation.html) function.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# to make this notebook's output identical at every run\n",
+    "np.random.seed(41)\n",
+    "\n",
+    "def split_train_test(dataset, test_ratio):\n",
+    "    # ...\n",
+    "    \n",
+    "    train_subset = dataset.iloc[0:15000] # To be modified\n",
+    "    test_subset = dataset.iloc[15000:]   # To be modified\n",
+    "    \n",
+    "    return train_subset, test_subset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_set, test_set = split_train_test(housing_dataset, 0.2)\n",
+    "print(\"Split dataset into\", len(train_set), \"train and\", len(test_set), \"test instances.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you fetch an updated dataset, this solution will generate a test set that contains parts of the previous training set. Solution: use a unique and immutable identifier for each instance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def test_set_check(identifier, test_ratio, hash):\n",
+    "    # an instance goes into the test set if the last byte of its hashed identifier is below test_ratio * 256\n",
+    "    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio\n",
+    "\n",
+    "def split_train_test_by_id(dataset, test_ratio, id_column, hash=hashlib.md5):\n",
+    "    ids = dataset[id_column]\n",
+    "    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))\n",
+    "    return dataset.loc[~in_test_set], dataset.loc[in_test_set]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset_with_id = housing_dataset.reset_index()  # Adds an 'index' column\n",
+    "train_set, test_set = split_train_test_by_id(housing_dataset_with_id, 0.2, \"index\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset_with_id[\"id\"] = housing_dataset[\"longitude\"] * 1000 + housing_dataset[\"latitude\"]\n",
+    "train_set, test_set = split_train_test_by_id(housing_dataset_with_id, 0.2, \"id\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Scikit-Learn provides a few functions for splitting the dataset. The `train_test_split()` function does pretty much the same as the `split_train_test()` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_set, test_set = train_test_split(housing_dataset, test_size=0.2, random_state=41)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Purely random sampling methods (see above) can be used if the dataset is large enough. If it is too small, a significant sampling bias may be introduced. Solution: Stratified Sampling (splitting into more representative subsets)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "housing_dataset[\"median_income\"].hist(bins=50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 2.3: Create an income category (income_cat) attribute, which can be used to divide the dataset into homogeneous subgroups (strata). There should be less than 8 categories.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset[\"income_cat\"] = np.round(housing_dataset[\"median_income\"])\n",
+    "\n",
+    "#..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=41)\n",
+    "for train_index, test_index in split.split(housing_dataset, housing_dataset[\"income_cat\"]):\n",
+    "    strat_train_set = housing_dataset.loc[train_index]\n",
+    "    strat_test_set = housing_dataset.loc[test_index]\n",
+    "\n",
+    "strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "housing_dataset[\"income_cat\"].value_counts() / len(housing_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's now compare the *income_cat* proportions in:\n",
+    "- the purely random sampled test set,\n",
+    "- the stratified sampled test set and\n",
+    "- the total dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def income_cat_proportions(dataset):\n",
+    "    return dataset[\"income_cat\"].value_counts() / len(dataset)\n",
+    "\n",
+    "train_set, test_set = train_test_split(housing_dataset, test_size=0.2, random_state=41)\n",
+    "\n",
+    "compare_props = pd.DataFrame({\n",
+    "    \"Overall\": income_cat_proportions(housing_dataset),\n",
+    "    \"Stratified\": income_cat_proportions(strat_test_set),\n",
+    "    \"Random\": income_cat_proportions(test_set),\n",
+    "}).sort_index()\n",
+    "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n",
+    "compare_props[\"Strat. %error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100\n",
+    "\n",
+    "compare_props"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for set_ in (strat_train_set, strat_test_set):\n",
+    "    set_.drop(\"income_cat\", axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the following, the training and test sets obtained with stratified sampling are used."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_set = strat_train_set\n",
+    "test_set = strat_test_set"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "## 3. Discover and visualize the data to gain insights"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.1. Visualize geographical data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The *longitude* and *latitude* attributes are geographical information.\n",
+    "It could therefore be a good idea to visualize the data based on these attributes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4,\n",
+    "                     s=housing_dataset[\"population\"]/100, label=\"population\", figsize=(10,7),\n",
+    "                     c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True)\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
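+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `save_fig()` helper defined in the setup cell can be used to store any of these figures under `images/hands-on-housing/`. A minimal sketch (the figure name is arbitrary); note that it has to be called before `plt.show()`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: save the current figure with the helper from the setup cell\n",
+    "housing_dataset.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4)\n",
+    "save_fig(\"geographical_scatterplot\")  # arbitrary file name\n",
+    "plt.show()"
+   ]
+  },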
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Using an image of the California map as a background, we can further improve the readability of the figure."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "california_img=mpimg.imread('images/hands-on-housing/california.png')\n",
+    "ax = housing_dataset.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n",
+    "                          s=housing_dataset['population']/100, label=\"Population\",\n",
+    "                          c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n",
+    "                          colorbar=False, alpha=0.4,\n",
+    "                      )\n",
+    "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,\n",
+    "           cmap=plt.get_cmap(\"jet\"))\n",
+    "plt.ylabel(\"Latitude\", fontsize=14)\n",
+    "plt.xlabel(\"Longitude\", fontsize=14)\n",
+    "\n",
+    "prices = housing_dataset[\"median_house_value\"]\n",
+    "tick_values = np.linspace(prices.min(), prices.max(), 11)\n",
+    "cbar = plt.colorbar()\n",
+    "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n",
+    "cbar.set_label('Median House Value', fontsize=16)\n",
+    "\n",
+    "plt.legend(fontsize=16)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 3.1: Play around with the visualization parameters. What relations can be seen in this figure?***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Observations:\n",
+    "- ...\n",
+    "- ...\n",
+    "- ..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.2. Look for correlations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The correlation coefficient between two attributes describes how strongly they depend linearly on each other.\n",
+    "Some exemplary correlation coefficients can be found in the following figure (source: [wikipedia](https://en.wikipedia.org/wiki/Correlation_and_dependence)).\n",
+    "\n",
+    "<img src=\"images/hands-on-housing/correlations.png\"/>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's now take a look at our housing data again and calculate the correlations."
+   ]
+  },
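+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch: the correlation of every numerical attribute with the target can be computed directly with the `corr()` method of the `pandas.DataFrame` (the same method is used again in section 3.3):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pearson correlation coefficient of each numerical attribute with the median house value\n",
+    "corr_matrix = housing_dataset.corr()\n",
+    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
+   ]
+  },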
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\", alpha=0.1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 3.2: Search for correlations between the numerical attributes using the plot function.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.3. Experimenting with attribute combinations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Often, a combination of different attributes can lead to even better attributes.\n",
+    "For example, one could calculate the *population_per_household* by dividing the *population* by the *households*.\n",
+    "This new attribute could be a much better indicator of a high housing price, while even reducing the amount of data (two attributes are replaced by one)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset[\"population_per_household\"] = housing_dataset[\"population\"] / housing_dataset[\"households\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "corr_matrix = housing_dataset.corr()\n",
+    "corr_matrix[\"median_house_value\"].sort_values(ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset.plot(kind=\"scatter\", x=\"population_per_household\", y=\"median_house_value\", alpha=0.2)\n",
+    "plt.axis([0, 5, 0, 520000])\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 3.3: Try out various attribute combinations. Use the `corr()` and `plot()` function to identify good combinations.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_dataset.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "## 4. Prepare the data for Machine Learning algorithms"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There are some preparation steps done in almost any Machine Learning project.\n",
+    "It is not recommended to perform these preparation steps manually.\n",
+    "Instead, one should implement them as transformation functions that can be applied repeatedly, e.g. to new data (see the short sketch below)."
+   ]
+  },
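+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A key property of such transformation functions (Scikit-Learn *transformers*) is that their parameters are learned with `fit()` on the training data only and can then be applied with `transform()` to any data. A minimal sketch using the `MinMaxScaler` imported above (the column choice is arbitrary):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: fit a transformer on the training data, then reuse it on any data\n",
+    "scaler = MinMaxScaler()\n",
+    "scaler.fit(train_set[[\"median_income\"]])                        # learn min/max from the training set only\n",
+    "scaled_income = scaler.transform(train_set[[\"median_income\"]])  # the same call works for new data later\n",
+    "scaled_income[:5]"
+   ]
+  },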
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train = train_set.drop(\"median_house_value\", axis=1) # Data/features (drop labels)\n",
+    "y_train = train_set[\"median_house_value\"].copy()       # Labels"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.1. Clean the data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First of all, let's clean the data.\n",
+    "This means, e.g., handling missing fields (i.e. `NaN`/null values) in the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_set.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_incomplete_rows = X_train[X_train.isnull().any(axis=1)].head()\n",
+    "sample_incomplete_rows"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 4.1: How can the missing values in the total_bedrooms column be fixed? Name at least two options and think about their advantages/disadvantages.***"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Options:\n",
+    "- ...\n",
+    "- ...\n",
+    "- ..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imputer = Imputer(strategy=\"median\")                  # the Imputer computes and stores the fill values (medians)\n",
+    "X_train_num = X_train.drop(\"ocean_proximity\", axis=1) # Median can only be calculated for numerical attributes\n",
+    "imputer.fit(X_train_num)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imputer.statistics_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_num.median().values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_num.loc[sample_incomplete_rows.index.values]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = imputer.transform(X_train_num) # returns a numpy array containing the transformed features\n",
+    "\n",
+    "# put it back into a pandas dataframe\n",
+    "X_train_num_tr = pd.DataFrame(X, columns=X_train_num.columns, index=list(X_train_num.index.values))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_num_tr.loc[sample_incomplete_rows.index.values]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "X_train_num_tr.info()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.2. Handling text and categorical attributes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Most Machine Learning algorithms can only be applied to numerical attributes.\n",
+    "Therefore, text and categorical attributes need to be transformed into a numerical representation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_cat = X_train[\"ocean_proximity\"]\n",
+    "X_train_cat.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_cat_encoded, X_train_categories = X_train_cat.factorize()\n",
+    "X_train_cat_encoded[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_categories"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With such an integer encoding, Machine Learning algorithms will assume that two nearby values are more similar than two distant values. Solution: one-hot encoding."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encoder = OneHotEncoder()\n",
+    "X_train_cat_1hot = encoder.fit_transform(X_train_cat_encoded.reshape(-1,1)) # Returns a sparse matrix\n",
+    "X_train_cat_1hot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(X_train_cat_1hot.toarray(), columns=X_train_categories).head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.CategoricalEncoder import CategoricalEncoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cat_encoder = CategoricalEncoder()\n",
+    "X_train_cat_reshaped = X_train_cat.values.reshape(-1,1)\n",
+    "X_train_cat_1hot = cat_encoder.fit_transform(X_train_cat_reshaped)\n",
+    "X_train_cat_1hot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cat_encoder = CategoricalEncoder(encoding=\"onehot-dense\")\n",
+    "X_train_cat_1hot = cat_encoder.fit_transform(X_train_cat_reshaped)\n",
+    "pd.DataFrame(X_train_cat_1hot, columns=cat_encoder.categories_).head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cat_encoder.categories_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.3. Create custom transformers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Besides using the many useful Scikit-Learn transformers, one can also write custom transformers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# column indices of total_rooms, total_bedrooms, population and households\n",
+    "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n",
+    "\n",
+    "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n",
+    "    def __init__(self, add_bedrooms_per_room = True):  # no *args or **kwargs\n",
+    "        self.add_bedrooms_per_room = add_bedrooms_per_room\n",
+    "    def fit(self, X, y=None):\n",
+    "        return self  # nothing else to do\n",
+    "    def transform(self, X, y=None):\n",
+    "        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n",
+    "        population_per_household = X[:, population_ix] / X[:, household_ix]\n",
+    "        if self.add_bedrooms_per_room:\n",
+    "            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n",
+    "            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]\n",
+    "        else:\n",
+    "            return np.c_[X, rooms_per_household, population_per_household]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "add_bedrooms_per_room = False\n",
+    "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=add_bedrooms_per_room)\n",
+    "X_extra_attribs = attr_adder.transform(X_train.values)\n",
+    "\n",
+    "new_columns = list(X_train.columns)+['rooms_per_household', 'population_per_household']\n",
+    "if add_bedrooms_per_room:\n",
+    "    new_columns = list(new_columns)+['bedrooms_per_room']\n",
+    "\n",
+    "# put it back into a pandas dataframe\n",
+    "X_train_extra_attribs = pd.DataFrame(X_extra_attribs, columns=new_columns, index=list(X_train.index.values))\n",
+    "\n",
+    "X_train_extra_attribs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\", \"households\"]\n",
+    "scatter_matrix(train_set[attributes], figsize=(12,8), alpha=0.1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 4.2: Create a transformer, which removes outliers in the dataset.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class RemoveOutliers(BaseEstimator, TransformerMixin):\n",
+    "    def fit (self, X, y=None):\n",
+    "        return self\n",
+    "\n",
+    "    def transform(self, X, y=None):\n",
+    "        X=X[(X['median_house_value']!=500001) | (X['median_income']>=2)].reset_index(drop=True)\n",
+    "        # ...\n",
+    "        \n",
+    "        return X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "remOutliers = RemoveOutliers()\n",
+    "X_train_num_tmp = remOutliers.transform(train_set)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "X_train_num_tmp.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\", \"housing_median_age\", \"households\"]\n",
+    "scatter_matrix(X_train_num_tmp[attributes], figsize=(12,8), alpha=0.1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.4. Feature scaling"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One of the most important transformations is the so-called **feature scaling**.\n",
+    "It is necessary because most Machine Learning algorithms do not perform well on numerical **input** attributes that have very different scales.\n",
+    "The output attributes are generally not scaled.\n",
+    "Two commonly used scaling operations are:\n",
+    "- standard scaling, which produces a distribution with zero mean and unit variance,\n",
+    "- min-max scaling, which produces a distribution ranging from 0 to 1."
+   ]
+  },
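+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, these two operations correspond to the following standard formulas, where $\\mu$ and $\\sigma$ are the mean and standard deviation of an attribute and $x_{min}$, $x_{max}$ its minimum and maximum:\n",
+    "\n",
+    "- standard scaling: $x' = \\frac{x - \\mu}{\\sigma}$\n",
+    "- min-max scaling: $x' = \\frac{x - x_{min}}{x_{max} - x_{min}}$"
+   ]
+  },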
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "minmaxScaler = MinMaxScaler()\n",
+    "stdScaler = StandardScaler()\n",
+    "\n",
+    "X = imputer.transform(X_train_num)        # fill in missing values\n",
+    "\n",
+    "X_standard = stdScaler.fit_transform(X)   # standard scaling\n",
+    "X_minmax = minmaxScaler.fit_transform(X)  # min-max scaling\n",
+    "\n",
+    "# put it back into a pandas dataframe\n",
+    "X_train_num_standard = pd.DataFrame(X_standard, columns=X_train_num.columns, index=list(X_train_num.index.values))\n",
+    "X_train_num_minmax = pd.DataFrame(X_minmax, columns=X_train_num.columns, index=list(X_train_num.index.values))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "X_train_num_standard.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_num_minmax.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4.5. Transformation pipelines"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The individual transformation steps can be combined into a transformation pipeline, in which they are simply executed one after another."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "add_bedrooms_per_room = False\n",
+    "\n",
+    "num_pipeline = Pipeline([\n",
+    "    ('imputer', Imputer(strategy='median')),\n",
+    "    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room)),\n",
+    "    ('std_scaler', StandardScaler()),\n",
+    "])\n",
+    "\n",
+    "new_columns = list(X_train_num.columns)+['rooms_per_household', 'population_per_household']\n",
+    "if add_bedrooms_per_room:\n",
+    "    new_columns = list(new_columns)+['bedrooms_per_room']\n",
+    "\n",
+    "X = num_pipeline.fit_transform(X_train_num) # returns a numpy.array\n",
+    "\n",
+    "# put it back into a pandas dataframe\n",
+    "X_train_num_tr = pd.DataFrame(X, columns=new_columns, index=list(X_train_num.index.values))\n",
+    "X_train_num_tr.shape\n",
+    "X_train_num_tr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# transformer that selects the given columns from a DataFrame and returns them as a numpy array\n",
+    "class DataFrameSelector(BaseEstimator, TransformerMixin):\n",
+    "    def __init__(self, attribute_names):\n",
+    "        self.attribute_names = attribute_names\n",
+    "    def fit(self, X, y=None):\n",
+    "        return self\n",
+    "    def transform(self, X):\n",
+    "        return X[self.attribute_names].values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_attribs = list(X_train_num)\n",
+    "cat_attribs = [\"ocean_proximity\"]\n",
+    "\n",
+    "num_pipeline = Pipeline([\n",
+    "    ('selector', DataFrameSelector(num_attribs)),\n",
+    "    ('imputer', Imputer(strategy=\"median\")),\n",
+    "    ('attribs_adder', CombinedAttributesAdder()),\n",
+    "    ('std_scaler', StandardScaler()),\n",
+    "])\n",
+    "\n",
+    "cat_pipeline = Pipeline([\n",
+    "    ('selector', DataFrameSelector(cat_attribs)),\n",
+    "    ('cat_encoder', CategoricalEncoder(encoding=\"onehot-dense\")),\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "full_pipeline = FeatureUnion(transformer_list=[\n",
+    "    (\"num_pipeline\", num_pipeline),\n",
+    "    (\"cat_pipeline\", cat_pipeline),\n",
+    "])\n",
+    "\n",
+    "X = full_pipeline.fit_transform(X_train)\n",
+    "\n",
+    "new_columns = list(X_train.columns)+['rooms_per_household', 'population_per_household']+list(cat_pipeline.named_steps['cat_encoder'].categories_[0])\n",
+    "\n",
+    "\n",
+    "# put it back into a pandas dataframe\n",
+    "X_train_tr = pd.DataFrame(X, columns=new_columns, index=list(X_train.index.values))\n",
+    "\n",
+    "# rename columns for tensorflow\n",
+    "rename_dict = [(' ', '_'), ('<', 'less')]\n",
+    "renamed_columns = list(X_train_tr.columns)\n",
+    "for old, new in rename_dict:\n",
+    "    renamed_columns = [w.replace(old, new) for w in renamed_columns]\n",
+    "\n",
+    "X_train_tr.columns = renamed_columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "X_train_tr.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "## 5. Select and train a model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After exploring and preparing the data, one can now start to select a model and train it.\n",
+    "This training is typically only done using the training set.\n",
+    "Afterwards the test set is used to validate the trained model.\n",
+    "In this chapter several different Machine Learning algorithms are trained and evaluated."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.1. Training"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.1.1. LinearRegression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lin_reg = LinearRegression()\n",
+    "lin_reg.fit(X_train_tr, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.1.2. DecisionTreeRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "tree_reg = DecisionTreeRegressor()\n",
+    "tree_reg.fit(X_train_tr, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.1.3. DNNRegressor"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Construction phase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.logging.set_verbosity(tf.logging.INFO)\n",
+    "\n",
+    "def training_input_fn(x=X_train_tr, y=y_train, batch_size=1):\n",
+    "    return tf.estimator.inputs.pandas_input_fn(\n",
+    "        x=x,\n",
+    "        y=y,\n",
+    "        batch_size=batch_size,\n",
+    "        num_epochs=None,\n",
+    "        shuffle=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dnn_reg = tf.estimator.DNNRegressor(\n",
+    "    hidden_units=[1000, 1000],\n",
+    "    model_dir=\"models/dnn_reg_model\",\n",
+    "    activation_fn=tf.nn.relu,\n",
+    "    feature_columns=[tf.feature_column.numeric_column(x, shape=(1,)) for x in X_train_tr.columns])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##### Execution phase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dnn_reg.train(input_fn=training_input_fn(batch_size=50), steps=5000)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.2. Evaluate on the training set"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For simplicity, we just use the training set here to evaluate our models, which is generally **not** recommended."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# let's try the full pipeline on a few training instances\n",
+    "some_data = X_train.iloc[:8]\n",
+    "some_labels = y_train.iloc[:8]\n",
+    "X = full_pipeline.transform(some_data)\n",
+    "\n",
+    "# put it back into a pandas dataframe\n",
+    "some_data_tr = pd.DataFrame(X, columns=renamed_columns, index=list(some_data.index.values))\n",
+    "some_data_tr.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.2.1. LinearRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Predictions:\", lin_reg.predict(some_data_tr))\n",
+    "print(\"Labels:\", list(some_labels))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lin_reg_predictions = lin_reg.predict(X_train_tr)\n",
+    "lin_mse = mean_squared_error(y_train, lin_reg_predictions)\n",
+    "lin_rmse = np.sqrt(lin_mse)\n",
+    "lin_rmse"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.2.2. DecisionTreeRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Predictions:\", tree_reg.predict(some_data_tr))\n",
+    "print(\"Labels:\", list(some_labels))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tree_reg_predictions = tree_reg.predict(X_train_tr)\n",
+    "tree_mse = mean_squared_error(y_train, tree_reg_predictions)\n",
+    "tree_rmse = np.sqrt(tree_mse)\n",
+    "tree_rmse"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.2.3. DNNRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def test_input_fn(x=X_train_tr, y=y_train):\n",
+    "    return tf.estimator.inputs.pandas_input_fn(\n",
+    "        x=x,\n",
+    "        y=y,\n",
+    "        num_epochs=1,\n",
+    "        shuffle=False)\n",
+    "\n",
+    "dnn_reg_predictions_gen_expr = dnn_reg.predict(input_fn=test_input_fn(x=some_data_tr, y=some_labels))\n",
+    "dnn_reg_predictions = [x['predictions'][0] for x in dnn_reg_predictions_gen_expr]\n",
+    "\n",
+    "print(\"\\nPredictions:\", list(dnn_reg_predictions))\n",
+    "print(\"Labels:\", list(some_labels))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "dnn_reg_predictions_gen_expr = dnn_reg.predict(input_fn=test_input_fn())\n",
+    "dnn_reg_predictions = [x['predictions'][0] for x in dnn_reg_predictions_gen_expr]\n",
+    "\n",
+    "dnn_mse = mean_squared_error(y_train, dnn_reg_predictions)\n",
+    "dnn_rmse = np.sqrt(dnn_mse)\n",
+    "dnn_rmse"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5.3. Better evaluation using cross-validation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One way to evaluate the models would be to split the training set into a smaller training set and a validation set (see the sketch below).\n",
+    "This validation set could then be used for evaluation during the training procedure.\n",
+    "Another way is to use Scikit-Learn's cross-validation function: it randomly splits the training set into 10 distinct subsets (folds), then trains and evaluates the model 10 times, picking a different fold for evaluation each time and training on the other 9 folds."
+   ]
+  },
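+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The first option could look like the following minimal sketch, which uses the `train_test_split()` function imported above (the 80/20 ratio is just an example):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: hold out 20% of the (prepared) training data as a validation set\n",
+    "X_train_part, X_valid, y_train_part, y_valid = train_test_split(\n",
+    "    X_train_tr, y_train, test_size=0.2, random_state=41)\n",
+    "print(len(X_train_part), \"training and\", len(X_valid), \"validation instances\")"
+   ]
+  },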
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def display_scores(scores):\n",
+    "    print(\"Scores:\", scores)\n",
+    "    print(\"Mean:\", scores.mean())\n",
+    "    print(\"Standard deviation:\", scores.std())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.3.1. LinearRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lin_scores = cross_val_score(lin_reg, X_train_tr, y_train, scoring=\"neg_mean_squared_error\", cv=10)\n",
+    "lin_rmse_scores = np.sqrt(-lin_scores)\n",
+    "display_scores(lin_rmse_scores)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.3.2. DecisionTreeRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tree_scores = cross_val_score(tree_reg, X_train_tr, y_train, scoring=\"neg_mean_squared_error\", cv=10)\n",
+    "tree_rmse_scores = np.sqrt(-tree_scores)\n",
+    "display_scores(tree_rmse_scores)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.3.3. DNNRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cross_val_score_dnn_regressor(dnn_reg_estim, X, y, batch_size=50, steps=5000, cv=5):\n",
+    "    scores = list()\n",
+    "    k_fold = KFold(n_splits=cv)\n",
+    "    for train_indices, validation_indices in k_fold.split(X):    \n",
+    "        # clear model dir\n",
+    "        shutil.rmtree(dnn_reg_estim.model_dir, ignore_errors=True)\n",
+    "\n",
+    "        print('\\nTrain: %s | Validation: %s' % (train_indices, validation_indices))\n",
+    "\n",
+    "        # train\n",
+    "        dnn_reg_estim.train(input_fn=training_input_fn(x=X.iloc[train_indices], y=y.iloc[train_indices], batch_size=batch_size), steps=steps)\n",
+    "\n",
+    "        # predict\n",
+    "        dnn_reg_predictions_gen_expr = dnn_reg_estim.predict(input_fn=test_input_fn(x=X.iloc[validation_indices], y=y.iloc[validation_indices]))\n",
+    "        dnn_reg_predictions = [x['predictions'][0] for x in dnn_reg_predictions_gen_expr]\n",
+    "\n",
+    "        # save score\n",
+    "        mse = mean_squared_error(y_train.iloc[validation_indices], dnn_reg_predictions)\n",
+    "        scores.append(mse)\n",
+    "        \n",
+    "    return scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dnn_scores = cross_val_score_dnn_regressor(dnn_reg, X_train_tr, y_train, 50, 3000, cv=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dnn_rmse_scores = np.sqrt(np.asarray(dnn_scores))\n",
+    "display_scores(dnn_rmse_scores)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 5.1: Add more promising models (at least 5 different models in total). Hint: Take a look at the [Supervised Learning Models](http://scikit-learn.org/stable/supervised_learning.html) of Scikit-Learn.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "## 6. Fine-tune your model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After specifying a list of promising models, one can now start to fine-tune them.\n",
+    "This means that we try to find good hyperparameter values.\n",
+    "Doing this manually would cost a lot of time and would be very tedious work.\n",
+    "It is therefore better to automate the hyperparameter search, e.g. with a grid search or a randomized search."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6.1. Grid search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "param_grid = [\n",
+    "    # try 12 (3×4) combinations of hyperparameters\n",
+    "    {'max_depth': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n",
+    "    # then try 6 (2×3) combinations with presort set as True\n",
+    "    {'presort': [True], 'max_depth': [3, 10], 'max_features': [2, 3, 4]},\n",
+    "]\n",
+    "\n",
+    "tree_reg = DecisionTreeRegressor(random_state=42)\n",
+    "\n",
+    "# Train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n",
+    "grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)\n",
+    "grid_search.fit(X_train_tr, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grid_search.best_params_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grid_search.best_estimator_"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "cvres = grid_search.cv_results_\n",
+    "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n",
+    "    print(np.sqrt(-mean_score), params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(grid_search.cv_results_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6.2. Randomized search"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 6.1: Add [RandomizedSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) by replacing the GridSearchCV.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6.3. Analyze the best models and their errors"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You will often gain good insights into the problem by inspecting the best models.\n",
+    "For example, one could take a look at the importances of the features for the best estimator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "feature_importances = grid_search.best_estimator_.feature_importances_\n",
+    "feature_importances"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "extra_attribs = [\"rooms_per_household\", \"population_per_household\", \"bedrooms_per_room\"]\n",
+    "cat_encoder = cat_pipeline.named_steps[\"cat_encoder\"]\n",
+    "cat_one_hot_attribs = list(cat_encoder.categories_[0])\n",
+    "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n",
+    "sorted(zip(feature_importances, attributes), reverse=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6.4. Evaluate your system on the test set"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As already mentioned, up to this point we should only work with the training set and not look at the test set.\n",
+    "Now, after training and fine-tuning our model, we can evaluate the final model on the test set.\n",
+    "The performance achieved on the test set tells us how well our model generalizes to new (unseen) data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_model = grid_search.best_estimator_\n",
+    "\n",
+    "X_test = test_set.drop(\"median_house_value\", axis=1)\n",
+    "y_test = test_set[\"median_house_value\"].copy()\n",
+    "\n",
+    "X_test_tr = full_pipeline.transform(X_test)\n",
+    "final_predictions = final_model.predict(X_test_tr)\n",
+    "\n",
+    "final_mse = mean_squared_error(y_test, final_predictions)\n",
+    "final_rmse = np.sqrt(final_mse)\n",
+    "final_rmse"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# evaluation on train set\n",
+    "final_scores = cross_val_score(final_model, X_train_tr, y_train, scoring=\"neg_mean_squared_error\", cv=10)\n",
+    "final_rmse_scores = np.sqrt(-final_scores)\n",
+    "display_scores(final_rmse_scores)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Exercise 6.2: Try to find the best model-parameter-combination.***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After finding a good model, one can easily save it to disk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "best_reg = final_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# save it\n",
+    "joblib.dump(best_reg, \"models/my_best_model.pkl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load it\n",
+    "best_reg = joblib.load(\"models/my_best_model.pkl\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "## 7. Launch, monitor and maintain your system"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "See powerpoint slides."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}