Updated the solution with the more efficient one discussed in the course

b09e32ec · Jammer, Tim · ed4e7617 · b09e32ec
Commit b09e32ec authored Sep 12, 2022 by Jammer, Tim
--- a/exercises/Numpy_KMeansClustering/NumPy_KMeansClustering_stdPython.ipynb
+++ b/exercises/Numpy_KMeansClustering/NumPy_KMeansClustering_stdPython.ipynb
@@ -139,6 +139,38 @@
    "    return coords_center"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "662132b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# return the updated locations of the cluster centers\n",
+    "def compute_centers_efficient(coords, labels, n_centers):\n",
+    "    # Provide your implementation here. \n",
+    "    # **HINT**:\n",
+    "    # \n",
+    "    # Use advanced indexing with boolean masks to access\n",
+    "    # all points that have a label corresponding to the \n",
+    "    # index of a cluster center.\n",
+    "    new_center_coords=[[0,0] for x in range(n_centers)]\n",
+    "    number_of_points=[0 for x in range(n_centers)]\n",
+    "    \n",
+    "    for label, coordinate in zip (labels,coords):\n",
+    "        new_center_coords[label][1]+=coordinate[1]#y\n",
+    "        new_center_coords[label][0]+=coordinate[0]#x\n",
+    "        number_of_points[label]+=1\n",
+    "        \n",
+    "    for coord,num_point in zip(new_center_coords,number_of_points):\n",
+    "        assert num_point != 0, \"Error - found cluster size with value 0.\"\n",
+    "        coord[1]=coord[1]/num_point\n",
+    "        coord[0]=coord[0]/num_point\n",
+    "        \n",
+    "    return new_center_coords\n",
+    "    "
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -283,7 +315,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },

 %% Cell type:code id:partial-munich tags:
 ``` python
 %matplotlib inline
 from matplotlib import pyplot as plt
 import numpy as np
 import importlib
 import helper
 importlib.reload(helper)
 import math
 from IPython.display import clear_output
 from time import sleep, time
 ```
 %% Cell type:markdown id:honest-mexico tags:
 ## Beispiel fuer einen Datensatz mit 4 Clustern
 %% Cell type:code id:invalid-baseball tags:
 ``` python
 # Load the sample points; the file is comma separated.
 dataset = np.loadtxt("sample-data/coords-with-labels-4.dat", delimiter=",")
 # The first two rows of the transposed data are used as coordinates and the
 # last row, cast to int, as the label of each point
 # (assumes one point per row in the file - TODO confirm against the data).
 coords = dataset.T[:2].T
 labels = dataset.T[-1].astype(int)
 # Number of distinct labels that occur in the data.
 num_labels = np.unique(labels).size
 # One array of points per label value 0 .. num_labels - 1, selected with a boolean mask.
 coords_by_label = [coords[labels == tt] for tt in range(num_labels)]
 # Cluster centers shipped alongside the sample data.
 coords_center = np.loadtxt("sample-data/cluster-centers-4.dat", delimiter=",")
 ```
 %% Cell type:code id:piano-vehicle tags:
 ``` python
 # Set up the figure. helper.init_figure() returns two objects that are used
 # as plot targets below (presumably matplotlib axes - confirm in helper).
 ax1, ax2 = helper.init_figure()
 # Scatter plot of coords without clustering: all points are passed as a single
 # group with an empty label.
 helper.make_scatter_plot(ax1, coords=[coords], labels=[""])
 # Scatter plot of coords assigned to clusters: one group, one label and one
 # marker entry per label value.
 helper.make_scatter_plot(
    ax2,
    coords_by_label,
    labels=[f"cluster {tt}" for tt in range(num_labels)],
    markers=["o"] * num_labels
 )
 # Plot cluster centers on the same target as the clustered points, all in
 # black, and request the legend with this last call.
 helper.make_scatter_plot(
    ax2,
    coords_center,
    labels=[f"centeroid {tt}" for tt in range(num_labels)],
    colors=["black"] * num_labels,
    with_legend=True,
 )
 ```
 %% Cell type:markdown id:collectible-detector tags:
 ## Implementation using standard Python only
 %% Cell type:code id:3084ddcb tags:
 ``` python
 # return True, if centers have not changed and the algorithm can therefore stop
 def centers_have_not_changed(a, b, rtol=1e-05, atol=1e-08):
    """Return True if the centers in `a` equal those in `b` within tolerance.

    Two coordinates count as unchanged when they are equal, or when
    ``abs(x - y) <= atol + rtol * abs(y)``. This is the same test (and the
    same default tolerances) that ``numpy.isclose`` applies, so the result
    agrees with ``numpy.allclose(a, b)`` for equally shaped inputs.

    Args:
        a: iterable of points (each an iterable of numbers), the new centers.
        b: iterable of points, the reference (old) centers.
        rtol: relative tolerance, scaled by the magnitude of the value in `b`.
        atol: absolute tolerance.

    Returns:
        True if the algorithm can stop because no coordinate moved by more
        than the tolerance, False otherwise. Points and coordinates are paired
        with ``zip``, so surplus entries of the longer input are not compared.
    """
    for point_a, point_b in zip(a, b):
        for coordinate_a, coordinate_b in zip(point_a, point_b):
            # Exactly equal values (including equal infinities) never count
            # as a change; this also avoids computing inf - inf.
            if coordinate_a == coordinate_b:
                continue
            tolerance = atol + rtol * abs(coordinate_b)
            # Written as "not <=" on purpose: every comparison with NaN is
            # False, so a NaN coordinate is reported as changed instead of
            # silently passing the convergence test.
            if not abs(coordinate_a - coordinate_b) <= tolerance:
                return False
    return True
 ```
 %% Cell type:code id:0ae4cfd1 tags:
 ``` python
 # return the updated locations of the cluster centers
 def compute_centers(coords, labels, n_centers):
    """Return the updated locations of the cluster centers.

    For each center index 0 .. n_centers - 1 the new location is the
    arithmetic mean of all two-dimensional points whose label equals that
    index. All points are scanned once per center.

    Returns a list with one ``[x, y]`` entry per center. An AssertionError is
    raised (via ``assert``) for a center that has no point assigned to it.
    """
    centroids = []
    for center_index in range(n_centers):
        total_x = total_y = 0
        members = 0
        # Every point is unpacked, whether or not it belongs to this center.
        for tag, (px, py) in zip(labels, coords):
            if center_index != tag:
                continue
            members += 1
            total_x += px
            total_y += py
        assert members > 0, "Error - found cluster size with value 0."
        # Mean of the member coordinates gives the new center location.
        centroids.append([total_x / members, total_y / members])
    return centroids
 ```
+%% Cell type:code id:662132b4 tags:
+``` python
+# return the updated locations of the cluster centers
def compute_centers_efficient(coords, labels, n_centers):
    """Return the updated locations of the cluster centers in a single pass.

    Instead of rescanning all points once per center (as compute_centers
    does), the x/y sums and the point count of every cluster are accumulated
    while walking over the points once, and then divided to get the means.

    Args:
        coords: iterable of points; index 0 is read as x and index 1 as y.
        labels: iterable with the center index assigned to each point.
        n_centers: number of cluster centers.

    Returns:
        A list with one ``[x, y]`` entry per center.

    Raises:
        IndexError: if a label is not in ``range(n_centers)``. Negative labels
            are rejected explicitly because list indexing would otherwise
            silently credit the point to a cluster counted from the end.
        AssertionError: if a center has no point assigned to it.
    """
    sums = [[0, 0] for _ in range(n_centers)]
    counts = [0] * n_centers

    for label, coordinate in zip(labels, coords):
        if not 0 <= label < n_centers:
            raise IndexError(
                f"label {label} is out of range for {n_centers} cluster centers"
            )
        sums[label][0] += coordinate[0]  # x
        sums[label][1] += coordinate[1]  # y
        counts[label] += 1

    for total, count in zip(sums, counts):
        # Raised explicitly (same exception type and message as before) so the
        # check is not stripped when Python runs with -O.
        if count == 0:
            raise AssertionError("Error - found cluster size with value 0.")
        total[0] = total[0] / count
        total[1] = total[1] / count

    return sums
+```
 %% Cell type:code id:3c7f163f tags:
 ``` python
 # return the list of *indices* of the cluster centers for the coordinates
 def find_closest_center(coords, coords_center):
    """Return the list of *indices* of the closest cluster center per point.

    Closeness is judged by the squared Euclidean distance, which orders the
    centers exactly like the distance itself but needs no square root.
    If several centers are equally close, the one with the lowest index wins.

    Args:
        coords: iterable of points (each an iterable of numbers).
        coords_center: iterable of cluster centers; it is traversed once per
            point, so it must be re-iterable (list, array, ...).

    Returns:
        A list with one center index per point, in the order of `coords`.

    Raises:
        ValueError: if there is at least one point but no cluster center.
    """
    labels = []
    # For *all* points search the closest cluster center.
    for point in coords:
        best_index, best_dist = None, float("inf")
        for index, center in enumerate(coords_center):
            # Squared distance of the point to this cluster center.
            dist = sum((x - y) ** 2 for x, y in zip(point, center))
            # The first center is always accepted, so the result is a valid
            # index however large the distances are; a NaN distance never
            # replaces an earlier candidate because "<" is False for NaN.
            if best_index is None or dist < best_dist:
                best_index, best_dist = index, dist
        if best_index is None:
            raise ValueError("coords_center must contain at least one cluster center")
        labels.append(best_index)
    return labels
 ```
 %% Cell type:code id:0707a439 tags:
 ``` python
 # The driver function is supplied, you do not need to change it
 def kmeans(coords, n_centers, n_iter, initial_random_state=42,visualize_progres=True,sleep_time=0.5):
    """Driver of the k-means iteration (supplied with the exercise).

    Starting from `n_centers` distinct, randomly chosen points of `coords`,
    the loop alternates between assigning every point to its closest center
    and recomputing the centers, until the centers stop moving or `n_iter`
    iterations have been performed.

    Args:
        coords: array of points; rows are selected from it as initial centers.
        n_centers: number of cluster centers.
        n_iter: maximum number of iterations.
        initial_random_state: seed for the choice of the initial centers.
        visualize_progres: if true, plot the state in every iteration.
        sleep_time: pause in seconds before each plot is redrawn.

    Returns:
        The tuple ``(coords_center, labels)`` of the last iteration.
    """
    def show_state(centers, assignment):
        # Pause, wipe the previous output and draw the current clustering.
        sleep(sleep_time)
        clear_output(wait=True)
        # For visualization, the lists have to be converted back into numpy arrays.
        helper.plot_clustering(n_centers,coords,np.asarray(centers),np.asarray(assignment))

    # Pick n_centers different rows of coords as the initial cluster centers.
    rng = np.random.RandomState(initial_random_state)
    start_rows = rng.choice(coords.shape[0], n_centers, replace=False)
    # "..." is the ellipsis: it stands for all remaining axes of the array.
    coords_center = coords[start_rows, ...].copy()
    coords_center_old = coords_center.copy()

    for i in range(n_iter):
        # Assignment step: index of the closest center for each point.
        labels = find_closest_center(coords, coords_center)
        if visualize_progres:
            show_state(coords_center, labels)

        # Update step: remember the previous centers for the convergence
        # test, then move every center to the mean of its points.
        coords_center_old = coords_center
        coords_center = compute_centers(coords, labels, n_centers)

        # Not converged yet: go on with the next iteration.
        if not centers_have_not_changed(coords_center, coords_center_old):
            continue

        if visualize_progres:
            # Visualize the final state.
            show_state(coords_center, labels)
        print("Finished after %d iterations"%i)
        break
    return coords_center, labels
 ```
 %% Cell type:code id:authorized-slovenia tags:
 ``` python
 def main(n_clusters, dataset, n_iter=1000):
    """Run k-means on `dataset` with visualisation and print the final centers.

    Args:
        n_clusters: number of clusters to look for.
        dataset: array holding the points; only the first two rows of its
            transpose are used as coordinates, anything else is ignored.
        n_iter: maximum number of iterations if the algorithm does not
            converge earlier.
    """
    coords = dataset.T[:2].T
    # Seed from the current time so that every run starts from a different
    # initial state; use a fixed value here for reproducible runs.
    seed = int(time())
    coords_center, _ = kmeans(
        coords=coords,  # the points to be clustered
        n_centers=n_clusters,
        n_iter=n_iter,
        initial_random_state=seed,
        visualize_progres=True,  # turn off to skip waiting for the visualization
        sleep_time=0.5,  # lower values speed up the visualization
    )
    print(coords_center)
 ```
 %% Cell type:code id:scientific-compensation tags:
 ``` python
 # Entry point: choose a sample dataset by its number of clusters, load it and
 # run the clustering on it.
 if __name__ == "__main__":
    n_clusters = 4 # change this value to test different datasets
    # The cluster count is part of the file name; values are comma separated.
    dataset = np.loadtxt(f"sample-data/coords-with-labels-{n_clusters}.dat", delimiter=",")
    main(n_clusters, dataset)
 ```
 %% Cell type:code id:098b1f1d-049d-4459-a518-1b2aef76c40e tags:
 ``` python
 ```
 %% Cell type:code id:6c4b7a8a tags:
 ``` python
 ```