Skip to content
Snippets Groups Projects
Commit b09e32ec authored by Jammer, Tim's avatar Jammer, Tim
Browse files

updated solution with more efficient one discussed in the course

parent ed4e7617
Branches
No related tags found
No related merge requests found
%% Cell type:code id:partial-munich tags: %% Cell type:code id:partial-munich tags:
``` python ``` python
%matplotlib inline %matplotlib inline
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import numpy as np import numpy as np
import importlib import importlib
import helper import helper
importlib.reload(helper) importlib.reload(helper)
import math import math
from IPython.display import clear_output from IPython.display import clear_output
from time import sleep, time from time import sleep, time
``` ```
%% Cell type:markdown id:honest-mexico tags: %% Cell type:markdown id:honest-mexico tags:
## Beispiel fuer einen Datensatz mit 4 Clustern ## Beispiel fuer einen Datensatz mit 4 Clustern
%% Cell type:code id:invalid-baseball tags: %% Cell type:code id:invalid-baseball tags:
``` python ``` python
# Load the labelled sample points from the comma-separated data file.
dataset = np.loadtxt("sample-data/coords-with-labels-4.dat", delimiter=",")
# The first two columns are used as point coordinates ...
coords = dataset.T[:2].T
# ... and the last column as the integer cluster label of every point.
labels = dataset.T[-1].astype(int)
num_labels = np.unique(labels).size
# One coordinate array per label value 0 .. num_labels - 1, selected with a
# boolean mask.
coords_by_label = [coords[labels == label_id] for label_id in range(num_labels)]
# Cluster centers that ship with the sample data set.
coords_center = np.loadtxt("sample-data/cluster-centers-4.dat", delimiter=",")
``` ```
%% Cell type:code id:piano-vehicle tags: %% Cell type:code id:piano-vehicle tags:
``` python ``` python
# Obtain the two axes that the plots below are drawn on.
ax1, ax2 = helper.init_figure()
# Scatter plot of coords without clustering.
helper.make_scatter_plot(ax1, coords=[coords], labels=[""])
# Scatter plot of coords assigned to clusters (one group per label value).
helper.make_scatter_plot(
    ax2,
    coords_by_label,
    labels=[f"cluster {tt}" for tt in range(num_labels)],
    markers=["o"] * num_labels
)
# Plot cluster centers on the same axis, all in black, and request the legend.
helper.make_scatter_plot(
    ax2,
    coords_center,
    labels=[f"centeroid {tt}" for tt in range(num_labels)],
    colors=["black"] * num_labels,
    with_legend=True,
)
``` ```
%% Cell type:markdown id:collectible-detector tags: %% Cell type:markdown id:collectible-detector tags:
## Implementation using standard Python only ## Implementation using standard Python only
%% Cell type:code id:3084ddcb tags: %% Cell type:code id:3084ddcb tags:
``` python ``` python
# return True, if centers have not changed and the algorithm can therefore stop # return True, if centers have not changed and the algorithm can therefore stop
def centers_have_not_changed(a, b, rtol=1e-05, atol=1e-08):
    """Return True if two sets of cluster centers are (almost) identical.

    Two coordinates count as equal when
    ``abs(coordinate_a - coordinate_b) <= atol + rtol * abs(coordinate_b)``,
    i.e. the same (asymmetric) tolerance rule that ``numpy.isclose`` uses.

    Args:
        a: Sequence of points (each a sequence of numbers), the new centers.
        b: Sequence of points, the reference (old) centers.
        rtol: Relative tolerance, scaled by the magnitude of the value in ``b``.
        atol: Absolute tolerance.

    Returns:
        True if every coordinate of every center agrees within the tolerance,
        False otherwise (including when ``a`` and ``b`` hold a different
        number of centers).
    """
    # zip() would silently ignore surplus centers, so compare the sizes first.
    if len(a) != len(b):
        return False
    for point_a, point_b in zip(a, b):
        for coordinate_a, coordinate_b in zip(point_a, point_b):
            # Strictly greater: a difference exactly at the tolerance (in
            # particular 0 with zero tolerances) still counts as unchanged.
            if abs(coordinate_a - coordinate_b) > (atol + rtol * abs(coordinate_b)):
                return False
    return True
``` ```
%% Cell type:code id:0ae4cfd1 tags: %% Cell type:code id:0ae4cfd1 tags:
``` python ``` python
# return the updated locations of the cluster centers # return the updated locations of the cluster centers
def compute_centers(coords, labels, n_centers):
    """Return the updated cluster centers as a list of ``[x, y]`` pairs.

    For every cluster index ``0 .. n_centers - 1`` the new center is the
    arithmetic mean of all 2D points whose label equals that index.

    Args:
        coords: Iterable of 2D points (each unpackable into ``x, y``).
        labels: Iterable with one cluster index per point, aligned with
            ``coords``.
        n_centers: Number of clusters.

    Returns:
        A list with ``n_centers`` entries ``[mean_x, mean_y]``.

    Raises:
        AssertionError: If a cluster has no point assigned to it.
    """
    updated_centers = []
    for center_index in range(n_centers):
        # Collect the points "assigned" to the current cluster center.
        members = [
            (px, py)
            for tag, (px, py) in zip(labels, coords)
            if tag == center_index
        ]
        member_count = len(members)
        assert member_count > 0, "Error - found cluster size with value 0."
        # Sum the member coordinates in their original order ...
        total_x, total_y = 0, 0
        for px, py in members:
            total_x += px
            total_y += py
        # ... and divide by the member count to get the mean.
        updated_centers.append([total_x / member_count, total_y / member_count])
    return updated_centers
``` ```
%% Cell type:code id:662132b4 tags:
``` python
# return the updated locations of the cluster centers
def compute_centers_efficient(coords, labels, n_centers):
    """Return the updated cluster centers using a single pass over the points.

    Running sums of the x and y coordinates and a point counter are kept per
    cluster, so every point is visited once; the sums are then divided by the
    counters to obtain the mean of each cluster.

    Args:
        coords: Iterable of points; index 0 is used as x and index 1 as y.
        labels: Iterable with one cluster index per point, aligned with
            ``coords``. Each label is used directly as a list index.
        n_centers: Number of clusters.

    Returns:
        A list with ``n_centers`` entries ``[mean_x, mean_y]``.

    Raises:
        AssertionError: If a cluster has no point assigned to it.
    """
    sums_x = [0] * n_centers
    sums_y = [0] * n_centers
    counts = [0] * n_centers
    # Accumulate every point into the slot of the cluster it belongs to.
    for cluster, point in zip(labels, coords):
        sums_x[cluster] += point[0]
        sums_y[cluster] += point[1]
        counts[cluster] += 1
    # Turn the per-cluster sums into means.
    centers = []
    for total_x, total_y, count in zip(sums_x, sums_y, counts):
        assert count != 0, "Error - found cluster size with value 0."
        centers.append([total_x / count, total_y / count])
    return centers
```
%% Cell type:code id:3c7f163f tags: %% Cell type:code id:3c7f163f tags:
``` python ``` python
# return the list of *indices* of the cluster centers for the coordinates # return the list of *indices* of the cluster centers for the coordinates
def find_closest_center(coords, coords_center):
    """Return, for every point, the *index* of its closest cluster center.

    Closeness is measured with the squared Euclidean distance, which has the
    same minimum as the Euclidean distance. If several centers are equally
    close, the one with the lowest index wins.

    Args:
        coords: Iterable of points (each a sequence of numbers).
        coords_center: Sequence of cluster centers with the same number of
            coordinates per point as ``coords``.

    Returns:
        A list with one center index per point, in the order of ``coords``.

    Raises:
        ValueError: If there are points to label but no cluster centers.
    """
    labels = []
    # For *all* points search the closest cluster centre.
    for point in coords:
        # Start without a candidate instead of using magic sentinel values, so
        # that arbitrarily large distances still yield a valid index.
        best_index, best_dist = None, None
        for center_index, center in enumerate(coords_center):
            # Squared distance of point to cluster centre.
            dist = sum((p - q) ** 2 for p, q in zip(point, center))
            if best_dist is None or dist < best_dist:
                best_index, best_dist = center_index, dist
        if best_index is None:
            raise ValueError("coords_center must contain at least one cluster center.")
        labels.append(best_index)
    return labels
``` ```
%% Cell type:code id:0707a439 tags: %% Cell type:code id:0707a439 tags:
``` python ``` python
# The driver function is supplied, you do not need to change it # The driver function is supplied, you do not need to change it
def kmeans(coords, n_centers, n_iter, initial_random_state=42,visualize_progres=True,sleep_time=0.5):
    """Run the k-means iteration on ``coords`` and return centers and labels.

    ``n_centers`` distinct rows of ``coords`` are drawn at random (seeded with
    ``initial_random_state``) as the initial centers. Each iteration assigns
    every point to a center with ``find_closest_center``, recomputes the
    centers with ``compute_centers`` and stops early as soon as
    ``centers_have_not_changed`` reports convergence.

    Args:
        coords: Array of points; it must provide ``.shape`` and support
            indexing with an index array (presumably a NumPy array with one
            point per row -- confirm against callers).
        n_centers: Number of cluster centers.
        n_iter: Maximum number of iterations.
        initial_random_state: Seed passed to ``np.random.RandomState``.
        visualize_progres: If true, plot the current state in every iteration
            via ``helper.plot_clustering``.
        sleep_time: Seconds to sleep before each plot refresh.

    Returns:
        Tuple ``(coords_center, labels)`` from the last executed iteration.

    NOTE(review): if ``n_iter`` is 0 the loop body never runs, ``labels`` is
    never assigned and the ``return`` statement fails.
    """
    # Initialise the coordinates of the cluster centers
    rng = np.random.RandomState(initial_random_state)
    index = rng.choice(coords.shape[0], n_centers, replace=False)
    # Store coords of the center for iterations
    coords_center = coords[index, ...].copy()
    coords_center_old = coords_center.copy()
    for i in range(n_iter):
        # Find closest center for each point
        ### --> you provide this function ###
        labels = find_closest_center(coords, coords_center)
        if visualize_progres:
            # Visualization of the process
            sleep(sleep_time)
            clear_output(wait=True)
            # For visualization, the centers and labels (possibly plain lists)
            # are converted into NumPy arrays.
            helper.plot_clustering(n_centers,coords,np.asarray(coords_center),np.asarray(labels))
        # Update the centeroids
        # INFO: "..." in x[...] is a slicing operation called "ellipsis". You can learn
        # more about it here: https://stackoverflow.com/questions/118370/how-do-you-use-the-ellipsis-slicing-syntax-in-python
        coords_center_old = coords_center # save old version for testing convergence
        ### --> you provide this solution ###
        coords_center= compute_centers(coords, labels, n_centers)
        # Test for convergence
        ### --> you provide this solution ###
        if centers_have_not_changed(coords_center, coords_center_old):
            if visualize_progres:
                # visualize final state
                sleep(sleep_time)
                clear_output(wait=True)
                helper.plot_clustering(n_centers,coords,np.asarray(coords_center),np.asarray(labels))
            # NOTE(review): nesting reconstructed from a flattened listing --
            # confirm that this message is meant to be printed even when the
            # visualization is switched off. ``i`` is the zero-based index of
            # the iteration in which convergence was detected.
            print("Finished after %d iterations"%i)
            break
    return coords_center, labels
``` ```
%% Cell type:code id:authorized-slovenia tags: %% Cell type:code id:authorized-slovenia tags:
``` python ``` python
def main(n_clusters, dataset, n_iter=1000):
    """Cluster the points in ``dataset`` with ``kmeans`` and print the centers.

    Args:
        n_clusters: Number of clusters to search for.
        dataset: Array whose first two columns are used as point coordinates.
        n_iter: Maximum number of iterations to perform if the algorithm does
            not converge earlier.
    """
    points = dataset.T[:2].T
    settings = {
        # The input data (coordinates of the points to be clustered).
        "coords": points,
        # Number of clusters.
        "n_centers": n_clusters,
        # Upper bound for the number of iterations.
        "n_iter": n_iter,
        # Time-based seed; use a fixed value instead to get the same initial
        # state for every execution.
        "initial_random_state": int(time()),
        # Turn off if you do not want to wait for the visualization.
        "visualize_progres": True,
        # Controls the speed of the visualization (lower means faster).
        "sleep_time": 0.5,
    }
    # The per-point labels are returned as well but not needed here.
    final_centers, _ = kmeans(**settings)
    print(final_centers)
``` ```
%% Cell type:code id:scientific-compensation tags: %% Cell type:code id:scientific-compensation tags:
``` python ``` python
if __name__ == "__main__":
    # The number of clusters also selects which sample data file is loaded.
    n_clusters = 4 # change this value to test different datasets
    dataset = np.loadtxt(f"sample-data/coords-with-labels-{n_clusters}.dat", delimiter=",")
    # Run the clustering and print the resulting cluster centers.
    main(n_clusters, dataset)
``` ```
%% Cell type:code id:098b1f1d-049d-4459-a518-1b2aef76c40e tags: %% Cell type:code id:098b1f1d-049d-4459-a518-1b2aef76c40e tags:
``` python ``` python
``` ```
%% Cell type:code id:6c4b7a8a tags: %% Cell type:code id:6c4b7a8a tags:
``` python ``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment