Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Scientific-Data-Processing-With-Python
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
HKHLR
Scientific-Data-Processing-With-Python
Commits
b09e32ec
Commit
b09e32ec
authored
2 years ago
by
Jammer, Tim
Browse files
Options
Downloads
Patches
Plain Diff
updated solution with more efficient one discussed in the course
parent
ed4e7617
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
exercises/Numpy_KMeansClustering/NumPy_KMeansClustering_stdPython.ipynb
+33
-1
33 additions, 1 deletion
...y_KMeansClustering/NumPy_KMeansClustering_stdPython.ipynb
with
33 additions
and
1 deletion
exercises/Numpy_KMeansClustering/NumPy_KMeansClustering_stdPython.ipynb
+
33
−
1
View file @
b09e32ec
...
@@ -139,6 +139,38 @@
...
@@ -139,6 +139,38 @@
" return coords_center"
" return coords_center"
]
]
},
},
{
"cell_type": "code",
"execution_count": null,
"id": "662132b4",
"metadata": {},
"outputs": [],
"source": [
"# return the updated locations of the cluster centers\n",
"def compute_centers_efficient(coords, labels, n_centers):\n",
" # Provide your implementation here. \n",
" # **HINT**:\n",
" # \n",
" # Use advanced indexing with boolean masks to access\n",
" # all points that have a label corresponding to the \n",
" # index of a cluster center.\n",
" new_center_coords=[[0,0] for x in range(n_centers)]\n",
" number_of_points=[0 for x in range(n_centers)]\n",
" \n",
" for label, coordinate in zip (labels,coords):\n",
" new_center_coords[label][1]+=coordinate[1]#y\n",
" new_center_coords[label][0]+=coordinate[0]#x\n",
" number_of_points[label]+=1\n",
" \n",
" for coord,num_point in zip(new_center_coords,number_of_points):\n",
" assert num_point != 0, \"Error - found cluster size with value 0.\"\n",
" coord[1]=coord[1]/num_point\n",
" coord[0]=coord[0]/num_point\n",
" \n",
" return new_center_coords\n",
" "
]
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": null,
"execution_count": null,
...
@@ -283,7 +315,7 @@
...
@@ -283,7 +315,7 @@
],
],
"metadata": {
"metadata": {
"kernelspec": {
"kernelspec": {
"display_name": "Python 3
(ipykernel)
",
"display_name": "Python 3",
"language": "python",
"language": "python",
"name": "python3"
"name": "python3"
},
},
...
...
%% Cell type:code id:partial-munich tags:
%% Cell type:code id:partial-munich tags:
```
python
```
python
%
matplotlib
inline
%
matplotlib
inline
from
matplotlib
import
pyplot
as
plt
from
matplotlib
import
pyplot
as
plt
import
numpy
as
np
import
numpy
as
np
import
importlib
import
importlib
import
helper
import
helper
importlib
.
reload
(
helper
)
importlib
.
reload
(
helper
)
import
math
import
math
from
IPython.display
import
clear_output
from
IPython.display
import
clear_output
from
time
import
sleep
,
time
from
time
import
sleep
,
time
```
```
%% Cell type:markdown id:honest-mexico tags:
%% Cell type:markdown id:honest-mexico tags:
## Beispiel fuer einen Datensatz mit 4 Clustern
## Beispiel fuer einen Datensatz mit 4 Clustern
%% Cell type:code id:invalid-baseball tags:
%% Cell type:code id:invalid-baseball tags:
```
python
```
python
# Load the sample data set; the file is parsed as comma-separated values.
dataset = np.loadtxt("sample-data/coords-with-labels-4.dat", delimiter=",")
# The first two columns are used as point coordinates, the last column
# (cast to int) as the reference cluster label of each point.
coords, labels = dataset.T[:2].T, dataset.T[-1].astype(int)
# Number of distinct labels present in the data.
num_labels = np.unique(labels).size
# One coordinate array per label, selected with a boolean mask.
# NOTE(review): this assumes the labels are exactly 0 .. num_labels-1 - confirm
# against the data file.
coords_by_label = list(coords[labels == tt] for tt in range(num_labels))
# Reference cluster centers shipped with the sample data.
coords_center = np.loadtxt("sample-data/cluster-centers-4.dat", delimiter=",")
```
```
%% Cell type:code id:piano-vehicle tags:
%% Cell type:code id:piano-vehicle tags:
```
python
```
python
# Create the two axes used below via the project helper module.
# NOTE(review): `helper` is project-local; its exact behavior is not visible here.
ax1, ax2 = helper.init_figure()

# Scatter plot of coords without clustering.
helper.make_scatter_plot(ax1, coords=[coords], labels=[""])

# Scatter plot of coords assigned to clusters (one series per label).
helper.make_scatter_plot(
    ax2,
    coords_by_label,
    labels=[f"cluster {tt}" for tt in range(num_labels)],
    markers=["o"] * num_labels
)

# Plot cluster centers in black on the same axes, with a legend.
helper.make_scatter_plot(
    ax2,
    coords_center,
    labels=[f"centeroid {tt}" for tt in range(num_labels)],
    colors=["black"] * num_labels,
    with_legend=True,
)
```
```
%% Cell type:markdown id:collectible-detector tags:
%% Cell type:markdown id:collectible-detector tags:
## Implementation using standard Python only
## Implementation using standard Python only
%% Cell type:code id:3084ddcb tags:
%% Cell type:code id:3084ddcb tags:
```
python
```
python
def centers_have_not_changed(a, b, rtol=1e-05, atol=1e-08):
    """Return True if the cluster centers did not move, so iteration can stop.

    Two centers count as "the same" when every coordinate pair satisfies
    ``abs(x_a - x_b) <= atol + rtol * abs(x_b)``, i.e. the same criterion
    that ``numpy.isclose`` documents, evaluated here in plain Python.

    Parameters
    ----------
    a, b : sequences of points, each point a sequence of coordinates
        The new and the previous center locations.
    rtol, atol : float, optional
        Relative and absolute tolerance of the comparison.

    Returns
    -------
    bool
        True if all coordinates agree within tolerance, False otherwise
        (including when the two inputs differ in length).
    """
    # zip() would silently drop surplus centers, so a length mismatch must be
    # reported as a change explicitly.
    if len(a) != len(b):
        return False

    for point_a, point_b in zip(a, b):
        if len(point_a) != len(point_b):
            return False
        for coordinate_a, coordinate_b in zip(point_a, point_b):
            # Strict ">" so that identical values compare equal even when
            # both tolerances are zero.
            if abs(coordinate_a - coordinate_b) > atol + rtol * abs(coordinate_b):
                return False
    return True
```
```
%% Cell type:code id:0ae4cfd1 tags:
%% Cell type:code id:0ae4cfd1 tags:
```
python
```
python
def compute_centers(coords, labels, n_centers):
    """Return the updated locations of the cluster centers (plain Python).

    For every cluster index the full list of points is scanned and the
    arithmetic mean of the points carrying that label is computed.

    Parameters
    ----------
    coords : iterable of 2-element points ``(x, y)``
    labels : iterable of int
        ``labels[k]`` is the cluster index assigned to ``coords[k]``.
    n_centers : int
        Number of clusters; centers are computed for indices
        ``0 .. n_centers - 1``.

    Returns
    -------
    list of [x, y] lists, one per cluster, in cluster-index order.

    Raises
    ------
    AssertionError
        If a cluster has no point assigned to it.
    """
    coords_center = []

    # For every cluster we look up all points that are closest to it.
    for ccidx in range(n_centers):
        ccx, ccy = 0, 0
        cluster_size = 0

        # Find all points "assigned" to the current cluster center.
        for lc, c in zip(labels, coords):
            cx, cy = c
            if ccidx == lc:
                cluster_size += 1
                ccx += cx
                ccy += cy

        # Raised explicitly instead of using `assert`: assert statements are
        # removed under `python -O`, which would turn an empty cluster into a
        # ZeroDivisionError below. The exception type and message are unchanged.
        if cluster_size == 0:
            raise AssertionError("Error - found cluster size with value 0.")

        # The new cluster centre is the arithmetic mean of the coordinates
        # of all points assigned to it.
        coords_center.append([ccx / cluster_size, ccy / cluster_size])

    return coords_center
```
```
%% Cell type:code id:662132b4 tags:
```
python
def compute_centers_efficient(coords, labels, n_centers):
    """Return the updated locations of the cluster centers in a single pass.

    Instead of rescanning all points once per cluster, the coordinate sums
    and the point counts of all clusters are accumulated in one loop over
    the points and divided afterwards.

    Parameters
    ----------
    coords : iterable of points; element 0 is used as x, element 1 as y
    labels : iterable of int
        ``labels[k]`` is the cluster index assigned to ``coords[k]`` and is
        used directly as a list index.
    n_centers : int
        Number of clusters.

    Returns
    -------
    list of [x, y] lists, one per cluster, in cluster-index order.

    Raises
    ------
    AssertionError
        If a cluster has no point assigned to it.
    """
    coordinate_sums = [[0, 0] for _ in range(n_centers)]
    number_of_points = [0] * n_centers

    for label, coordinate in zip(labels, coords):
        coordinate_sums[label][0] += coordinate[0]  # x
        coordinate_sums[label][1] += coordinate[1]  # y
        number_of_points[label] += 1

    new_center_coords = []
    for (sum_x, sum_y), num_point in zip(coordinate_sums, number_of_points):
        # Raised explicitly instead of using `assert`: assert statements are
        # removed under `python -O`, which would turn an empty cluster into a
        # ZeroDivisionError. The exception type and message are unchanged.
        if num_point == 0:
            raise AssertionError("Error - found cluster size with value 0.")
        new_center_coords.append([sum_x / num_point, sum_y / num_point])

    return new_center_coords
```
%% Cell type:code id:3c7f163f tags:
%% Cell type:code id:3c7f163f tags:
```
python
```
python
def find_closest_center(coords, coords_center):
    """Return, for every point, the *index* of its closest cluster center.

    Plain-Python implementation: each point is compared against each center
    using the squared Euclidean distance (the square root is not needed to
    find the minimum).

    Parameters
    ----------
    coords : iterable of points (sequences of coordinates)
    coords_center : iterable of cluster centers (sequences of coordinates)

    Returns
    -------
    list of int
        ``result[k]`` is the index into ``coords_center`` of the center
        closest to ``coords[k]``. On ties the lowest index wins.

    Raises
    ------
    ValueError
        If there is at least one point but no cluster center.
    """
    labels = []

    # For *all* points search the closest cluster centre.
    for c in coords:
        # No magic sentinel index/distance: start without a candidate so that
        # arbitrarily large distances still yield a valid center index.
        min_ccidx, min_dist = None, float("inf")

        # Test each cluster center ...
        for ccidx, cc in enumerate(coords_center):
            # Squared distance of point to cluster centre.
            dist = sum((x - y) ** 2 for x, y in zip(c, cc))

            # Found a new candidate (strict "<" keeps the first of equals).
            if min_ccidx is None or dist < min_dist:
                min_ccidx, min_dist = ccidx, dist

        if min_ccidx is None:
            raise ValueError("Cannot assign points: no cluster centers given.")

        # The *index* of the closest cluster centre is stored.
        labels.append(min_ccidx)

    return labels
```
```
%% Cell type:code id:0707a439 tags:
%% Cell type:code id:0707a439 tags:
```
python
```
python
# The driver function is supplied, you do not need to change it
def kmeans(coords, n_centers, n_iter, initial_random_state=42, visualize_progres=True, sleep_time=0.5):
    """Run the k-means iteration on `coords` and return centers and labels.

    Parameters
    ----------
    coords : numpy array of points (indexed along axis 0)
        The points to be clustered.
    n_centers : int
        Number of clusters.
    n_iter : int
        Maximum number of iterations.
    initial_random_state : int, optional
        Seed for the random selection of the initial centers.
    visualize_progres : bool, optional
        If true, plot the intermediate state in every iteration.
    sleep_time : float, optional
        Pause (passed to `sleep`) before each plot.

    Returns
    -------
    (coords_center, labels)
        The final centers and the label list from the last assignment step.
        NOTE: if the loop ends without convergence, `labels` belong to the
        centers *before* the last update.
    """
    # Initialise the coordinates of the cluster centers by drawing
    # `n_centers` distinct points from the input.
    rng = np.random.RandomState(initial_random_state)
    index = rng.choice(coords.shape[0], n_centers, replace=False)

    # Store coords of the center for iterations
    # INFO: "..." in x[...] is a slicing operation called "ellipsis". You can learn
    # more about it here: https://stackoverflow.com/questions/118370/how-do-you-use-the-ellipsis-slicing-syntax-in-python
    coords_center = coords[index, ...].copy()

    # Stays None only if the loop body never runs (n_iter <= 0).
    labels = None

    for i in range(n_iter):
        # Find closest center for each point
        ### --> you provide this function ###
        labels = find_closest_center(coords, coords_center)

        if visualize_progres:
            # Visualization of the process
            sleep(sleep_time)
            clear_output(wait=True)
            # for visualization, we have to convert the lists back into numpy arrays
            helper.plot_clustering(n_centers, coords, np.asarray(coords_center), np.asarray(labels))

        # Update the centeroids
        coords_center_old = coords_center  # save old version for testing convergence
        ### --> you provide this solution ###
        coords_center = compute_centers(coords, labels, n_centers)

        # Test for convergence
        ### --> you provide this solution ###
        if centers_have_not_changed(coords_center, coords_center_old):
            if visualize_progres:
                # visualize final state
                sleep(sleep_time)
                clear_output(wait=True)
                helper.plot_clustering(n_centers, coords, np.asarray(coords_center), np.asarray(labels))
            # `i` is zero-based, so i + 1 iterations have been performed.
            print("Finished after %d iterations" % (i + 1))
            break

    # Without any iteration there are no labels yet; assign them once so the
    # return statement does not fail on an unbound name.
    if labels is None:
        labels = find_closest_center(coords, coords_center)

    return coords_center, labels
```
```
%% Cell type:code id:authorized-slovenia tags:
%% Cell type:code id:authorized-slovenia tags:
```
python
```
python
def main(n_clusters, dataset, n_iter=1000):
    """Cluster the points in `dataset` with `kmeans` and print the centers.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to search for.
    dataset : numpy array
        Only the first two columns are used, as point coordinates.
        NOTE(review): assumes a 2-D array with one point per row - confirm.
    n_iter : int, optional
        Maximum number of iterations if the algorithm does not converge before.
    """
    # Keep only the coordinate columns; any further columns are ignored here.
    points = dataset.T[:2].T

    kmeans_arguments = {
        # the input data (coordinates of the points to be clustered)
        "coords": points,
        # number of clusters
        "n_centers": n_clusters,
        # maximum number of iterations to perform
        "n_iter": n_iter,
        # initial random seed - use a fixed value, if you want to have the
        # same initial state for every execution
        "initial_random_state": int(time()),
        # Turn off, if you do not want to wait for the visualization
        "visualize_progres": True,
        # the sleep time controls the speed of the visualization (lower means faster)
        "sleep_time": 0.5,
    }
    coords_center, center_labels = kmeans(**kmeans_arguments)

    print(coords_center)
```
```
%% Cell type:code id:scientific-compensation tags:
%% Cell type:code id:scientific-compensation tags:
```
python
```
python
# Script entry point: load the sample data set matching `n_clusters` and run
# the clustering on it.
if __name__ == "__main__":
    n_clusters = 4  # change this value to test different datasets
    # The number of clusters selects the data file; it is parsed as
    # comma-separated values.
    dataset = np.loadtxt(f"sample-data/coords-with-labels-{n_clusters}.dat", delimiter=",")
    main(n_clusters, dataset)
```
```
%% Cell type:code id:098b1f1d-049d-4459-a518-1b2aef76c40e tags:
%% Cell type:code id:098b1f1d-049d-4459-a518-1b2aef76c40e tags:
```
python
```
python
``
`
``
`
%%
Cell
type
:
code
id
:
6
c4b7a8a
tags
:
%%
Cell
type
:
code
id
:
6
c4b7a8a
tags
:
```
python
```
python
```
```
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment