Commit 804df640 authored by Lennart Holzenkamp's avatar Lennart Holzenkamp
Browse files

Merge branch 'clustering' into distance-calculation-lh-improved

parents 024b3bbd 217142e5
from bisect import bisect_left
import numpy as np
from pandas import DataFrame
from sklearn import cluster
import networkx as nx
import string
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn_extra import cluster as cluster_extra
from sklearn.metrics import silhouette_score
# Seed NumPy's global random state so the demo matrix is reproducible.
np.random.seed(1)
# 100 x 100 demo matrix with entries drawn from a normal distribution
# (mean 0.5, standard deviation 0.01).
matrix = np.random.normal(loc=0.5, scale=0.01, size=(100, 100))
# Small hand-written alternative for quick experiments:
# matrix = np.array([[0, 0.7, 0.9], [0.7, 0, 0.2], [0.9, 0.2, 0]])
def cluster_matrix(matrix, algorithm=None):
    """Fit a clustering estimator and group the row indices by cluster label.

    Args:
        matrix: Data passed unchanged to ``algorithm.fit``.
        algorithm: Estimator object with a ``fit`` method whose fitted model
            exposes ``labels_`` and ``n_clusters``. When omitted, a fresh
            ``cluster.KMeans(n_clusters=2, random_state=0)`` is created for
            this call.

    Returns:
        A tuple ``(list_of_clusters, labels)``. ``list_of_clusters[c]`` is the
        list of row indices labelled ``c``; indices are appended in
        enumeration order, so every inner list is sorted ascending.
        ``labels`` is the fitted model's ``labels_``.

    Side effects:
        Prints the clusters as a pandas ``DataFrame``.
    """
    if algorithm is None:
        # Build the default estimator per call: a default-argument instance
        # would be created once at import time and shared by every caller.
        algorithm = cluster.KMeans(n_clusters=2, random_state=0)

    model = algorithm.fit(matrix)
    labels = model.labels_

    # One bucket per cluster; labels are used directly as bucket indices.
    list_of_clusters = [[] for _ in range(model.n_clusters)]
    for index, label in enumerate(labels):
        list_of_clusters[label].append(index)

    print(DataFrame(list_of_clusters))
    return list_of_clusters, labels
def assign_event_to_cluster_all(items: list[int], list_of_clusters):
    """Return the first cluster that contains every element of ``items``.

    With k clusters, n items and clusters of size m, the scan costs at most
    O(k*n*m). Converting each cluster to a set would itself be linear, so it
    is not worth doing for a single lookup.

    Args:
        items: Indices that must all be members of the same cluster.
        list_of_clusters: Clusters to search, each a collection of indices.

    Returns:
        The first cluster (the original object from ``list_of_clusters``)
        that contains all ``items``, or ``False`` when no cluster does. For
        "all" assignment at most one cluster is returned. An empty ``items``
        is trivially contained in the first cluster.
    """
    for members in list_of_clusters:
        # all() stops at the first missing item instead of scanning the rest.
        if all(item in members for item in items):
            return members
    return False
def assign_event_to_cluster_exists(items: list[int], list_of_clusters):
    """Return every cluster that contains at least one element of ``items``.

    Membership is tested with ``bin_search``, which relies on each cluster
    list being sorted ascending. With k clusters, n items and clusters of
    size m this gives O(k*n*log(m)). The duplicate check against the result
    list is treated as negligible because that list holds fewer than k
    entries.

    Args:
        items: Indices of which at least one must be in a cluster.
        list_of_clusters: Clusters to search; each must be a sorted list.

    Returns:
        A list of all suitable clusters in their original order, or ``False``
        when no cluster contains any of the items.
    """
    suitable_clusters = []
    for members in list_of_clusters:
        # Skip clusters equal to one that was already collected.
        if members in suitable_clusters:
            continue
        # any() stops probing as soon as one item is found in this cluster.
        if any(bin_search(members, item) for item in items):
            suitable_clusters.append(members)
    if not suitable_clusters:
        return False
    return suitable_clusters
def bin_search(list, x):
    """Report whether ``x`` occurs in the ascending-sorted sequence ``list``.

    Uses ``bisect_left`` to find the leftmost insertion point for ``x`` and
    checks whether the element there equals ``x``; this takes O(log(n))
    comparisons.

    Args:
        list: Sequence sorted in ascending order. The name shadows the
            builtin but is kept so existing keyword callers keep working.
        x: Value to look for.

    Returns:
        ``True`` if ``x`` is present, ``False`` otherwise (including for an
        empty sequence).
    """
    i = bisect_left(list, x)
    # i == len(list) means x is larger than every element.
    return bool(i != len(list) and list[i] == x)
def determine_optimal_k(X, algorithm, k_max=10):
    """Pick the number of clusters with the highest silhouette score.

    Every ``k`` from 2 to ``k_max`` (inclusive) is tried: the estimator is
    reconfigured via ``set_params(n_clusters=k)``, fitted on ``X``, and its
    labels are scored with ``silhouette_score`` using the euclidean metric.

    Args:
        X: Data passed to ``fit`` and to ``silhouette_score``.
        algorithm: Estimator supporting ``set_params(n_clusters=...)`` and
            ``fit``, and exposing ``labels_`` afterwards. The passed object
            is reconfigured and refitted in place.
            NOTE(review): assumes ``set_params`` returns the estimator, as
            scikit-learn estimators do - confirm for custom estimators.
        k_max: Largest number of clusters to try; defaults to 10, the value
            that was previously hard-coded.

    Returns:
        The ``k`` with the highest silhouette score; on ties the smallest
        such ``k`` wins.

    Raises:
        ValueError: If ``k_max`` is smaller than 2, leaving nothing to try.
    """
    if k_max < 2:
        raise ValueError("k_max must be at least 2")

    best_k = None
    best_score = None
    for k in range(2, k_max + 1):
        alg = algorithm.set_params(n_clusters=k)
        alg.fit(X)
        score = silhouette_score(X, alg.labels_, metric='euclidean')
        # Track k directly: the old index-based lookup added 1 to a list
        # whose first entry belongs to k=2, so it reported k one too small.
        if best_score is None or score > best_score:
            best_k, best_score = k, score
    return best_k
# Alternative experiments, kept for reference (uncomment one to run it):
# cluster_matrix(matrix, algorithm=cluster_extra.KMedoids(n_clusters=3, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, cluster.KMeans(n_clusters=2, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.SpectralClustering(n_clusters=3, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.AgglomerativeClustering(n_clusters=3))

# Report the number of clusters chosen for the demo matrix with KMeans.
_kmeans = cluster.KMeans(random_state=0)
print(determine_optimal_k(matrix, _kmeans))
# assign_event_to_cluster_exists([1, 4], list_of_clusters)
\ No newline at end of file
......@@ -67,4 +67,4 @@ print('last 10 indexes:')
print(res['index'][-10:])
print('duration: ' + str(datetime.datetime.now() - start_ts))
quit()
\ No newline at end of file
quit()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment