Commit d9cbac6c authored by Lennart Holzenkamp
Browse files

Merge branch 'merging-sy-lh'

parents b9deb382 bf20989b
......@@ -11,4 +11,5 @@ documentation/*/*.bbl
documentation/*/*.blg
data/*
legacy.md
**/__pycache__
\ No newline at end of file
**/__pycache__
data/clustered_ocel_files/*
\ No newline at end of file
from bisect import bisect_left
from sklearn import cluster
from sklearn_extra import cluster as cluster_extra
from sklearn.metrics import silhouette_score
# Example input: matrix = np.array([[0,0.7,0.9], [0.7,0,0.2], [0.9,0.2,0]])
def cluster_matrix(matrix, algorithm=None):
    """Fit ``algorithm`` on ``matrix`` and group the row indices by label.

    Args:
        matrix: Data passed unchanged to ``algorithm.fit``.
        algorithm: Estimator object offering ``fit`` and, on the fitted
            model, ``labels_`` and ``n_clusters``. When omitted, a new
            ``cluster.KMeans(n_clusters=2, random_state=0)`` is created.

    Returns:
        A tuple ``(list_of_clusters, labels)``. ``list_of_clusters[j]``
        lists the indices whose label is ``j``; because indices are
        appended while enumerating, every inner list is in ascending order.
        ``labels`` is the ``labels_`` attribute of the fitted model.
    """
    if algorithm is None:
        # Created per call: an estimator instance in the signature default
        # would be built once at import time and shared by all calls.
        algorithm = cluster.KMeans(n_clusters=2, random_state=0)
    model = algorithm.fit(matrix)
    labels = model.labels_
    list_of_clusters = [[] for _ in range(model.n_clusters)]
    for i, label in enumerate(labels):
        list_of_clusters[label].append(i)
    return list_of_clusters, labels
# k = number of clusters
# m = len(list_of_clusters[i]) for every i in [k]
# n = len(items)
def assign_event_to_cluster_all(items: list[int], list_of_clusters):
    """Return the first cluster that contains every item, else ``False``.

    Membership is tested with ``in`` on each cluster, so the worst case is
    O(k*n*m). Building a set per cluster would itself be linear and was
    judged not worth it. If the clusters are disjoint and ``items`` is
    non-empty, at most one cluster can match.

    Args:
        items: Indices that all have to be present in the returned cluster.
        list_of_clusters: Iterable of clusters (lists of indices).

    Returns:
        The matching cluster object itself, or ``False`` when none matches.
        For empty ``items`` the first cluster is returned.
    """
    for candidate in list_of_clusters:
        # all() stops at the first missing item instead of scanning the rest.
        if all(item in candidate for item in items):
            return candidate
    return False
def assign_event_to_cluster_exists(items: list[int], list_of_clusters):
    """Return every cluster that contains at least one of ``items``.

    Each lookup goes through ``bin_search``, which relies on the cluster
    lists being sorted in ascending order; the worst case is therefore
    O(k*n*log(m)) for k clusters of size m and n items. The additional
    ``not in suitable_clusters`` comparison is considered negligible
    because that list is expected to stay small (fewer than k entries).

    Args:
        items: Indices to look for.
        list_of_clusters: Iterable of sorted clusters (lists of indices).

    Returns:
        The list of matching clusters in input order, or ``False`` when no
        cluster matches.
    """
    suitable_clusters = []
    for candidate in list_of_clusters:
        # any() stops searching as soon as one item is found in the cluster.
        found = any(bin_search(candidate, item) for item in items)
        if found and candidate not in suitable_clusters:
            suitable_clusters.append(candidate)
    if not suitable_clusters:
        return False
    return suitable_clusters
def bin_search(list, x):
    """Tell whether ``x`` occurs in the ascending sorted sequence ``list``.

    Uses ``bisect_left``, so the lookup takes O(log(n)) comparisons.

    Returns:
        ``True`` if the leftmost insertion point holds a value equal to
        ``x``, otherwise ``False``.
    """
    # NOTE: the parameter name shadows the builtin ``list`` inside this body.
    pos = bisect_left(list, x)
    return bool(pos < len(list) and list[pos] == x)
def determine_optimal_k(X, algorithm, k_max=20):
    """Pick the cluster count with the highest silhouette score.

    Every k in ``range(2, k_max)`` (``k_max`` itself is excluded) is tried
    by setting ``n_clusters`` on ``algorithm``, fitting it on ``X`` and
    scoring the resulting labels with ``silhouette_score`` using the
    euclidean metric.

    Args:
        X: Data handed to ``algorithm.fit`` and ``silhouette_score``.
        algorithm: Estimator supporting ``set_params(n_clusters=...)``,
            ``fit`` and ``labels_``. It is reconfigured and refitted in
            place.
        k_max: Exclusive upper bound for the tested cluster counts.

    Returns:
        The first k that reached the best score, or 2 when no k was tested.
    """
    best_k = 2
    # Start below every possible score so the best k is chosen even when
    # all silhouette scores are zero or negative.
    score_max = float('-inf')
    for k in range(2, k_max):
        alg = algorithm.set_params(n_clusters=k)
        alg.fit(X)
        score = silhouette_score(X, alg.labels_, metric='euclidean')
        if score > score_max:
            best_k = k
            score_max = score
    return best_k
# cluster_matrix(matrix, algorithm=cluster_extra.KMedoids(n_clusters=3, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, cluster.KMeans(n_clusters=2, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.SpectralClustering(n_clusters=3, random_state=0))
#list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.AgglomerativeClustering(n_clusters=3))
# print(determine_optimal_k(matrix, cluster.KMeans(random_state=0)))
# assign_event_to_cluster_exists([1,4], list_of_clusters)
\ No newline at end of file
This diff is collapsed.
import datetime
import math
import os
from os.path import exists
import pandas as pd
import pm4py
import constants as c
import numpy as np
import functions as f
import cluster
from sklearn import cluster as skcluster
#
# TEST-SETTING
......@@ -20,47 +24,196 @@ test_factor = 0.1 # [0,1]: share of data to use
# path of the ocel file to cluster
p_ocel_file = './data/ocel.jsonocel'
# event assignment mode: 'all' | 'existence'
p_mode = 'existence'
# object-type-name (the previous revision used 'items')
p_object_type = 'product'
# attribute weights for the distance measurement,
# e.g. {'control_flow': 1, 'bankaccount': 1}
p_attr_weights = {}
# clustering-mode (the previous revision used 'k-means')
p_clustering_mode = 'kmeans'
# cluster-count (optional, the previous revision used 6).
# np.nan replaces the alias np.NaN, which NumPy 2.0 removed.
p_cluster_count = np.nan
# ocel_file_type (the previous revision used 'xml')
p_ocel_file_type = 'json'
# graph_file_type
p_graph_file_type = 'svg'
# END PARAMETERS
#
#
# Announce the start (timestamped and plain) and echo every parameter.
f.print_w_ts('Program startet...')
print('Program startet...')
print('-------------------------------------------------------')
print(' SETTINGS ')
print(f'p_ocel_file: "{p_ocel_file!s}".')
print(f'p_mode: "{p_mode!s}".')
print(f'p_object_type: "{p_object_type!s}".')
print('p_attr_weights:')
print(p_attr_weights)
print(f'p_clustering_mode: "{p_clustering_mode!s}".')
print(f'p_cluster_count: "{p_cluster_count!s}".')
print(f'p_ocel_file_type: "{p_ocel_file_type!s}".')
print(f'p_graph_file_type: "{p_graph_file_type!s}".')
print('-------------------------------------------------------')
# Reference point for the duration reported after the distance calculation.
start_ts = datetime.datetime.now()
f.print_w_ts('Reading inputs...')
f.print_w_ts('Params read.')
# non-data-based assertions
# Double-quoted literal so the single quotes around the mode names actually
# appear in the message (''all'' was adjacent-literal concatenation).
assert p_mode in c.MODES, "selected mode not possible. Use either 'all' or 'existence'"
assert exists(p_ocel_file), 'file does not exist'
# Available clustering techniques, keyed by the value of p_clustering_mode.
algorithms = {
    'kmeans': skcluster.KMeans(),
    'spectral': skcluster.SpectralClustering(),
    'agglomerative': skcluster.AgglomerativeClustering()
}
assert p_clustering_mode in algorithms, 'The given clustering mode "' + p_clustering_mode + '" is not available. Use one of: ' + ', '.join(algorithms) + '.'
f.print_w_ts('Reading ocel data...')
ocel = pm4py.read_ocel(p_ocel_file)
f.print_w_ts('ocel data read.')
# Object distances. Only one implementation is executed; the result of the
# formerly preceding call was overwritten right away, so it is kept here as
# a commented alternative only.
# res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
# res = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' DISTANCE CALCULATION ')
print('distances:')
print(res['distances'])
print('first 10 indexes of map:')
print(res['index'][:10])
print('last 10 indexes of map:')
print(res['index'][-10:])
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
distance_matrix = res['distances']
index_to_oid_map = res['index']
algo = algorithms[p_clustering_mode]
try:
    cluster_count = int(p_cluster_count)
    # Explicit check instead of assert so it still runs under "python -O".
    if not 2 <= cluster_count < len(index_to_oid_map):
        raise ValueError('cluster_count needs to be at least 2 and less than the count of distinct objects in the ocel-data.')
except (TypeError, ValueError, OverflowError):
    # No usable count given (e.g. NaN, None or out of range): determine it
    # automatically. Only conversion/validation errors trigger the fallback,
    # so KeyboardInterrupt and SystemExit are no longer swallowed.
    cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=math.floor(len(index_to_oid_map) / 2)+1)
algo.set_params(n_clusters=cluster_count)
cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
df_clusters = pd.DataFrame({'cluster': cluster_res[1]}, index=index_to_oid_map) # creating dataframe
df_clusters.index.name='ocel:oid' # setting index name for joining
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' OBJECT-CLUSTERING ')
print('clustering-technique:')
print(p_clustering_mode)
print('given cluster-count:')
print(p_cluster_count)
print('cluster-count used:')
print(cluster_count)
print('object-cluster-dataframe:')
print(df_clusters)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
# Relation rows of the selected object type, indexed by ocel:oid for the
# join below; only the ocel:eid column is kept.
_is_selected_type = ocel.relations['ocel:type'] == p_object_type
df_object_data = ocel.relations[_is_selected_type].set_index('ocel:oid')[['ocel:eid']]
# Join object->cluster with object->event data, so that every event row
# carries the cluster of the object it relates to.
df_event_object_clusters = df_clusters.join(df_object_data, how='right')
df_event_object_clusters = df_event_object_clusters.reset_index()
# All distinct ocel:eid->cluster combinations. The mode-specific aggregation
# (existence/all approach) is applied to this frame afterwards.
df_cluster_candidates = df_event_object_clusters[['ocel:eid', 'cluster']]
df_cluster_candidates = df_cluster_candidates.drop_duplicates()
# Aggregation used by the 'all' approach.
def get_single_value(series: pd.Series):
    """Collapse a series of cluster numbers into a single cluster number.

    Returns:
        The cluster number as ``int`` when all entries share one value,
        ``-1`` when the series holds more than one distinct value and
        ``-2`` when the series is empty.
    """
    distinct = series.unique()
    count = len(distinct)
    if count == 0:
        return -2  # error, should not occur
    if count > 1:
        return -1
    return int(distinct[0])  # the only distinct value
# Build the event->cluster mapping for the selected mode.
# Existence approach: an event related to several objects (e.g. shopping
# multiple products) that lie in different clusters is assigned to each of
# those clusters, so an event id may occur more than once.
# All approach: such an event is assigned to no cluster, so the mapping may
# hold fewer event ids than there are events related to the object type.
if p_mode == 'existence':
    clusters = df_cluster_candidates.set_index('ocel:eid')
elif p_mode == 'all':
    clusters = df_cluster_candidates.groupby('ocel:eid').agg({'cluster': get_single_value})
    _is_unique_cluster = clusters['cluster'] >= 0
    clusters = clusters[_is_unique_cluster]
clusters = clusters.reset_index()
ocel_clusters = []
# One list of event ids per cluster number.
groups = clusters.groupby('cluster')['ocel:eid'].apply(list)
groups = groups.reset_index(name='event_list')
groups = groups.set_index('cluster')
# now: {<cluster_0>: [event_01, ..., event_m], ..., <cluster_n>: [event_234, ..., event_8992]}
groups = groups.to_dict()['event_list']
duration = datetime.datetime.now() - start_ts
# Report each cluster with its size and its first and last five event ids.
print('-------------------------------------------------------')
print(' EVENT-ASSIGNING ')
print('selected mode: ' + p_mode)
print('Event groups:')
for dict_key, group in groups.items():
    print(f'cluster "{dict_key!s}":')
    len_group = len(group)
    print(f'-- element count: {len_group!s}')
    print('-- first 5 elements:')
    for event_id in group[:5]:
        print('-- -- ' + event_id)
    print('-- last 5 elements:')
    for event_id in group[max(0, len_group - 5):]:
        print('-- -- ' + event_id)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
# One filtered OCEL per cluster number. groups only has keys for clusters
# that received at least one event, so a missing key maps to an empty event
# list instead of raising KeyError.
# NOTE(review): assumes f.ocel_filter_by_events accepts an empty list - confirm.
res = [f.ocel_filter_by_events(ocel, groups.get(ii, [])) for ii in range(cluster_count)]
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' CREATING SEPARATE OCELs ')
print('ocels:')
print(res)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
# storing ocels
print('-------------------------------------------------------')
print(' STORING CLUSTERED-OCEL-FILES ')
# os.path.join keeps the target relative when p_ocel_file has no directory
# part (plain concatenation would yield '/clustered_ocel_files').
directory = os.path.join(os.path.dirname(p_ocel_file), 'clustered_ocel_files')
os.makedirs(directory, exist_ok=True)
# Width of the running number, so all numbers are right-aligned.
appendix_len = len(str(cluster_count))
for ii, cluster_ocel in enumerate(res):
    number = str(ii+1).rjust(appendix_len)
    filename = os.path.join(directory, 'cluster_' + number + '.' + p_ocel_file_type + 'ocel')
    # Write the filtered log of this cluster, not the complete input log.
    pm4py.write_ocel(cluster_ocel, filename)
    print(number + '/' + str(cluster_count) + ' "' + filename + '" stored.')
print('-------------------------------------------------------')
quit()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment