Commit fc0d6360 authored by Lennart Holzenkamp

adds clustering to the main program, adds event assignment, creation of OCELs and storing of them

parent 804df640
......@@ -11,4 +11,5 @@ documentation/*/*.bbl
documentation/*/*.blg
data/*
legacy.md
**/__pycache__
\ No newline at end of file
**/__pycache__
data/clustered_ocel_files/*
\ No newline at end of file
from bisect import bisect_left
import numpy as np
from pandas import DataFrame
from sklearn import cluster
import networkx as nx
import string
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn_extra import cluster as cluster_extra
from sklearn.metrics import silhouette_score
np.random.seed(1)
matrix = np.random.normal(0.5,0.01, size=(100, 100))
#matrix = np.array([[0,0.7,0.9], [0.7,0,0.2], [0.9,0.2,0]])
# the list_of_clusters elements, which are returned in a list, are ordered by index
......@@ -34,7 +25,6 @@ def cluster_matrix(matrix, algorithm=cluster.KMeans(n_clusters=2, random_state=0
list_of_clusters[label].append(i)
# list_of_clusters_values[label].append(matrix[i])
print(DataFrame(list_of_clusters))
return list_of_clusters, labels
......@@ -82,16 +72,19 @@ def bin_search(list, x):
else:
return False
def determine_optimal_k(X, algorithm):
sil = []
k_max = 10
for k in range(2, k_max+1):
def determine_optimal_k(X, algorithm, k_max=20):
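# Fits `algorithm` for every k in [2, k_max) and returns the k with the highest silhouette score.
# Assumes the algorithm supports set_params(n_clusters=...) and exposes labels_ after fitting.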
score_max = 0.0
best_k = 2
results = []
for k in range(2, k_max):
alg = algorithm.set_params(n_clusters=k)
alg.fit(X)
labels = alg.labels_
sil.append(silhouette_score(X, labels, metric = 'euclidean'))
best_k = sil.index(max(sil)) + 1
score = silhouette_score(X, labels, metric = 'euclidean')
results.append([k, score])
if score > score_max:
best_k = k
score_max = score
return best_k
......@@ -100,5 +93,5 @@ def determine_optimal_k(X, algorithm):
# list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.SpectralClustering(n_clusters=3, random_state=0))
#list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.AgglomerativeClustering(n_clusters=3))
print(determine_optimal_k(matrix, cluster.KMeans(random_state=0)))
# print(determine_optimal_k(matrix, cluster.KMeans(random_state=0)))
# assign_event_to_cluster_exists([1,4], list_of_clusters)
\ No newline at end of file
import datetime
from enum import unique
from functools import cache
import itertools
import math
from operator import index
from typing import Any
from matplotlib.cbook import flatten
from matplotlib.pyplot import axis
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import sklearn
from sympy import EX
import constants as c
import scipy.spatial.distance as spd
from pandarallel import pandarallel
def count_non_nans(data):
count = 0
......@@ -230,11 +222,12 @@ def ocel_get_object_distances(ocel, object_type, weights_per_attribute) -> dict[
# CALCULATING DISTANCES (20-22s)
df_distance_matrix = df_pairwise_attr_distance(df_object_data, attr_func_map, ('_a', '_b'))
# NORMALIZING / WEIGHTING / AVERAGING
df_distance_matrix = df_normalize_columns(df_distance_matrix, list(attr_func_map.keys()))
df_distance_matrix = df_weight_columns(df_distance_matrix, weights_per_attribute)
df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1)
df_distance_matrix = df_normalize_columns(df_distance_matrix, ['distance_avg'])
df_distance_matrix = df_distance_matrix.drop(list(attr_func_map.keys()), axis=1) # single distances not relevant anymore
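# df_distance_matrix now holds a single normalized, weighted average distance per object pair in 'distance_avg'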
# Creating the matrix-index->ocel:oid map
......@@ -246,4 +239,22 @@ def ocel_get_object_distances(ocel, object_type, weights_per_attribute) -> dict[
return {
'distances': df_matrix,
'index': index_to_id_map
}
\ No newline at end of file
}
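# returned dict: 'distances' is the pairwise object-distance matrix, 'index' maps matrix index -> ocel:oid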
# filters the given ocel to the given events (by ocel:eid), keeping only the objects and relations connected to them
def ocel_filter_by_events(ocel: OCEL, events: np.ndarray) -> OCEL:
# setting up dataframe for joining
df_event_ids = pd.DataFrame(events, columns=['ocel:eid'])
df_event_ids['ocel:eid'] = df_event_ids['ocel:eid'].astype(str)
df_event_ids.set_index('ocel:eid', inplace=True)
# creating relation data
res_relations = ocel.relations.join(df_event_ids, on='ocel:eid', how='right') # get all relations for the given events and no others (right join)
# creating object data
res_objects = res_relations[['ocel:oid', 'ocel:type']].join(ocel.objects.set_index(['ocel:oid', 'ocel:type']), on=['ocel:oid', 'ocel:type'], how='left')[ocel.objects.columns]
# creating event data
res_events = ocel.events.join(df_event_ids, on=['ocel:eid'], how='right')
# assembling ocel
res = OCEL(res_events, res_objects, res_relations, ocel.globals, ocel.parameters)
return res
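# Minimal usage sketch (hypothetical event ids):
# sub_ocel = ocel_filter_by_events(ocel, np.array(['e1', 'e2']))
# sub_ocel then contains only these events plus the objects and relations connected to them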
\ No newline at end of file
import datetime
import math
import os
from os.path import exists
import pandas as pd
import pm4py
import scipy
import constants as c
import numpy as np
import functions as f
import cluster
from sklearn import cluster as skcluster
#
# TEST-SETTING
......@@ -21,24 +24,22 @@ test_factor = 0.1 # [0,1]: share of data to use
p_ocel_file = './data/ocel.jsonocel'
# all | existence
p_mode = 'all'
p_mode = 'existence'
# object-type-name
p_object_type = 'items'
p_object_type = 'products'
# example data for attribute weighting in distance measurements
p_attr_weights = {}
p_attr_weights['control_flow'] = 1
p_attr_weights['bankaccount'] = 1
# clustering-mode
p_clustering_mode = 'k-means'
p_clustering_mode = 'kmeans'
# cluster-count (optional)
p_cluster_count = 6
p_cluster_count = np.NaN
# ocel_file_type
p_ocel_file_type = 'xml'
p_ocel_file_type = 'json'
# graph_file_type
p_graph_file_type = 'svg'
......@@ -46,25 +47,173 @@ p_graph_file_type = 'svg'
#
print('Program started...')
print('-------------------------------------------------------')
print(' SETTINGS ')
print('p_ocel_file: "' + str(p_ocel_file) + '".')
print('p_mode: "' + str(p_mode) + '".')
print('p_object_type: "' + str(p_object_type) + '".')
print('p_attr_weights:')
print(p_attr_weights)
print('p_clustering_mode: "' + str(p_clustering_mode) + '".')
print('p_cluster_count: "' + str(p_cluster_count) + '".')
print('p_ocel_file_type: "' + str(p_ocel_file_type) + '".')
print('p_graph_file_type: "' + str(p_graph_file_type) + '".')
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
# non-data-based assertions
assert p_mode in c.MODES, 'selected mode not possible. Use either "all" or "existence".'
assert exists(p_ocel_file), 'file does not exist'
algorithms = {
'kmeans': skcluster.KMeans(),
'spectral': skcluster.SpectralClustering(),
'agglomerative': skcluster.AgglomerativeClustering()
}
assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + p_clustering_mode + '" is not available. Use one of: ' + ', '.join(algorithms.keys()) + '.'
# reading ocel data
ocel = pm4py.read_ocel(p_ocel_file)
# calculating distances of objects based on control-flow and all non-NaN attributes of objects
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' DISTANCE CALCULATION ')
print('distances:')
print(res['distances'])
print('first 10 indexes:')
print('first 10 indexes of map:')
print(res['index'][:10])
print('last 10 indexes:')
print('last 10 indexes of map:')
print(res['index'][-10:])
print('duration: ' + str(datetime.datetime.now() - start_ts))
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
distance_matrix = res['distances']
index_to_oid_map = res['index']
algo = algorithms[p_clustering_mode]
try:
cluster_count = int(p_cluster_count)
assert cluster_count >= 2, 'cluster_count needs to be at least 2'
assert cluster_count < len(index_to_oid_map), 'cluster_count needs to be less than the count of distinct objects in the ocel-data.'
except:
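# no valid cluster count given (e.g. NaN): fall back to silhouette-based selection of k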
cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=math.floor(len(index_to_oid_map) / 2)+1)
algo.set_params(n_clusters=cluster_count)
cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
df_clusters = pd.DataFrame({'cluster': cluster_res[1]}, index=index_to_oid_map) # creating dataframe
df_clusters.index.name='ocel:oid' # setting index name for joining
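# df_clusters maps each ocel:oid (index) to its cluster label; it is joined with the relation data below to propagate cluster labels to events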
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' OBJECT-CLUSTERING ')
print('clustering-technique:')
print(p_clustering_mode)
print('given cluster-count:')
print(p_cluster_count)
print('cluster-count used:')
print(cluster_count)
print('object-cluster-dataframe:')
print(df_clusters)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
df_object_data = ocel.relations[ocel.relations['ocel:type'] == p_object_type].set_index('ocel:oid')[['ocel:eid']]
# get relation data for objects of the selected type, set the index to oid for joining, and load only the ocel:eid column
df_event_object_clusters = df_clusters.join(df_object_data, how='right').reset_index()
# join object-cluster and object-event data, so every event now has its potential cluster candidates assigned
df_cluster_candidates = df_event_object_clusters[['ocel:eid', 'cluster']].drop_duplicates()
# contains all ocel:eid->cluster combinations. now a special aggregation has to be done (existence/all approach)
# function for the 'all' approach: returns the cluster number (0..n), -1 if the cluster is not unique, or -2 if the cluster list is empty
def get_single_value(series: pd.Series):
arr = series.unique()
length = len(arr)
if length > 1:
return -1
elif length == 0:
return -2 # error, should not occur
else:
return int(arr[0]) # first element (all are the same)
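# e.g. (hypothetical input): get_single_value(pd.Series([3, 3])) -> 3, get_single_value(pd.Series([1, 2])) -> -1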
if p_mode == 'all':
clusters = df_cluster_candidates.groupby('ocel:eid').agg({'cluster': get_single_value})
clusters = clusters[clusters['cluster']>=0]
elif p_mode == 'existence':
clusters = df_cluster_candidates.set_index('ocel:eid')
# clusters now contains an event->cluster mapping.
# All-approach: clusters contains at most as many event ids as there are events related to an instance of the specified object type.
# Cause: one event can relate to multiple instances (e.g. buying multiple products) that may lie in different clusters; such an event is assigned to no cluster.
# Existence-approach: clusters contains at least as many event ids as there are events related to an instance of the specified object type.
# Cause: one event can relate to multiple instances (e.g. buying multiple products) that may lie in different clusters; such an event is assigned to multiple clusters.
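# Example (hypothetical): event e1 relates to objects in clusters 0 and 1, event e2 only to objects in cluster 2.
#   all-approach:       e1 is dropped (ambiguous), e2 -> cluster 2
#   existence-approach: e1 -> clusters 0 and 1, e2 -> cluster 2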
clusters.reset_index(inplace=True)
ocel_clusters = []
groups = clusters.groupby('cluster')['ocel:eid'].apply(list).reset_index(name='event_list').set_index('cluster')
groups = groups.to_dict()['event_list'] # now: {<cluster_0>: [event_01, ..., event_m], ..., <cluster_n>: [event_234, ..., event_8992]}
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' EVENT-ASSIGNING ')
print('selected mode: ' + p_mode)
print('Event groups:')
for dict_key in groups.keys():
print('cluster "' + str(dict_key) + '":')
group = groups[dict_key]
len_group = len(group)
print('-- element count: ' + str(len_group))
print('-- first 5 elements:')
for ii in range(0, min(5, len_group)):
print('-- -- ' + group[ii])
print('-- last 5 elements:')
for ii in range(max(0, len_group-5), len_group):
print('-- -- ' + group[ii])
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
res = []
for ii in range(0, cluster_count):
tmp_ocel = f.ocel_filter_by_events(ocel, groups[ii])
res.append(tmp_ocel)
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' CREATING SEPARATE OCELs ')
print('ocels:')
print(res)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
# storing ocels
print('-------------------------------------------------------')
print(' STORING CLUSTERED-OCEL-FILES ')
directory = os.path.dirname(p_ocel_file) + '/clustered_ocel_files'
if not os.path.exists(directory):
os.makedirs(directory)
appendix_len = len(str(cluster_count))
for ii in range(0, len(res)):
filename = directory + '/cluster_' + str(ii+1).rjust(appendix_len, '0') + '.' + p_ocel_file_type + 'ocel'
pm4py.write_ocel(res[ii], filename)
print(str(ii+1).rjust(appendix_len, '0') + '/' + str(cluster_count) + ': "' + filename + '" stored.')
print('-------------------------------------------------------')
quit()