Commit 4fe17995 authored by Lennart Holzenkamp's avatar Lennart Holzenkamp
Browse files

Merge branch 'refactoring' into flask-refactoring-merge-try

parents b5a0f12b 4f2c8b87
# oc-dfg-clustering
Implementation of two clustering techniques, 'Existential' and 'All', to Object-Centric Event Logs (OCEL) to reduce the complexity of Object-Centric Direct Follow Graphs (OC-DFG).
Small library to cluster objects in OCEL files by their control flow and their attributes. Therefore, pairwise distances are calculated, and clustering is then applied to the resulting distance matrix.
# Install the App
# Library
For using the library import the file from code/ocel_clustering/main. There you may use the method ocel_cluster_by_objects. Fill in the arguments according to the in-code documentation. If you would like to play around with the code try the CLI version.
# CLI
For more information, execute the following command — the CLI is self-documenting:
For examples of how to use the CLI, check out the file code/sample-cli-execution.sh.
```powershell
python ./code/cli.py --help
```
# Docker
## Install the Docker-App
This command will load the necessary docker-image, mount your directories and start the container.
- run .\install_app.bat from the oc-dfg-clustering directory
# Start the App
## Start the Docker-App
This command will only work if you already installed the app. Then it will just start the container and bind the CLI to the program.
- Start your docker daemon.
- run .\start_app.bat from the oc-dfg-clustering directory
\ No newline at end of file
import datetime
import os
from os.path import exists
import pm4py
import ocel_clustering.main as ocel_clustering
import json
import argparse
###############
## CONSTANTS ##
###############
DEFAULT_TARGET_DIR = 'ocels_clustered' # default target directory for resulting ocel-files
CLUSTER_MODES = ['existence', 'all'] # valid values for the positional "mode" argument (event-assignment strategy)
####################################################
## DEFINING, VALIDATING AND READING CLI-ARGUMENTS ##
####################################################
# Command line interface: positional arguments (file, object type, mode) plus
# optional tuning flags. Validation callables raise ArgumentTypeError so that
# argparse reports clean usage errors.
CLI = argparse.ArgumentParser()
CLI.add_argument(
    "ocel_file",
    help='The ocel-file to apply the clustering on. File type and suffix has to be either ".jsonocel" or ".xmlocel".',
    type=str
)
CLI.add_argument(
    "object_type",
    help='The type of objects from the ocel file which should be clustered. The object type needs to occur in the given ocel file.',
    type=str
)
CLI.add_argument(
    "mode",
    choices=CLUSTER_MODES,
    # typo fix: "assingments" -> "assignments"
    help='The mode to use for event assignments (either "all" or "existence").',
    type=str
)
CLI.add_argument(
    "--attr_weights",
    type=str,
    default="{}",
    help='The weights for every attribute of the object instances to use for the internal distance calculation. Needs to be a flat attribute->weight json-string. If you would like to set the weight of the internally computed control-flow distance use the following attribute-name: "' + ocel_clustering.OCEL_COL_NAME_CONTROL_FLOW + '". If no weights are set all weights are set equally.',
    required=False
)
CLI.add_argument(
    "--clustering_mode",
    type=str,
    choices=['kmeans', 'spectral', 'agglomerative'],
    default='kmeans',
    help='The algorithm to use to cluster the data internally. Default value is kmeans.',
    required=False
)
def validate_cluster_count(val: str) -> int:
    """argparse type-callable: map 'auto'/'automatic' to the sentinel -1,
    otherwise parse an integer cluster count that must be at least 2."""
    if val in ('auto', 'automatic'):
        return -1 # sentinel meaning "determine the cluster count automatically"
    count = int(val)
    if count < 2:
        raise argparse.ArgumentTypeError('Parameter needs to be at least 2. Less clusters are not possible. You can also use "auto" or "automatic".')
    return count
CLI.add_argument(
    "--max_cluster_count",
    type=validate_cluster_count,
    default='auto',
    # typo fix: "attetion" -> "attention"
    help='The max count of clusters to create. The software uses the optimum between 2 and the given max_cluster_count. Default is the number of distinct items of the given type (attention!). Default value is auto.',
    required=False
)
CLI.add_argument(
    "--cluster_count",
    type=validate_cluster_count,
    default='auto',
    help='The count of clusters to create. If the value is "auto" or "automatic" the software uses the optimum between 2 and the given max_cluster_count. Default value is auto.'
)
CLI.add_argument(
    "--ocel_file_type",
    type=str,
    choices=['json', 'xml'],
    default='json',
    help='Defines the type of output file. Default value is json.',
    required=False
)
CLI.add_argument(
    "--graph_file_type",
    type=str,
    choices=['svg', 'png', 'none'],
    default='none',
    help='Defines the type of the exported graph file type. If not given or set to none this file is not generated. Default value is none',
    required=False
)
CLI.add_argument(
    "--graph_activity_threshold",
    type=int,
    default=0,
    help='Defines the activity threshold for a generated graph (only if --graph_file_type is not "none"). Default value is 0.',
    required=False
)
CLI.add_argument(
    "--graph_edge_threshold",
    type=int,
    default=0,
    help='Defines the edge threshold for a generated graph (only if --graph_file_type is not "none"). Default value is 0.',
    required=False
)
def validate_directory(dir: str) -> str:
    """argparse type-callable: resolve the target directory to a full path.

    Relative paths are stripped of surrounding '/' and joined onto the
    current working directory (unchanged original behavior).

    Bug fix: the original also stripped the leading '/' from absolute paths
    and prepended the cwd, although the --target_dir help text promises
    support for "a full path". Absolute paths are now kept as-is (only a
    trailing '/' is removed).
    """
    cleaned = dir.strip('/')
    if os.path.isabs(dir):
        # POSIX-style absolute path: restore the leading slash; other absolute
        # forms (e.g. Windows drive paths) are returned unchanged.
        return '/' + cleaned if dir.startswith('/') else dir
    return str(os.getcwd() + '/' + cleaned)
CLI.add_argument(
    "--target_dir",
    type=validate_directory, # resolves to a full path before it reaches args
    default=DEFAULT_TARGET_DIR,
    required=False,
    help='Defines the target directory relative to the current directory or a full path. Default value is: ' + DEFAULT_TARGET_DIR
)
# Parse (and thereby validate) the command line; type-callables run here.
args = CLI.parse_args()
########################################
## DATA-BASED-VALIDATING OF ARGUMENTS ##
########################################
# Check file existence
assert exists(args.ocel_file), 'Given file does not exist.' # message fix: "does not exists"
# Reading file to OCEL-structure and possible object types
ocel = pm4py.read_ocel(args.ocel_file)
ocel_object_types = list(ocel.objects['ocel:type'].unique())
# Validating if given args.object_type is possible
assert args.object_type in ocel_object_types, 'Given type "' + args.object_type + '" is not present in the data. Please use one of: "' + '", "'.join(ocel_object_types) + '".'
# reading args.attr_weights as json (replaces the raw string in args)
try:
    print(args.attr_weights) # NOTE(review): looks like a leftover debug print; kept to preserve output
    args.attr_weights = json.loads(args.attr_weights)
except json.JSONDecodeError:
    raise # plain re-raise instead of `raise error_msg`; further improvement needed
# checking if clustering mode is possible
assert args.clustering_mode in ocel_clustering.OCEL_CLUSTER_ALGORITHMS, 'The given clustering mode "' + args.clustering_mode + '" is not available. Use one of: ' + ', '.join(ocel_clustering.OCEL_CLUSTER_ALGORITHMS) + '.'
# Echo the effective configuration so runs are reproducible from the log.
print('-------------------------------------------------------')
print('                        SETTINGS                       ')
print('ocel_file: ' + str(args.ocel_file))
print('mode: ' + str(args.mode))
print('object_type: ' + str(args.object_type))
print('attr_weights:')
print(args.attr_weights)
print('clustering_mode: ' + str(args.clustering_mode))
print('cluster_count: ' + str(args.cluster_count))
print('max_cluster_count: ' + str(args.max_cluster_count))
print('target_dir: ' + str(args.target_dir))
print('ocel_file_type: ' + str(args.ocel_file_type))
print('graph_file_type: ' + str(args.graph_file_type))
print('graph_activity_threshold: ' + str(args.graph_activity_threshold))
print('graph_edge_threshold: ' + str(args.graph_edge_threshold))
print('-------------------------------------------------------')
#################################
## PREPARING ATTRIBUTE WEIGHTS ##
#################################
# getting default full attribute definition (list of dicts with at least 'name' and 'weight')
attr_def = ocel_clustering.ocel_get_attr_def(ocel)
def set_weights_function(attr_def: dict) -> dict:
    # Override the default weight of one attribute definition when the user
    # supplied a weight for it via --attr_weights.
    # NOTE(review): `res` aliases the input dict, so this mutates it in place.
    res = attr_def
    if res['name'] in args.attr_weights.keys():
        res['weight'] = args.attr_weights[res['name']]
    return res
attr_def = list(map(set_weights_function, attr_def))
# Weight of the control-flow pseudo-attribute; defaults to 1.0 when not supplied.
control_flow_weight = args.attr_weights.get(ocel_clustering.OCEL_COL_NAME_CONTROL_FLOW, 1.0)
##############################
## EXECUTING CORE ALGORITHM ##
##############################
print('calculating...')
start_ts = datetime.datetime.now()
# Core call: splits the OCEL into one sub-OCEL per object cluster.
res = ocel_clustering.ocel_cluster_by_objects(
    ocel=ocel,
    object_type = args.object_type,
    event_assignment_mode = args.mode,
    attr_def = attr_def,
    clustering_algorithm = args.clustering_mode,
    max_cluster_count = args.max_cluster_count,
    cluster_count = args.cluster_count,
    control_flow_active = control_flow_weight != 0.0, # a weight of 0 disables the control-flow distance entirely
    control_flow_weight = control_flow_weight
)
print('duration: ' + str(datetime.datetime.now() - start_ts))
##########################
## STORING OUTPUT FILES ##
##########################
print('-------------------------------------------------------')
print('             STORING CLUSTERED-OCEL-FILES              ')
if not os.path.exists(args.target_dir):
    os.makedirs(args.target_dir)
# Zero-pad the running number so files sort correctly (e.g. 01..12).
appendix_len = len(str(len(res)))
for ii in range(0, len(res)):
    filename = args.target_dir + '/cluster_' + str(ii+1).rjust(appendix_len, '0') + '.' + args.ocel_file_type + 'ocel'
    pm4py.write_ocel(res[ii], filename)
    print(str(ii+1).rjust(appendix_len, '0') + '/' + str(len(res)) + ' "' + filename + '" stored.')
####################################################
## OPTIONAL DISCOVERING OCDFGS AND STORING IMAGES ##
####################################################
if args.graph_file_type != 'none':
    print('-------------------------------------------------------')
    print('            GENERATING AND STORING GRAPHS              ')
    for ii in range(0, len(res)):
        filename = args.target_dir + '/cluster_' + str(ii+1).rjust(appendix_len, '0') + '_ocdfg.' + args.graph_file_type
        # Discover one OC-DFG per clustered sub-log and render it to disk.
        ocdfg = pm4py.discover_ocdfg(res[ii])
        pm4py.save_vis_ocdfg(ocdfg, filename, act_threshold=args.graph_activity_threshold, edge_threshold=args.graph_edge_threshold)
        print(str(ii+1).rjust(appendix_len, '0') + '/' + str(len(res)) + ' "' + filename + '" stored.')
quit()
from bisect import bisect_left
from sklearn import cluster
from sklearn_extra import cluster as cluster_extra
from sklearn.metrics import silhouette_score
#matrix = np.array([[0,0.7,0.9], [0.7,0,0.2], [0.9,0.2,0]])
# the list_of_clusters elements which are returned in a list are ordered by index
def cluster_matrix(matrix, algorithm=None):
    """Fit a clustering algorithm on *matrix* and group row indices by label.

    Parameters:
        matrix: the (distance) matrix to cluster, one row per object.
        algorithm: an sklearn-style clusterer; defaults to
            KMeans(n_clusters=2, random_state=0).

    Returns (list_of_clusters, labels) where list_of_clusters[c] holds the
    row indices assigned to cluster c, ordered by index.

    Bug fix: the original used a single shared estimator instance as default
    argument (evaluated once at import time and re-fit by every call) — the
    classic mutable-default-argument pitfall. A fresh estimator is now created
    per call when none is given.
    """
    if algorithm is None:
        algorithm = cluster.KMeans(n_clusters=2, random_state=0)
    model = algorithm.fit(matrix)
    labels = model.labels_
    # Some algorithms may not expose n_clusters; fall back to counting labels.
    num_clusters = getattr(model, 'n_clusters', None)
    if num_clusters is None:
        num_clusters = len(set(labels))
    list_of_clusters = [[] for _ in range(num_clusters)]
    for i, label in enumerate(labels):
        list_of_clusters[label].append(i)
    return list_of_clusters, labels
# k = number of clusters
# m = len(list_of_clusters[i]) \forall i \in [k]
# n = len(items)
# creating a set: O(n), thus not worth creating it for lookup
# O(k*n*m)
# For all-assigning, only one cluster is possible
def assign_event_to_cluster_all(items: list[int], list_of_clusters):
    """Return the first cluster that contains ALL given items, or False.

    Improvement: the original tested `item in cluster` on a list for every
    item (O(k*n*m)) and never broke out early. Converting the items to a set
    and using issubset gives hashed membership tests and stops at the first
    matching cluster. An empty items list matches the first cluster, exactly
    as before.
    """
    needed = set(items)
    for candidate in list_of_clusters:
        if needed.issubset(candidate):
            return candidate
    return False
# We use the property that the cluster-lists are ordered
# Thus O(k*n*log(m))
# For exist-assigning, we will return all suitable clusters
# time complexity of "cluster not in suitable_clusters" is negligible, because size of suitable_clusters is assumed to be small ( <k )
def assign_event_to_cluster_exists(items: list[int], list_of_clusters):
    """Return every cluster containing at least one of *items*, or False when
    none matches. Relies on each cluster list being sorted (binary search).

    Improvement: break out of the item loop after the first hit — the
    original kept binary-searching the remaining items and needed a linear
    `cluster not in suitable_clusters` scan to avoid duplicate appends.
    Behavior (returned clusters and their order) is unchanged.
    """
    suitable_clusters = []
    for candidate in list_of_clusters:
        for item in items:
            if bin_search(candidate, item):
                suitable_clusters.append(candidate)
                break # one hit suffices; also prevents duplicate appends
    if not suitable_clusters:
        return False
    return suitable_clusters
# O(log(n))
def bin_search(list, x):
    'Locate the leftmost value exactly equal to x'
    # Probe the leftmost insertion point; x is present iff the element at
    # that position equals x.
    pos = bisect_left(list, x)
    return pos < len(list) and list[pos] == x
def determine_optimal_k(X, algorithm, k_max=20):
    """Return the cluster count k in [2, k_max] with the best silhouette score.

    Parameters:
        X: the data/distance matrix to cluster.
        algorithm: an sklearn-style clusterer supporting set_params(n_clusters=...).
        k_max: inclusive upper bound for the candidate cluster counts.

    Fixes: the original iterated range(2, k_max) and thereby silently skipped
    k_max itself, although the CLI documents the bound as inclusive ("the
    optimum between 2 and the given max_cluster_count"); it also built an
    unused `results` list.
    """
    score_max = 0.0
    best_k = 2
    for k in range(2, k_max + 1): # inclusive upper bound
        alg = algorithm.set_params(n_clusters=k)
        alg.fit(X)
        score = silhouette_score(X, alg.labels_, metric='euclidean')
        if score > score_max:
            best_k = k
            score_max = score
    return best_k
# cluster_matrix(matrix, algorithm=cluster_extra.KMedoids(n_clusters=3, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, cluster.KMeans(n_clusters=2, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.SpectralClustering(n_clusters=3, random_state=0))
#list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.AgglomerativeClustering(n_clusters=3))
# print(determine_optimal_k(matrix, cluster.KMeans(random_state=0)))
# assign_event_to_cluster_exists([1,4], list_of_clusters)
\ No newline at end of file
# Supported event-assignment modes for clustering.
MODES = ['all', 'existence']
# Default fill values per detected column-type name
# (keys match the names produced by df_determine_data_types).
# NOTE(review): the 'list' entry is a single shared mutable [] — consumers
# should copy it before assigning per row; verify against df_fill_nans.
DEFAULT_VALUES = {
    'float64': 0.0,
    'str': '',
    'list': [],
    'tuple': (),
    'cf': ''
}
# Column name under which the control-flow string is stored on the object table.
DEFAULT_CF_ATTR_NAME = 'control_flow'
LS_CACHE_ACTIVE = 0 # useless since library function uses cache (it seems like that)
\ No newline at end of file
from tkinter.tix import InputOnly
from typing import Any
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import constants as c
import scipy.spatial.distance as spd
def count_non_nans(data):
    """Count the entries of *data* that are not null per pandas.isnull
    (i.e. not None/NaN/NaT).

    Idiom: replaces the manual counter loop with a generator fed to sum().
    """
    return sum(not pd.isnull(entry) for entry in data)
def check_suffixes(suffixes: tuple[str, str] = ('_x', '_y')) -> bool:
    """Validate a pair of join suffixes: exactly two entries, pairwise distinct.

    Bug fix: the original signature was `def check_suffixes(suffixes=tuple[str, str])`,
    i.e. the typing generic itself was used as the DEFAULT VALUE instead of as
    an annotation — calling the function without an argument crashed on
    len(). The default is now a real, valid suffix pair and the generic has
    moved to its intended place as the annotation.
    """
    if len(suffixes) != 2: raise Exception('length of suffixes need to be exact two.')
    if suffixes[0] == suffixes[1]: raise Exception('suffixes need to be distinct')
    return True
def df_create_cross_df(df: pd.DataFrame, suffixes=('_x', '_y')) -> pd.DataFrame:
    """
    Return the full cross join (cartesian product) of *df* with itself,
    preserving the original index values as a suffixed MultiIndex.
    """
    check_suffixes(suffixes) # reject invalid suffix pairs up front
    left, right = suffixes
    # Index columns of both "sides" together form the new MultiIndex
    # (the incoming frame may already carry a MultiIndex).
    index_cols = [name + left for name in df.index.names] + [name + right for name in df.index.names]
    # Flatten the index into columns so it survives the cross join.
    flat = df.reset_index(inplace=False)
    crossed = flat.join(flat, how='cross', lsuffix=left, rsuffix=right) # 2.5 sec
    crossed.set_index(index_cols, verify_integrity=False, inplace=True) # 11 sec -> 6.2 sec
    return crossed
# returns a dataframe based on a cross join of the given dataframe.
# All attributes are replaced by their row wise distance which is calculated via the given functions in attribute_func_map
def df_pairwise_attr_distance(df: pd.DataFrame, attribute_func_map: dict[str, Any], suffixes=('_x', '_y')) -> pd.DataFrame:
    """Cross-join *df* with itself and replace every attribute by the pairwise
    distance of its two sides, computed via the per-attribute functions in
    *attribute_func_map*.

    Distances are computed only once per pair of UNIQUE values (pdist over the
    unique values, expanded with squareform) and then joined back onto the
    cross product — much cheaper than evaluating the function per row pair.

    Bug fix: the guard for a missing distance function was
    `if type(func) == None:`, which is never true (a type object never equals
    None), so the intended error was unreachable; it is now `func is None`.
    """
    check_suffixes(suffixes)
    # getting relevant attributes (only those with a distance function)
    selected_attributes = list(set(attribute_func_map.keys()).intersection(df.columns))
    # CARTESIAN PRODUCT of all data
    cross = df_create_cross_df(df[selected_attributes], suffixes=suffixes) # ~8-9 seconds
    for attr in selected_attributes:
        # creating attribute names for each 'side'
        new_attr_names = {0: attr+suffixes[0], 1: attr+suffixes[1]}
        # getting unique values
        unique_vals = df[attr].unique()
        # setting function for distance calculation per attribute
        func = attribute_func_map[attr]
        if func is None: # BUG FIX: was `type(func) == None`
            raise Exception('No function defined for attribute "' + attr + '".')
        # reshaping and calculating the distances (its only done one ways by pdist, hence squareform is necessary)
        reshaped_vals = unique_vals.reshape(-1,1)
        # CALCULATING (less than 1s)
        d_matrix = spd.pdist(reshaped_vals, func)
        d_matrix = spd.squareform(d_matrix)
        # creating dataframe (matrix like) of the result setting index and columns accordingly
        res = pd.DataFrame(d_matrix)
        res.index = unique_vals
        res.columns = unique_vals
        # RESHAPING (less than 10 ms)
        res = res.rename_axis(index=new_attr_names[0], columns=new_attr_names[1]).melt(ignore_index=False) # retransfrom from matrix to list
        res.reset_index(inplace=True)
        # SETTING MULTIINDEX (less than 100 ms)
        res.set_index(list(new_attr_names.values()), verify_integrity=False, inplace=True)
        res.rename({'value': attr}, inplace=True, axis=1) # 'value' is the automatic name
        # JOINING RESULTS to cross table (~11-12s)
        cross = cross.join(res, on=list(new_attr_names.values()), how='left') # some kind of mapping may be faster...
        # DELETING COLUMNS (less than 1s)
        del cross[new_attr_names[0]]
        del cross[new_attr_names[1]]
        # FILLING NaNs (less than 1s)
        cross[attr] = cross[attr].fillna(0.0)
    return cross
# Determines column types by first elements that are not NaN
# becomes: float64, string, list, unknown
def df_determine_data_types(df: pd.DataFrame) -> dict[str, str]:
    """Infer a type name per column from its first non-NaN value
    (e.g. 'float64', 'str', 'list', 'tuple').

    Improvements: the original subtracted the keys of a freshly created empty
    dict from the column set (a no-op) and iterated an unordered set, making
    the result order nondeterministic; we now iterate df.columns directly.
    NOTE: a column consisting only of NaNs still raises IndexError, exactly
    as before — callers are expected to drop such columns first.
    """
    types = {}
    for column in df.columns:
        first_non_nan = df[column].loc[~df[column].isnull()].iloc[0]
        types[column] = type(first_non_nan).__name__
    types['control_flow'] = 'cf' # dirty fix!
    # fix that also the dtypes thing gives a string
    return types
# only fills known datatypes in DEFAULT_VALUES
def df_fill_nans(df: pd.DataFrame, type_series: pd.Series, default_values: dict):
    """Fill NaNs in *df* using per-type defaults from *default_values*.

    *type_series* maps column name -> type name; only columns present in BOTH
    df and type_series are touched. List/tuple defaults cannot be passed to
    fillna, so they are assigned via apply.

    Bug fix: the original assigned ONE shared [] object (taken straight from
    the defaults dict) to every NaN row, so mutating a single row's value
    would have changed all of them; a fresh list is now created per row.
    """
    # iterate all columns that are given AND have a defined typing
    for column in set(type_series.keys()).intersection(set(df.columns)):
        default = default_values[type_series[column]]
        if default == []:
            # fresh list per row — avoids aliasing one shared mutable default
            df[column] = df[column].apply(lambda x: x if isinstance(x, list) else [])
        elif default == ():
            df[column] = df[column].apply(lambda x: x if isinstance(x, tuple) else ())
        else:
            df[column] = df[column].fillna(default)
    return df
def df_get_object_table_for_type(ocel: "OCEL", object_type: str):
    """Return the object table restricted to *object_type*.

    Drops the now-constant 'ocel:type' column and every attribute column that
    holds no value at all for this type.

    Object table columns look like: ocel:oid, ocel:type, cost, producer, ...

    Improvement: the all-NaN check previously built unique() and counted
    non-NaNs by hand via count_non_nans; `Series.notna().any()` does the same
    test vectorized.
    """
    all_objects = ocel.objects
    # containing all the objects from the selected type
    filtered_objects = pd.DataFrame(all_objects[all_objects['ocel:type'] == object_type])
    del filtered_objects['ocel:type'] # contains no information due to filtering to one ocel:type in the preceeding code row.
    # removing columns that have only NaN as value
    for col in list(filtered_objects.columns):
        if col != 'ocel:oid' and not filtered_objects[col].notna().any():
            del filtered_objects[col]
    return filtered_objects
def df_get_control_flow_per_object_of_type(ocel: OCEL, object_type: str, activity_letter_map: dict):
    """Return a Series mapping ocel:oid -> tuple of activity letters — the
    object's control flow, ordered by event timestamp.

    activity_letter_map maps full activity names to single characters so the
    control flow can later be compared via string (Levenshtein) distance.
    """
    # Getting all relations
    df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type].copy() # for supressing warning.
    # Replace activity names by their single-letter codes.
    df_relations['ocel:activity'] = df_relations['ocel:activity'].map(activity_letter_map)
    df_relations = df_relations.sort_values(['ocel:oid', 'ocel:timestamp'], axis=0, ascending=True)
    del df_relations['ocel:eid']
    del df_relations['ocel:type']
    del df_relations['ocel:timestamp'] # only possible because sorting already applied!
    # del df_relations['ocel:activity']
    # NOTE(review): 'ocel:activity_short' does not exist at this point, so this
    # rename is a no-op — presumably a leftover from an earlier column layout; verify.
    df_relations.rename({'ocel:activity_short': 'ocel:activity'}, axis=1, inplace=True)
    # control flow per object. sorting by timestamp is very important!
    res = df_relations.groupby('ocel:oid')['ocel:activity'].agg(tuple)
    return res
def df_normalize_columns(df: pd.DataFrame, columns: pd.Index):
    """Scale each listed column of *df* to [0, 1] by dividing by its maximum.

    Columns are assumed non-negative; an all-zero (or non-positive-max)
    column is divided by 1 so it stays unchanged instead of producing NaNs.

    Improvement: the local that shadowed the builtin `max` is renamed.
    """
    for col in columns:
        col_max = df[col].max()
        if col_max <= 0:
            col_max = 1 # possible, as we will have only positive numbers
        df[col] = df[col] / col_max
    return df
def df_weight_columns(df: pd.DataFrame, p_attr_weights: dict):
    """Multiply every df column that has an entry in *p_attr_weights* by its
    weight; columns without a weight entry are left untouched."""
    weighted_columns = set(p_attr_weights.keys()).intersection(df.columns)
    for column in weighted_columns:
        df[column] = df[column] * p_attr_weights[column]
    return df
def map_activities_to_letter(unique_activities):
    """Map each activity name to a single character starting at 'a',
    in the order the activities are given."""
    # chr(ord('a') + i) walks the alphabet exactly like the original
    # successive-increment loop did (continuing past 'z' for >26 activities).
    return {activity: chr(ord('a') + position) for position, activity in enumerate(unique_activities)}
def ocel_get_object_distances_lh(ocel, object_type, weights_per_attribute) -> dict:
    """Compute the pairwise distance matrix between all objects of *object_type*.

    Combines per-attribute distances (numeric difference, string inequality)
    with a Levenshtein distance over each object's control-flow string,
    normalizes every distance column to [0, 1], applies the given weights and
    averages by the weight sum.

    Returns a dict with:
      'distances': square numpy array of averaged pairwise distances,
      'index':     list of ocel:oid values positionally matching the matrix.
    """
    # getting all object-types
    data_object_types = pm4py.ocel_get_object_types(ocel)
    if not object_type in data_object_types: raise Exception('selected object-type-name "' + object_type + '" not present in the data.')
    # getting all distinct acitivity names:
    activity_letter_map = map_activities_to_letter(pm4py.ocel_object_type_activities(ocel)[object_type])
    # getting object-information from ocel
    df_object_data = df_get_object_table_for_type(ocel, object_type)
    df_object_data.sort_values(['ocel:oid'], inplace=True) # algorithmically useless but helpful for comparison with other techniques
    series_cf_per_oid = df_get_control_flow_per_object_of_type(ocel, object_type, activity_letter_map)
    # adding control-flow-information to object data
    df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data['ocel:oid'].map(series_cf_per_oid)
    # objects without events get '' as control flow; activity-letter tuples are joined to plain strings
    df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data[c.DEFAULT_CF_ATTR_NAME].fillna('').map(lambda x: ''.join(x))
    # Checking datatypes and setting 'special' datatype "control-flow"
    object_data_type_map = df_determine_data_types(df_object_data)
    object_data_type_map[c.DEFAULT_CF_ATTR_NAME] = 'cf'
    # filling NaNs of attributes (control_flow already filled)
    df_object_data = df_fill_nans(df_object_data, object_data_type_map, c.DEFAULT_VALUES)
    # Setting index to oce:oid
    df_object_data = df_object_data.set_index('ocel:oid')
    # Creating attribute->distance_function mapping (one distance function per type name)
    type_func_dict = {
        'cf': lambda x,y: lev(x[0],y[0]),
        'str': lambda x,y: 1 if x != y else 0,
        'float64': lambda x,y: abs(x-y)
    }
    attr_func_map = dict.fromkeys(df_object_data.columns)
    for attr in df_object_data.columns: attr_func_map[attr] = type_func_dict[object_data_type_map[attr]]
    # CALCULATING DISTANCES (20-22s)
    df_distance_matrix = df_pairwise_attr_distance(df_object_data, attr_func_map, ('_a', '_b'))
    # NORMALIZING / WEIGHTING / AVERAGING
    df_distance_matrix = df_normalize_columns(df_distance_matrix, list(attr_func_map.keys()))
    df_distance_matrix = df_weight_columns(df_distance_matrix, weights_per_attribute)
    # weighted average: divide the weighted sum by the sum of weights
    # (attributes without an explicit weight count as 1.0)
    divisor = 0.0
    for attribute in list(attr_func_map.keys()):
        divisor += weights_per_attribute.get(attribute, 1.0)
    if divisor == 0.0: divisor = 1.0 # all weights zero: avoid division by zero
    df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].sum(axis=1) / divisor
    # df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1) -> mean over 3 values is not the same as weighting and dividing by the weights sum
    # df_distance_matrix = df_normalize_columns(df_distance_matrix, ['distance_avg']) -> leads to wrong results compared to stanislav -> but maybe useful normalization
    df_distance_matrix = df_distance_matrix.drop(list(attr_func_map.keys()), axis=1) # single distances not relevant anymore
    # Creating the matrix-index->ocel:oid map
    index_to_id_map = list(df_object_data.index)
    index_count = len(index_to_id_map)
    # reshaping results to matrix
    df_matrix = df_distance_matrix['distance_avg'].to_numpy().reshape(index_count, index_count) # TIMING: 0 ms
    return {
        'distances': df_matrix,
        'index': index_to_id_map
    }
# filters the given ocel data to all nodes that contain only the given events (by ocel:eid)
def ocel_filter_by_events(ocel: OCEL, events: np.array) -> OCEL:
    """Build a new OCEL restricted to the given event ids (ocel:eid).

    Relations and events are right-joined on the event ids, keeping exactly
    the requested events; objects are reduced to those still referenced by
    the remaining relations.
    """
    # setting up dataframe for joining
    df_event_ids = pd.DataFrame(events, columns=['ocel:eid'])
    # ids are stored as strings in the OCEL tables, so align the dtype
    df_event_ids['ocel:eid'] = df_event_ids['ocel:eid'].astype(str)
    df_event_ids.set_index('ocel:eid', inplace=True)
    # creating relation data
    res_relations = ocel.relations.join(df_event_ids, on='ocel:eid', how='right') # get all relations for events, no more (right join)
    # creating object data: only (oid, type) pairs still present in the relations,
    # re-enriched with the original object attribute columns
    res_objects = res_relations[['ocel:oid', 'ocel:type']].join(ocel.objects.set_index(['ocel:oid', 'ocel:type']), on=['ocel:oid', 'ocel:type'], how='left')[ocel.objects.columns]
    # creating event data
    res_events = ocel.events.join(df_event_ids, on=['ocel:eid'], how='right')
    # assembling ocel (globals/parameters are carried over unchanged)
    res = OCEL(res_events, res_objects, res_relations, ocel.globals, ocel.parameters)
    return res
#######################################################################
# Calculating the distance matrix based on stanislav's implementation #
# #
# Save unique activies to a list
def get_unique_activities(activities):
activity_list = []
for activity in activitie