Commit 5e250435 authored by Lennart Holzenkamp's avatar Lennart Holzenkamp
Browse files

in mid

parent 94e037c6
python ./code/ ./data/ocel.jsonocel customers existence --attr_weights '{\"bankaccount\": 1.5}' --clustering_mode spectral --cluster_count auto --ocel_file_type xml
......@@ -6,64 +6,108 @@ import pandas as pd
import pm4py
import constants as c
import numpy as np
import functions as f
import ocel_clustering.main as f
import cluster
from sklearn import cluster as skcluster
import json
import argparse
help='The ocel-file to apply the clustering on.',
help='The type of objects from the ocel file which should be clustered. The object type needs to occur in the given ocel file.',
help='The mode to use for event assingments (either "all" or "existence").',
default=[{'producer': 2}],
help='The weights for every attribute of the object instances to use for the internal distance calculation. needs to be a flat attribute->weight json-string.',
choices=['kmeans', 'spectral', 'agglomerative'],
help='The algorithm to use to cluster the data internally. Default is kmeans.',
def validate_cluster_count(val: str) -> int:
if val in ['auto', 'automatic']:
return -1 # symbol for automatic
res = int(val)
if not res >= 2:
raise argparse.ArgumentTypeError('Parameter needs to be at least 2. Less clusters are not possible. You can also use "auto" or "automatic".')
return res
help='The max count of clusters to create. The software uses the optimum between 2 and the given cluster_count. Default is the floor(natural log(object-count))+1.',
choices=['json', 'xml'],
help='Defines the type of output file.',
default=['svg'], # default if nothing is provided
choices=['svg', 'png', 'none'],
help='Defines the type of the exported graph file type. If not given or set to none this file is not generated.',
args = CLI.parse_args()
p_ocel_file = args.p_ocel_file # "./data/ocel.jsonocel"
p_mode = args.p_mode # 'existence' # all | existence
p_object_type = args.p_object_type # 'products' # object-type-name, options depend on the data
p_attr_weights = args.p_attr_weights # {'producer': 2} # attributes that are not given in the data are not used
p_clustering_mode = args.p_clustering_mode # 'kmeans' # optional, default: kmeans
p_cluster_count = args.p_cluster_count # 3 # np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
p_ocel_file_type = args.p_ocel_file_type # 'json' # json|xml
p_graph_file_type = args.p_graph_file_type # 'svg' # svg|png
p_ocel_file = args.ocel_file # "./data/ocel.jsonocel"
assert exists(p_ocel_file), 'Given file does not exists.'
p_mode = args.mode # 'existence' # all | existence
ocel = pm4py.read_ocel(p_ocel_file)
assert args.object_type in list(ocel.objects['ocel:type'].unique()), 'Given type is not present in the data.'
p_object_type = args.object_type # 'products' # object-type-name, options depend on the data
tmp_weights = json.loads(args.attr_weights)
except json.JSONDecodeError as error_msg:
raise error_msg # further improvement needed
p_attr_weights = tmp_weights # {'producer': 2} # attributes that are not given in the data are not used
# non-data-based assertions
algorithms = {
'kmeans': skcluster.KMeans(),
'spectral': skcluster.SpectralClustering(),
'agglomerative': skcluster.AgglomerativeClustering()
assert args.clustering_mode in algorithms.keys(), 'The given clustering mode "' + args.clustering_mode + '" is not availabel. Use on of: ' + ', '.join(algorithms.keys()) + '.'
p_clustering_mode = args.clustering_mode # 'kmeans' # optional, default: kmeans
p_cluster_count = args.cluster_count # 3 # np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
p_cluster_count_set = p_cluster_count >= 2
p_ocel_file_type = args.ocel_file_type # 'json' # json|xml
p_graph_file_type = args.graph_file_type # 'svg' # svg|png
......@@ -81,21 +125,35 @@ print('p_ocel_file_type: "' + str(p_ocel_file_type) + '".')
print('p_graph_file_type: "' + str(p_graph_file_type) + '".')
start_ts =
attr_def = f.ocel_get_attr_def(ocel, p_object_type)
clusters = f.ocel_cluster_by_objects(
control_flow_weight=p_attr_weights.get(f.OCEL_CONTROL_FLOW_COLUMN, 1.0))
# non-data-based assertions
assert p_mode in c.MODES, 'selected mode not possible. Use either ''all'' or ''existence'''
assert exists(p_ocel_file), 'file does not exists'
algorithms = {
'kmeans': skcluster.KMeans(),
'spectral': skcluster.SpectralClustering(),
'agglomerative': skcluster.AgglomerativeClustering()
assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + p_clustering_mode + '" is not availabel. Use on of: ' + ', '.join(algorithms.keys()) + '.'
ocel = pm4py.read_ocel(p_ocel_file)
res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
start_ts =
print('raw data:')
print(ocel.objects[ocel.objects['ocel:type'] == p_object_type])
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
# res_lh = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
# print(res['distances']-res_lh['distances'])
......@@ -119,12 +177,10 @@ index_to_oid_map = res['index']
algo = algorithms[p_clustering_mode]
if p_cluster_count_set:
cluster_count = int(p_cluster_count)
assert cluster_count >= 2, 'cluster_count needs to be at least 2'
assert cluster_count < len(index_to_oid_map), 'cluster_count needs to be less than the count of distinct objects in the ocel-data.'
cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=math.floor(len(index_to_oid_map) / 2)+1)
cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max= math.ceil(math.log2(len(index_to_oid_map)))+1)
cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
from collections import namedtuple
import enum
from tkinter.tix import InputOnly
from typing import Any, Callable, Iterable
from matplotlib.pyplot import axis
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import constants as c
import scipy.spatial.distance as spd
def count_non_nans(data):
    """Return the number of entries in *data* that are not NaN/None.

    Args:
        data: any iterable of values (list, numpy array, ...).

    Returns:
        int: count of entries for which ``pd.isnull`` is False.
    """
    # sum() over a generator replaces the original manual counter loop
    return sum(not pd.isnull(entry) for entry in data)
def check_suffixes(suffixes: tuple[str, str]) -> bool:
    """Validate a pair of join suffixes.

    BUGFIX: the original signature was ``suffixes=tuple[str, str]``, which
    assigned the *type expression* as the default value instead of annotating
    the parameter. All callers pass the argument explicitly, so it is now a
    proper (required) annotated parameter.

    Args:
        suffixes: exactly two distinct suffix strings, e.g. ('_x', '_y').

    Returns:
        True when the suffixes are valid.

    Raises:
        Exception: if not exactly two suffixes are given or they are equal.
    """
    if len(suffixes) != 2:
        raise Exception('length of suffixes need to be exact two.')
    if suffixes[0] == suffixes[1]:
        raise Exception('suffixes need to be distinct')
    return True
def df_create_cross_df(df: pd.DataFrame, suffixes=('_x', '_y')) -> pd.DataFrame:
    """Build the full cross join (cartesian product) of *df* with itself.

    The original index is preserved: it is moved into regular columns before
    the join and re-installed afterwards as a MultiIndex whose level names
    carry the given suffixes. (The summary line was an unquoted, syntactically
    broken docstring in the original — fixed here.)

    Args:
        df: input frame; may already use a (named) MultiIndex.
        suffixes: pair of distinct suffixes for the left/right join sides.

    Returns:
        The cross-joined frame indexed by the suffixed original index names.
    """
    # Validate suffixes inline (same rules as check_suffixes()).
    if len(suffixes) != 2:
        raise Exception('length of suffixes need to be exact two.')
    if suffixes[0] == suffixes[1]:
        raise Exception('suffixes need to be distinct')
    # New index columns: each original index level name, once per side.
    new_index_names = [name + suffixes[0] for name in df.index.names] \
        + [name + suffixes[1] for name in df.index.names]
    # Reset the index so it survives the cross join as ordinary columns.
    pre = df.reset_index(inplace=False)
    cross = pre.join(pre, how='cross', lsuffix=suffixes[0], rsuffix=suffixes[1])
    cross.set_index(new_index_names, verify_integrity=False, inplace=True)
    return cross
# returns a dataframe based on a cross join of the given dataframe.
# All attributes are replaced by their row wise distance which is calculated via the given functions in attribute_func_map
def df_pairwise_attr_distance(df: pd.DataFrame, attribute_func_map: dict[str, Any], suffixes=('_x', '_y')) -> pd.DataFrame:
    """Cross-join *df* with itself and replace each mapped attribute column by
    the pairwise distance between its two sides.

    Args:
        df: input frame, one row per entity.
        attribute_func_map: attribute name -> binary distance function.
        suffixes: pair of distinct suffixes for the two join sides.

    Returns:
        The cross-joined frame where every selected attribute column holds the
        distance between left and right value (missing pairs become 0.0).
    """
    # Only attributes present both in the map and in the frame are processed.
    selected_attributes = list(set(attribute_func_map.keys()).intersection(df.columns))
    cross = df_create_cross_df(df[selected_attributes], suffixes=suffixes)  # ~8-9 seconds
    for attr in selected_attributes:
        # creating attribute names for each 'side'
        new_attr_names = {0: attr + suffixes[0], 1: attr + suffixes[1]}
        # getting unique values
        unique_vals = df[attr].unique()
        # setting function for distance calculation per attribute
        func = attribute_func_map[attr]
        # BUGFIX: the original tested `type(func) == None`, which is never true
        # (type() never returns None), so a missing function slipped through
        # and crashed later inside pdist with a less helpful error.
        if func is None:
            raise Exception('No function defined for attribute "' + attr + '".')
        # reshaping and calculating the distances (pdist only computes one
        # triangle, hence squareform is necessary)
        reshaped_vals = unique_vals.reshape(-1, 1)
        d_matrix = spd.pdist(reshaped_vals, func)
        d_matrix = spd.squareform(d_matrix)
        # matrix-like dataframe of the result, indexed/columned by the values
        res = pd.DataFrame(d_matrix)
        res.index = unique_vals
        res.columns = unique_vals
        # re-transform from matrix to long (list) form
        res = res.rename_axis(index=new_attr_names[0], columns=new_attr_names[1]).melt(ignore_index=False)
        # build a (value_x, value_y) MultiIndex for the lookup join below
        res.set_index(list(new_attr_names.values()), verify_integrity=False, inplace=True)
        res.rename({'value': attr}, inplace=True, axis=1)  # 'value' is melt's automatic name
        # join the distances onto the cross table via the two value columns
        cross = cross.join(res, on=list(new_attr_names.values()), how='left')
        # drop the per-side raw value columns, keep only the distance
        del cross[new_attr_names[0]]
        del cross[new_attr_names[1]]
        # pairs involving missing values get distance 0.0
        cross[attr] = cross[attr].fillna(0.0)
    return cross
def df_determine_data_types(df: pd.DataFrame) -> dict[str, str]:
    """Determine a type name per column from the first non-NaN element.

    Typical results are 'float64', 'str', 'list', 'tuple', or — for columns
    that contain nothing but NaN — 'unknown'.

    ROBUSTNESS: the original did ``.iloc[0]`` unconditionally and raised
    IndexError on an all-NaN column, although the header comment already
    promised an 'unknown' result for that case.

    Args:
        df: frame whose columns are to be classified.

    Returns:
        dict: column name -> type name; additionally always contains the key
        'control_flow' mapped to 'cf' (kept from the original "dirty fix").
    """
    types: dict[str, str] = {}
    for column in df.columns:
        non_nan = df[column].loc[~df[column].isnull()]
        if len(non_nan) == 0:
            # all-NaN column: no element to inspect
            types[column] = 'unknown'
        else:
            types[column] = type(non_nan.iloc[0]).__name__
    types['control_flow'] = 'cf'  # dirty fix! (kept from original)
    return types
# only fills known datatypes in DEFAULT_VALUES
def df_fill_nans(df: pd.DataFrame, type_series: pd.Series, default_values: dict):
    """Fill NaNs in *df*, column by column, with a per-type default value.

    Only columns whose type name (from *type_series*) has an entry in
    *default_values* are touched.

    BUGFIXES vs. original:
    * unknown type names raised KeyError instead of being skipped, although
      the header comment promises "only fills known datatypes";
    * ``fillna`` was also called with list/tuple defaults, which raises
      TypeError in pandas — the element-wise ``apply`` already handles NaN
      replacement for those, so ``fillna`` is now scalar-only.

    Args:
        df: frame to fill (modified in place).
        type_series: column name -> type name.
        default_values: type name -> default value.

    Returns:
        The (mutated) frame, for call chaining.
    """
    # iterate all columns that are given AND have a defined typing
    for column in set(type_series.keys()).intersection(set(df.columns)):
        col_type = type_series[column]
        if col_type not in default_values:
            continue  # per contract: unknown datatypes are left untouched
        default = default_values[col_type]
        if default == []:
            # special case lists: replace every non-list cell (incl. NaN)
            df[column] = df[column].apply(lambda x: x if isinstance(x, list) else default)
        elif default == ():
            # special case tuples: same element-wise treatment
            df[column] = df[column].apply(lambda x: x if isinstance(x, tuple) else default)
        else:
            # scalar defaults can use the fast vectorized fillna
            df[column] = df[column].fillna(default)
    return df
def df_get_object_table_for_type(ocel: "OCEL", object_type: str):
    """Return ``ocel.objects`` restricted to one object type.

    The (now constant) 'ocel:type' column is dropped, and so is every
    attribute column that holds nothing but NaN for the selected objects.

    BUGFIX: the original iterated the live column Index while deleting from
    it, which can silently skip the column right after a deleted one (e.g.
    two consecutive all-NaN columns); we now iterate a snapshot. The all-NaN
    check is expressed directly as ``isnull().all()``, which is equivalent to
    the original's ``count_non_nans(unique()) == 0``.

    Args:
        ocel: event log; only its ``objects`` table is read.
        object_type: value of 'ocel:type' to filter on.

    Returns:
        pd.DataFrame: the filtered object table.
    """
    all_objects = ocel.objects
    # containing all the objects from the selected type
    filtered_objects = pd.DataFrame(all_objects[all_objects['ocel:type'] == object_type])
    del filtered_objects['ocel:type']  # constant after filtering, no information
    # removing attribute columns that have only NaN as value
    for col in list(filtered_objects.columns):
        if col not in ['ocel:oid', 'ocel:type']:
            if filtered_objects[col].isnull().all():
                del filtered_objects[col]
    return filtered_objects
def df_get_control_flow_per_object_of_type(ocel: "OCEL", object_type: str, activity_letter_map: dict):
    """Compute the control flow of every object of *object_type*.

    Args:
        ocel: event log; only its ``relations`` table is read.
        object_type: value of 'ocel:type' to filter the relations on.
        activity_letter_map: activity name -> short letter replacement.

    Returns:
        pd.Series: object id -> tuple of (letter-mapped) activities, ordered
        by timestamp.
    """
    # copy() suppresses SettingWithCopy warnings on the assignments below
    df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type].copy()
    df_relations['ocel:activity'] = df_relations['ocel:activity'].map(activity_letter_map)
    # sorting by (oid, timestamp) is essential for the control-flow order
    df_relations = df_relations.sort_values(['ocel:oid', 'ocel:timestamp'], axis=0, ascending=True)
    del df_relations['ocel:eid']
    del df_relations['ocel:type']
    del df_relations['ocel:timestamp']  # only safe because sorting is already applied
    # NOTE: a dead rename of the nonexistent 'ocel:activity_short' column was
    # removed from the original — it was a guaranteed no-op.
    res = df_relations.groupby('ocel:oid')['ocel:activity'].agg(tuple)
    return res
def df_normalize_columns(df: pd.DataFrame, columns: pd.Index):
    """Scale each listed column of *df* by dividing it by its column maximum.

    Columns whose maximum is <= 0 are divided by 1 instead, avoiding division
    by zero (values are assumed to be non-negative).

    Args:
        df: frame to normalize (modified in place).
        columns: names of the columns to scale.

    Returns:
        The (mutated) frame, for call chaining.
    """
    for col in columns:
        col_max = df[col].max()  # renamed: the original shadowed builtin `max`
        if col_max <= 0:
            col_max = 1  # possible, as we will have only positive numbers
        df[col] = df[col] / col_max
    return df
# filters the given ocel data to all nodes that contain only the given events (by ocel:eid)
def ocel_filter_by_events(ocel: OCEL, events: np.array) -> OCEL:
    """Restrict *ocel* to the given event ids and the objects/relations they touch.

    NOTE(review): the ``res_events`` line below is syntactically incomplete in
    this revision (the left-hand side of the join expression is missing —
    presumably something like ``ocel.events.join(df_event_ids, ...)``). Confirm
    against the repository history before relying on this function.
    """
    # setting up dataframe for joining
    df_event_ids = pd.DataFrame(events, columns=['ocel:eid'])
    df_event_ids['ocel:eid'] = df_event_ids['ocel:eid'].astype(str)
    df_event_ids.set_index('ocel:eid', inplace=True)
    # creating relation data
    res_relations = ocel.relations.join(df_event_ids, on='ocel:eid', how='right') # get all relations for events, no more (right join)
    # creating object data
    res_objects = res_relations[['ocel:oid', 'ocel:type']].join(ocel.objects.set_index(['ocel:oid', 'ocel:type']), on=['ocel:oid', 'ocel:type'], how='left')[ocel.objects.columns]
    # creating event data
    # NOTE(review): corrupted/truncated line — see docstring.
    res_events =, on=['ocel:eid'], how='right')
    # assembling ocel
    res = OCEL(res_events, res_objects, res_relations, ocel.globals, ocel.parameters)
    return res
class ClusteringAlgorithm(enum.Enum):
# default name of columns that contain the control-flow
'object': '', # assuming string
'float64': np.float64(0.0)
'object': lambda x,y: x!=y,
'float64': lambda x,y: abs(x-y)
def series_distance_matrix(series: pd.Series, compare_func: Callable[[Any, Any], float], default_value: Any, normalize: bool = True) -> np.matrix:
    """Pairwise distance matrix for a Series.

    NOTE(review): stub — always returns an all-zero square matrix; the
    comparison/normalization logic is not implemented in this revision.
    """
    return np.zeros((series.size, series.size))
def df_distance_matrix(df: pd.DataFrame, attr_def: list[dict[str, Any]]) -> np.matrix:
    """Weighted distance matrix over the rows of a frame.

    NOTE(review): stub — returns zeros. Also note ``df.size`` is rows*columns,
    so the matrix shape looks wrong for a row-pairwise matrix once implemented
    (``len(df)`` was probably meant) — confirm before implementing.
    """
    return np.zeros((df.size, df.size))
def df_cluster_by_distance(df: pd.DataFrame, attr_def: list[dict[str, Any]]) -> list[pd.DataFrame]:
    """Cluster the rows of a frame by attribute-weighted distances.

    NOTE(review): stub — always returns an empty list; no clustering is
    performed in this revision.
    """
    return list()
def df_determine_data_types(df: pd.DataFrame) -> pd.Series:
    """Return the pandas dtypes of *df*.

    NOTE(review): this redefines the earlier ``df_determine_data_types`` (which
    returns a dict of type-name strings) with a different return type — in a
    single module the later definition silently shadows the earlier one.
    Rename one of the two before use.
    """
    return df.dtypes
# Considers the data. If a column is completely NaN it returns NoneType for that
# column. For strings the compare function is 1 for inequal and 0 for equal;
# for numbers it is the absolute distance.
def ocel_get_attr_def(ocel: OCEL, object_type: str) -> list[dict[str, Any]]:
    """Derive a default attribute definition (name/weight/default/compare_func)
    for every handleable column of ``ocel.objects``.

    NOTE(review): in this revision the statement that appends the per-attribute
    dict to ``res`` (presumably ``res.append({...})``) is missing, leaving an
    orphaned dict literal in the loop body — restore it before use.
    """
    res = []
    # getting all types
    types = ocel.objects.dtypes
    # filtering to types which we can handle
    types = types[ x: x in list(DEFAULT_VALUES.keys()))]
    # removing id column, because not used for clustering or anything
    types = types.drop(labels=[OCEL_OID_COLUMN_NAME, OCEL_OBJECT_TYPE_COLUMN_NAME], errors='ignore').to_dict()
    for label in types.keys():
        col_type = str(types[label])
        # NOTE(review): orphaned dict literal — likely `res.append({...})` originally.
        'name': label,
        'weight': 1,
        'default': DEFAULT_VALUES[col_type],
        'compare_func': DEFAULT_COMPARISONS[col_type]
    return res
# you may add information to ocel.objects table for using it in the attr_definition. control-flow is used automatically
def ocel_cluster_by_objects(
    ocel: OCEL,
    object_type: str,
    attr_def: list[dict[str, Any]],
    clustering_algo: ClusteringAlgorithm = ClusteringAlgorithm.KMEANS,
    max_cluster_count: int = -1, # -1 means auto detect (up to itemcount - 1). potential very high runtime complexity
    control_flow_active: bool = True, # if true the control flow for every object is determined and used for the distance calculation
    control_flow_weight: float = 1.0, # defines the weight of the control flow
    control_flow_compare_func: Callable[[Any, Any], float] = lev, # default is levenshtein distance.
    control_flow_as_letters: bool = True # defines that the control-flow is translated to unique letters. this is faster to process internally. if a custom compare function is used, this parameter may be set to false. then two lists are given to the compare function
) -> list[OCEL]:
    """Cluster the objects of *object_type* and split *ocel* accordingly.

    NOTE(review): the implementation is an unfinished fragment in this
    revision — it validates the object type, prepares the object data, then
    always returns an empty list.
    """
    if object_type not in ocel.objects[OCEL_OBJECT_TYPE_COLUMN_NAME].unique():
        raise Exception('Error in ocel_cluster_by_objects: object_type "' + object_type + '" not present in the data.')
    object_data = ocel.objects[ocel.objects[OCEL_OBJECT_TYPE_COLUMN_NAME] == object_type].drop(OCEL_OBJECT_TYPE_COLUMN_NAME, axis=1)
    object_data = object_data.set_index(OCEL_OID_COLUMN_NAME).dropna(how='all', axis=1)
    relevant_attributes = [attr['name'] for attr in attr_def if attr['weight'] != 0 and attr['name'] in object_data.columns] # filter to attributes that have a non zero weight and are in the data and not NaN
    if control_flow_active:
        control_flows = ocel_get_control_flow_per_object(ocel=ocel, object_types=[object_type])
    # contains all the data relevant for the object.
    # NOTE(review): this overwrites the object_data prepared above and repeats
    # the control-flow computation — looks unintended; confirm against history.
    object_data = ocel_get_control_flow_per_object(ocel, [object_type])
    return list()
def get_to_char_map(iterable: Iterable) -> dict[str, str]:
    """Map each distinct element of *iterable* to a unique character.

    Characters are assigned in first-seen order starting at 'a'. Note that
    after 26 distinct elements the mapping continues past 'z' into the
    following Unicode code points ('{', '|', ...) — still unique, just not
    letters. Complexity is linear in the length of the iterable.

    Args:
        iterable: any iterable of hashable elements.

    Returns:
        dict: element -> unique single character.
    """
    mapping = {}
    next_code = ord('a')
    for element in iterable:
        if element not in mapping:  # idiom fix: `not x in` -> `x not in`
            mapping[element] = chr(next_code)
            next_code += 1
    return mapping
# Returns a data frame which is the same as ocel.objects. Additionally it is
# meant to contain a column named as specified in control_flow_col_name,
# holding the event types related to the object, ordered by timestamp (natural
# order breaks ties).
def ocel_get_control_flow_per_object(
    ocel: OCEL,
    object_types: list = None,
    control_flow_col_name = OCEL_CONTROL_FLOW_COLUMN
) -> pd.DataFrame:
    """Attach per-object control flows to the object table.

    NOTE(review): this function is an unfinished fragment in this revision —
    the nested ``test_func`` below is dead leftover code (it references
    ``object_type`` and ``activity_letter_map``, which are not defined in this
    scope) and the function currently returns an empty DataFrame.
    """
    relations = ocel.relations
    if object_types != None:
        # type checking here if object_types is really a 'list'?
        relations = relations[relations[OCEL_OBJECT_TYPE_COLUMN_NAME].isin(object_types)]
    # NOTE(review): sort_values() is not in-place and its result is discarded
    # here — presumably a bug; confirm intent.
    relations.sort_values([OCEL_OBJECT_TYPE_COLUMN_NAME, OCEL_TIMESTAMP_COLUMN_NAME], axis=0, ascending=True)
    # NOTE(review): dead/leftover code duplicated from
    # df_get_control_flow_per_object_of_type — never called.
    def test_func(x):
        df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type].copy() # for supressing warning.
        df_relations['ocel:activity'] = df_relations['ocel:activity'].map(activity_letter_map)
        df_relations = df_relations.sort_values(['ocel:oid', 'ocel:timestamp'], axis=0, ascending=True)
        del df_relations['ocel:eid']
        del df_relations['ocel:type']
        del df_relations['ocel:timestamp'] # only possible because sorting already applied!
        # del df_relations['ocel:activity']
        df_relations.rename({'ocel:activity_short': 'ocel:activity'}, axis=1, inplace=True)
        # control flow per object. sorting by timestamp is very important!
        res = df_relations.groupby('ocel:oid')['ocel:activity'].agg(tuple)
    return pd.DataFrame()
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment