Commit 68e6af5c authored by Lennart Holzenkamp

beautify and removing one normalization step in lh's solution

parent d9cbac6c
import datetime
from functools import cache
from typing import Any
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import constants as c
import scipy.spatial.distance as spd
@@ -17,36 +15,6 @@ def count_non_nans(data):
count += 1
return count
# possibly redundant: lev seems to implement caching on its own (unverified)
@cache
def lev_cached(s: str, t: str):
    return lev(s, t)
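# Hedged micro-benchmark sketch (never called anywhere): checks whether the @cache
# wrapper above actually pays off for repeated string pairs. The example strings and
# repeat counts are made up; only timeit plus the two functions above are used.
def _benchmark_lev_cache(repeats: int = 10_000) -> None:
    import timeit
    pairs = [('abcdefgh', 'abxdefgh')] * repeats
    t_plain = timeit.timeit(lambda: [lev(a, b) for a, b in pairs], number=10)
    t_cached = timeit.timeit(lambda: [lev_cached(a, b) for a, b in pairs], number=10)
    print(f'plain lev: {t_plain:.3f}s, cached lev: {t_cached:.3f}s')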
def print_w_ts(msg):
    # prints a separator line, the current timestamp and the given message
    print('-------------')
    print(datetime.datetime.now())
    print(msg)
# returns a DataFrame indexed by the (min, max) columns of all unordered key combinations
# measured runtime: ~10.37 seconds for count=859
def generate_ordered_combination_df(count: int):
keys = [x for x in range(count)]
df = pd.DataFrame([], index=pd.MultiIndex.from_product([keys,keys], names=['left', 'right']))
df.reset_index(inplace=True)
# removing rows where left == right
df = df[df['left'] != df['right']]
# calculating min
tmp_min = df.min(axis=1)
# calculating max
tmp_max = df.max(axis=1)
df['min'] = tmp_min
df['max'] = tmp_max
del df['left']
del df['right']
df.drop_duplicates(inplace=True) # ~6.68 sec; count=8159
df.set_index(['min', 'max'], inplace=True) # ~1.85 sec; count=8159
return df
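# Hedged alternative sketch (not wired in): the same unordered index pairs can be
# generated directly with itertools.combinations instead of building the full
# cartesian product and dropping duplicates. Assumes count >= 2; equivalence with
# the function above is assumed, not verified here.
def generate_ordered_combination_df_alt(count: int) -> pd.DataFrame:
    from itertools import combinations
    pairs = list(combinations(range(count), 2))  # yields (min, max) with min < max, no duplicates
    return pd.DataFrame(index=pd.MultiIndex.from_tuples(pairs, names=['min', 'max']))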
def check_suffixes(suffixes: tuple[str, str]):
    if len(suffixes) != 2: raise Exception('length of suffixes needs to be exactly two.')
    if suffixes[0] == suffixes[1]: raise Exception('suffixes need to be distinct.')
@@ -199,6 +167,7 @@ def ocel_get_object_distances_lh(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
activity_letter_map = map_activities_to_letter(pm4py.ocel_object_type_activities(ocel)[object_type])
# getting object-information from ocel
df_object_data = df_get_object_table_for_type(ocel, object_type)
df_object_data.sort_values(['ocel:oid'], inplace=True) # not needed algorithmically, but makes comparison with other techniques easier
series_cf_per_oid = df_get_control_flow_per_object_of_type(ocel, object_type, activity_letter_map)
# adding control-flow-information to object data
df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data['ocel:oid'].map(series_cf_per_oid)
@@ -222,12 +191,12 @@ def ocel_get_object_distances_lh(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
# CALCULATING DISTANCES (20-22s)
df_distance_matrix = df_pairwise_attr_distance(df_object_data, attr_func_map, ('_a', '_b'))
# NORMALIZING / WEIGHTING / AVERAGING
df_distance_matrix = df_normalize_columns(df_distance_matrix, list(attr_func_map.keys()))
df_distance_matrix = df_weight_columns(df_distance_matrix, weights_per_attribute)
df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1)
# df_distance_matrix = df_normalize_columns(df_distance_matrix, ['distance_avg']) # disabled: normalizing the averaged distance again leads to results that differ from Stanislav's implementation
df_distance_matrix = df_distance_matrix.drop(list(attr_func_map.keys()), axis=1) # single distances not relevant anymore
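# Hedged sketch, not used by the pipeline above: an assumed implementation of what
# df_normalize_columns plausibly does (per-column min-max scaling to [0, 1]). The
# real helper is defined elsewhere in this module and may differ.
def _minmax_normalize_sketch(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    out = df.copy()
    for col in cols:
        col_min, col_max = out[col].min(), out[col].max()
        col_range = col_max - col_min
        out[col] = 0.0 if col_range == 0 else (out[col] - col_min) / col_range
    return out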
# Creating the matrix-index->ocel:oid map
@@ -404,9 +373,6 @@ def calc_distance(df, weight_index, n, weights):
return matrix * weights[weight_index]
def ocel_get_object_distances_sy(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
##### CALCULATING DISTANCE MATRIX #####
@@ -456,3 +422,7 @@ def ocel_get_object_distances_sy(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
'distances': res,
'index': list_oid
}
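# Hedged usage sketch: the returned square distance matrix can be converted to the
# condensed form used by SciPy-based tooling via scipy.spatial.distance.squareform,
# assuming the matrix is symmetric with a zero diagonal (not verified here).
def _to_condensed_sketch(result: dict) -> np.ndarray:
    return spd.squareform(np.asarray(result['distances']), checks=False)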
# #
# End of stanislav's implementation #
#################################################################
\ No newline at end of file
@@ -10,56 +10,32 @@ import functions as f
import cluster
from sklearn import cluster as skcluster
#
# TEST-SETTINGS
# settings for rapid testing during development
sys_test_mode = 0 # 1: use a share of the data, 0: use the original data
test_factor = 0.1 # [0,1]: share of data to use
# END TEST-SETTINGS
#
#
# PARAMETERS
p_ocel_file = './data/ocel.jsonocel'
p_mode = 'existence' # all | existence
p_object_type = 'customers' # object-type-name, options depend on the data
# example data for attribute weighting in distance measurements
p_attr_weights = {
} # attributes that are not given in the data are not used
p_clustering_mode = 'kmeans' # optional, default: kmeans
p_cluster_count = np.NaN # cluster-count (optional, default: np.NaN, which leads to automatic k determination)
p_ocel_file_type = 'json' # json|xml
p_graph_file_type = 'svg' # svg|png
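# Hypothetical example of the expected p_attr_weights structure (attribute names are
# made up and depend on the loaded OCEL; values act as relative weights per attribute):
# p_attr_weights = {'price': 2.0, 'producer': 1.0}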
# END PARAMETERS
#
print('Program started...')
print('-------------------------------------------------------')
print(' SETTINGS ')
print('p_ocel_file: "' + str(p_ocel_file) + '".')
print('p_mode: "' + str(p_mode) + '".')
print('p_object_type: "' + str(p_object_type) + '".')
print('p_attr_weights:')
print(p_attr_weights)
print('p_clustering_mode: "' + str(p_clustering_mode) + '".')
print('p_cluster_count: "' + str(p_cluster_count) + '".')
print('p_ocel_file_type: "' + str(p_ocel_file_type) + '".')
print('p_graph_file_type: "' + str(p_graph_file_type) + '".')
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
@@ -76,7 +52,7 @@ assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + p
ocel = pm4py.read_ocel(p_ocel_file)
# res_lh = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
duration = datetime.datetime.now() - start_ts
@@ -91,7 +67,6 @@ print(res['index'][-10:])
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
distance_matrix = res['distances']
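# Hedged sketch (commented out): one way to cluster a precomputed distance matrix with
# scikit-learn, assuming distance_matrix is a symmetric square array. The script's actual
# clustering call lives further down; n_clusters=3 is made up for illustration.
# Note: scikit-learn >= 1.2 uses `metric='precomputed'`, older releases use `affinity`.
# example_labels = skcluster.AgglomerativeClustering(
#     n_clusters=3, metric='precomputed', linkage='average'
# ).fit_predict(np.asarray(distance_matrix))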
@@ -128,7 +103,6 @@ print('duration: ' + str(duration))
print('-------------------------------------------------------')
start_ts = datetime.datetime.now()
df_object_data = ocel.relations[ocel.relations['ocel:type'] == p_object_type].set_index('ocel:oid')[['ocel:eid']]
# get relation data for objects of the selected type, set the index to ocel:oid for joining, and keep only the ocel:eid column
@@ -200,9 +174,7 @@ print(res)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
# storing ocels
start_ts = datetime.datetime.now()
print('-------------------------------------------------------')
print(' STORING CLUSTERED-OCEL-FILES ')
directory = os.path.dirname(p_ocel_file) + '/clustered_ocel_files'
@@ -213,7 +185,10 @@ for ii in range(0, len(res)):
filename = directory + '/cluster_' + str(ii+1).rjust(appendix_len, '0') + '.' + p_ocel_file_type + 'ocel'
pm4py.write_ocel(ocel, filename)
print(str(ii+1).rjust(appendix_len) + '/' + str(cluster_count) + ': "' + filename + '" stored.')
duration = datetime.datetime.now() - start_ts
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
quit()