Commit 024b3bbd authored by Lennart Holzenkamp

performance improvements and removing prints and warnings

parent caa54533
import datetime
from functools import cache
import itertools
import math
from typing import Any
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import sklearn
import constants as c
import scipy.spatial.distance as spd
from pandarallel import pandarallel
def count_non_nans(data):
count = 0
@@ -26,94 +35,91 @@ def print_w_ts(c):
print(datetime.datetime.now())
print(c)
# Uses the given index of the dataframe and duplicates it (key -> key_a, key_b), resulting in a full cross join; every given attribute is replaced by its pairwise distance.
def df_pairwise_attr_distance(df: pd.DataFrame, attributes: list, ls_use_cache = 1):
types = df_determine_data_types(df)
attributes = list(set(attributes).intersection(set(df.columns))) # attributes should be 'real' attributes
# If index has no name, set it to key but remember that it should be changed back
remove_index_name = False
if df.index.name is None:
df.index.name = 'key'
remove_index_name = True
index_name = df.index.name
suff_a = '_a'
suff_b = '_b'
index_name_a = df.index.name + suff_a
index_name_b = df.index.name + suff_b
# keep non_attr_fields for later joining
non_attr_fields = list(set(df.columns) - set(attributes))
# has the same keys as the original df; this can possibly be an empty data set, but it will have the keys!
df_attr_no_distance = df[non_attr_fields]
# building a table with two columns containing all index combinations, each unordered pair only once!
just_index = df[[]].reset_index()
all_ids = set(just_index[index_name])
print_w_ts('creating dataframe with distinct id combinations...')
# the following takes a short while and increases memory usage a lot
distinct_cross = pd.DataFrame(list(itertools.combinations(all_ids, 2)), columns=[index_name_a, index_name_b])
distinct_cross.set_index([index_name_a, index_name_b], inplace=True)
# distinct_cross has now just two columns: index_name_a and index_name_b.
print_w_ts('double joining attributes...')
# join the attributes twice (once for index_a, once for index_b) -- a double join that should run quickly thanks to the indexing
# define a and join
df_tmp = df[attributes]
df_tmp.index.name = index_name_a
df_tmp.rename(mapper=lambda x: x+suff_a, axis=1, inplace=True)
distinct_cross = distinct_cross.join(df_tmp) # increases memory by an additional 200 MB
# define b and join
df_tmp = df[attributes]
df_tmp.index.name = index_name_b
df_tmp.rename(mapper=lambda x: x+suff_b, axis=1, inplace=True)
distinct_cross = distinct_cross.join(df_tmp) # increases memory by an additional 200 MB
# now df_cross_attributes consists of the two-column index ocel:oid_a and ocel:oid_b; for every attribute there is an '_a' and a '_b' column
res = distinct_cross
lev_func = lev_cached if ls_use_cache else lev
if ls_use_cache:
print_w_ts('caching active')
else:
print_w_ts('caching not active')
print_w_ts('calculating distances...')
for attr in attributes:
attr_a = attr + suff_a
attr_b = attr + suff_b
if attr in types.keys():
df_sub = pd.DataFrame(res.groupby([attr_a, attr_b]).size()) # key line for the speedup: identical value pairs are grouped so each distance is computed only once
print_w_ts('calculating for "' + attr + '"...')
# theoretically these rows could be computed in parallel, and even vectorized if that is possible for strings or float64s (the values may be too long)
if types[attr] == 'cf': # before: 3338 MB
df_sub[attr] = df_sub.apply(lambda row: lev_func(row.name[0], row.name[1]), axis=1)
elif types[attr] == 'str':
df_sub[attr] = df_sub.apply(lambda row: 0 if row.name[0] == row.name[1] else 1, axis=1)
elif types[attr] == 'float64':
df_sub[attr] = df_sub.apply(lambda row: abs(row.name[0] - row.name[1]), axis=1)
else:
df_sub[attr] = 'Distance calculation undefined for type "' + types[attr] + '".'
del df_sub[0] # necessary, apply does not run without a value column.
res = res.join(df_sub, on=[attr_a, attr_b], how='left')
res.drop([attr_a, attr_b],inplace=True,axis=1)
else:
raise Exception('field "' + attr + '" not in types')
print_w_ts('done calculating.')
if remove_index_name:
res.index.name = None
# returns a frame with min/max columns
# 10.366051 seconds; count=859
def generate_ordered_combination_df(count: int):
keys = [x for x in range(count)]
df = pd.DataFrame([], index=pd.MultiIndex.from_product([keys,keys], names=['left', 'right']))
df.reset_index(inplace=True)
# removing rows where left == right
df = df[df['left'] != df['right']]
# calculating min
tmp_min = df.min(axis=1)
# calculating max
tmp_max = df.max(axis=1)
df['min'] = tmp_min
df['max'] = tmp_max
del df['left']
del df['right']
df.drop_duplicates(inplace=True) # 6.679982 sec; count=8159
df.set_index(['min', 'max'], inplace=True) # 1.849814 sec; count=8159
return df
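# Illustrative sketch (not part of the commit): for count=3, the frame above reduces
# the six ordered pairs to the three unordered ones, indexed by (min, max):
# generate_ordered_combination_df(3).index.tolist() -> [(0, 1), (0, 2), (1, 2)]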
return res
def check_suffixes(suffixes: tuple[str, str]):
if len(suffixes) != 2: raise Exception('length of suffixes needs to be exactly two.')
if suffixes[0] == suffixes[1]: raise Exception('suffixes need to be distinct.')
return True
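# Illustrative sketch (not part of the commit): expected behaviour of check_suffixes.
# check_suffixes(('_x', '_y'))  -> True
# check_suffixes(('_x', '_x'))  -> raises Exception('suffixes need to be distinct.')
# check_suffixes(('_x',))       -> raises Exception('length of suffixes needs to be exactly two.')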
def df_create_cross_df(df: pd.DataFrame, suffixes=('_x', '_y')) -> pd.DataFrame:
"""
Creates a full cross join of a dataframe and preserves the index used before.
"""
check_suffixes(suffixes) # Checking if suffixes are valid.
# setting the new list of index columns (given dataframe could already use a MultiIndex)
new_index_names = list(map(lambda name: name+suffixes[0], df.index.names)) + list(map(lambda name: name+suffixes[1], df.index.names))
# Resetting the index for preserving the index later.
pre = df.reset_index(inplace=False)
cross = pre.join(pre, how='cross', lsuffix=suffixes[0], rsuffix=suffixes[1]) # 2.5 sec
cross.set_index(new_index_names, verify_integrity=False, inplace=True) # 11 sec -> 6.2 sec
return cross
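# Illustrative sketch (not part of the commit): df_create_cross_df on a tiny frame.
# A two-row frame with a named index yields 2 x 2 = 4 rows under a MultiIndex:
# demo_df = pd.DataFrame({'v': [1, 2]}, index=pd.Index(['a', 'b'], name='key'))
# demo_cross = df_create_cross_df(demo_df)
# demo_cross.index.names -> ['key_x', 'key_y']; rows ('a','a'), ('a','b'), ('b','a'), ('b','b')
# demo_cross columns     -> 'v_x' and 'v_y', holding the original values of both sides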
# Returns a dataframe based on a cross join of the given dataframe.
# All attributes are replaced by their row-wise distance, calculated via the functions given in attribute_func_map.
def df_pairwise_attr_distance(df: pd.DataFrame, attribute_func_map: dict[str, Any], suffixes=('_x', '_y')) -> pd.DataFrame:
check_suffixes(suffixes)
# getting relevant attributes
selected_attributes = list(set(attribute_func_map.keys()).intersection(df.columns))
# CARTESIAN PRODUCT of all data
cross = df_create_cross_df(df[selected_attributes], suffixes=suffixes) # ~8-9 seconds
# determining data_types
for attr in selected_attributes:
# creating attribute names for each 'side'
new_attr_names = {0: attr+suffixes[0], 1: attr+suffixes[1]}
# getting unique values
unique_vals = df[attr].unique()
# setting function for distance calculation per attribute
func = attribute_func_map[attr]
if func is None:
raise Exception('No function defined for attribute "' + attr + '".')
# reshaping and calculating the distances (pdist computes each pair only one way, hence squareform is necessary)
reshaped_vals = unique_vals.reshape(-1,1)
# CALCULATING (less than 1s)
d_matrix = spd.pdist(reshaped_vals, func)
d_matrix = spd.squareform(d_matrix)
# creating dataframe (matrix like) of the result setting index and columns accordingly
res = pd.DataFrame(d_matrix)
res.index = unique_vals
res.columns = unique_vals
# RESHAPING (less than 10 ms)
res = res.rename_axis(index=new_attr_names[0], columns=new_attr_names[1]).melt(ignore_index=False) # re-transform from matrix to list
res.reset_index(inplace=True)
# SETTING MULTIINDEX (less than 100 ms)
res.set_index(list(new_attr_names.values()), verify_integrity=False, inplace=True)
res.rename({'value': attr}, inplace=True, axis=1) # 'value' is the automatic name
# JOINING RESULTS to cross table (~11-12s)
cross = cross.join(res, on=list(new_attr_names.values()), how='left') # some kind of mapping may be faster...
# DELETING COLUMNS (less than 1s)
del cross[new_attr_names[0]]
del cross[new_attr_names[1]]
# FILLING NaNs (less than 1s)
cross[attr] = cross[attr].fillna(0.0)
return cross
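# Illustrative sketch (not part of the commit): using df_pairwise_attr_distance with a
# hypothetical two-attribute frame. Note that spd.pdist hands each metric two
# 1-element arrays, hence the [0] indexing inside the lambdas.
# demo = pd.DataFrame(
#     {'price': [10.0, 12.5], 'status': ['open', 'closed']},
#     index=pd.Index(['o1', 'o2'], name='ocel:oid'))
# funcs = {
#     'price':  lambda u, v: abs(u[0] - v[0]),         # numeric: absolute difference
#     'status': lambda u, v: 0 if u[0] == v[0] else 1, # categorical: 0/1 distance
# }
# dist = df_pairwise_attr_distance(demo, funcs, suffixes=('_a', '_b'))
# dist is indexed by (ocel:oid_a, ocel:oid_b); its columns 'price' and 'status' hold
# the pairwise distances, e.g. 2.5 and 1 for the pair ('o1', 'o2').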
# Determines column types from the first element that is not NaN.
# Possible results: 'float64', 'str', 'list', 'unknown'
def df_determine_data_types(df: pd.DataFrame):
def df_determine_data_types(df: pd.DataFrame) -> dict[str, str]:
types = {}
for column in (set(df.columns) - set(types.keys())):
first_non_nan = df[column].loc[~df[column].isnull()].iloc[0]
@@ -158,14 +164,13 @@ def df_get_object_table_for_type(ocel: OCEL, object_type: str):
def df_get_control_flow_per_object_of_type(ocel: OCEL, object_type: str, activity_letter_map: dict):
# Getting all relations
df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type]
df_relations['ocel:activity_short'] = df_relations['ocel:activity'].map(activity_letter_map)
# group data by ocel:oid, get ocel:activity ordered by ocel:timestamp (which is important for later grouping)
df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type].copy() # .copy() suppresses the SettingWithCopyWarning.
df_relations['ocel:activity'] = df_relations['ocel:activity'].map(activity_letter_map)
df_relations = df_relations.sort_values(['ocel:oid', 'ocel:timestamp'], axis=0, ascending=True)
del df_relations['ocel:eid']
del df_relations['ocel:type']
del df_relations['ocel:timestamp'] # only possible because sorting already applied!
del df_relations['ocel:activity']
# del df_relations['ocel:activity']
df_relations.rename({'ocel:activity_short': 'ocel:activity'}, axis=1, inplace=True)
# control flow per object. sorting by timestamp is very important!
res = df_relations.groupby('ocel:oid')['ocel:activity'].agg(tuple)
@@ -193,89 +198,52 @@ def map_activities_to_letter(unique_activities):
cur_letter = chr(ord(cur_letter) + 1)
return activities_dict
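# Illustrative sketch (not part of the commit): assuming letters are assigned in
# iteration order starting at 'A', the mapping turns each activity name into a single
# character, so a control flow becomes a short string that Levenshtein can compare:
# map_activities_to_letter(['create order', 'pay order', 'ship order'])
#   -> {'create order': 'A', 'pay order': 'B', 'ship order': 'C'}
# The trace ('create order', 'pay order', 'ship order') then becomes 'ABC'.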
def ocel_get_object_distances(ocel, object_type, weights_per_attribute, show_log=False):
# Retrieving all possible object types (items are unique in list)
def ocel_get_object_distances(ocel, object_type, weights_per_attribute) -> dict: # returns {'index': list of ocel:oid, 'distances': np.ndarray}
# getting all object-types
data_object_types = pm4py.ocel_get_object_types(ocel)
if not object_type in data_object_types: raise Exception('selected object-type-name "' + object_type + '" not present in the data.')
if show_log: print_w_ts('object types in ocel-file:')
if show_log: print_w_ts(data_object_types)
assert object_type in data_object_types, 'selected object-type-name not present in the data.'
# getting all distinct activity names:
activities_names = pm4py.ocel_object_type_activities(ocel)[object_type]
activities_names = sorted(activities_names)
activity_letter_map = map_activities_to_letter(activities_names)
activity_letter_map = map_activities_to_letter(pm4py.ocel_object_type_activities(ocel)[object_type])
# getting object-information from ocel
df_object_data = df_get_object_table_for_type(ocel, object_type)
series_cf_per_oid = df_get_control_flow_per_object_of_type(ocel, object_type, activity_letter_map)
# adding new information to object data
# adding control-flow-information to object data
df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data['ocel:oid'].map(series_cf_per_oid)
df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data[c.DEFAULT_CF_ATTR_NAME].fillna('').map(lambda x: ''.join(x))
# object table is ready
if show_log: print_w_ts('Object table ready including control flow:')
# df_object_data = df_object_data.head(math.floor(df_object_data.shape[0] * test_factor)) if sys_test_mode else df_object_data
print(df_object_data) # showing the data
object_count = len(df_object_data['ocel:oid'].unique())
if show_log: print_w_ts('Object count (high effect on runtime): ' + str(object_count))
# determining data-types
# Checking datatypes and setting 'special' datatype "control-flow"
object_data_type_map = df_determine_data_types(df_object_data)
object_data_type_map[c.DEFAULT_CF_ATTR_NAME] = 'cf'
# filling NaNs
# filling NaNs of attributes (control_flow already filled)
df_object_data = df_fill_nans(df_object_data, object_data_type_map, c.DEFAULT_VALUES)
if show_log: print_w_ts('NaNs filled.')
# resetting index
# Setting index to ocel:oid
df_object_data = df_object_data.set_index('ocel:oid')
# all attributes that are used for calculating a distance
attributes = df_object_data.columns # written here, as ocel:oid is now part of the index and no column anymore
if show_log: print_w_ts('used attributes for distances:')
if show_log: print_w_ts(attributes)
if show_log: print_w_ts('calculating distances...')
df_distance_matrix = df_pairwise_attr_distance(df_object_data, set(df_object_data.columns)-set(['ocel:oid']), c.LS_CACHE_ACTIVE)
if show_log: print_w_ts('distances calculated:')
if show_log: print_w_ts(df_distance_matrix)
if show_log: print_w_ts('normalizing...')
df_distance_matrix = df_normalize_columns(df_distance_matrix, attributes)
if show_log: print_w_ts('normalized distances calculated.')
# Creating attribute->distance_function mapping
type_func_dict = {
'cf': lambda x,y: lev(x[0],y[0]), # control flow: Levenshtein distance on the letter strings
'str': lambda x,y: 1 if x != y else 0, # categorical: 0/1 distance
'float64': lambda x,y: abs(x-y) # numeric: absolute difference
}
attr_func_map = dict.fromkeys(df_object_data.columns)
for attr in df_object_data.columns: attr_func_map[attr] = type_func_dict[object_data_type_map[attr]]
# CALCULATING DISTANCES (20-22s)
df_distance_matrix = df_pairwise_attr_distance(df_object_data, attr_func_map, ('_a', '_b'))
# NORMALIZING / WEIGHTING / AVERAGING
df_distance_matrix = df_normalize_columns(df_distance_matrix, list(attr_func_map.keys()))
df_distance_matrix = df_weight_columns(df_distance_matrix, weights_per_attribute)
if show_log: print_w_ts('weighted distances calculated.')
print(df_distance_matrix)
df_distance_matrix['distance_avg'] = df_distance_matrix[attributes].mean(axis=1)
df_distance_matrix = df_distance_matrix.drop(attributes, axis=1)
if show_log: print_w_ts('averaged distances calculated.')
print(df_distance_matrix)
if show_log: print_w_ts('Sorting values for generating the matrix...')
df_distance_matrix = df_distance_matrix.sort_index() # needs a lot of time...
if show_log: print_w_ts('sorted.')
arr_index_id_map = df_distance_matrix.reset_index()['ocel:oid_a'].unique()
# arr_index_id_map = df_distance_matrix['ocel:oid_a'].unique()
if show_log: print_w_ts('mapping index->ocel:oid')
if show_log: print_w_ts(arr_index_id_map)
distinct_object_count = len(arr_index_id_map)
# tricky, generating an ordered array and then reshaping it.
if show_log: print_w_ts('Reshaping column as matrix...')
arr_distance_list = np.array(df_distance_matrix['distance_avg'])
res = np.zeros((distinct_object_count,distinct_object_count))
count = 0
for i in range(distinct_object_count):
res[i, i] = 0.0
for j in range(i+1, distinct_object_count):
res[i, j] = arr_distance_list[count]
res[j, i] = res[i, j]
count += 1
return res
\ No newline at end of file
df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1)
df_distance_matrix = df_distance_matrix.drop(list(attr_func_map.keys()), axis=1) # single distances not relevant anymore
# Creating the matrix-index->ocel:oid map
index_to_id_map = list(df_object_data.index)
index_count = len(index_to_id_map)
# reshaping results to matrix
df_matrix = df_distance_matrix['distance_avg'].to_numpy().reshape(index_count, index_count) # TIMING: 0 ms
return {
'distances': df_matrix,
'index': index_to_id_map
}
\ No newline at end of file
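# Illustrative sketch (not part of the commit): consuming the returned dict, with a
# hypothetical object type and weight map. 'distances' is a square numpy array and
# 'index' maps matrix positions back to ocel:oid values, so the result can be
# re-labelled as a DataFrame:
# res = ocel_get_object_distances(ocel, 'order', {c.DEFAULT_CF_ATTR_NAME: 1.0})
# df_dist = pd.DataFrame(res['distances'], index=res['index'], columns=res['index'])
# df_dist.loc[oid_a, oid_b] -> average weighted distance between two objects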
@@ -2,6 +2,7 @@ import datetime
import math
from os.path import exists
import pm4py
import scipy
import constants as c
import numpy as np
import functions as f
@@ -44,23 +45,26 @@ p_graph_file_type = 'svg'
# END PARAMETERS
#
f.print_w_ts('Program started...')
print('Program started...')
start_ts = datetime.datetime.now()
f.print_w_ts('Reading inputs...')
f.print_w_ts('Params read.')
# non-data-based assertions
assert p_mode in c.MODES, "selected mode not possible. Use either 'all' or 'existence'."
assert exists(p_ocel_file), 'file does not exist'
f.print_w_ts('Reading ocel data...')
# reading ocel data
ocel = pm4py.read_ocel(p_ocel_file)
f.print_w_ts('ocel data read.')
# calculating distances of objects based on control-flow and all non-NaN attributes of objects
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
f.print_w_ts('distance matrix calculated:')
f.print_w_ts(res)
f.print_w_ts('duration: ' + str(datetime.datetime.now() - start_ts))
print('-------------------------------------------------------')
print('distances:')
print(res['distances'])
print('first 10 indexes:')
print(res['index'][:10])
print('last 10 indexes:')
print(res['index'][-10:])
print('duration: ' + str(datetime.datetime.now() - start_ts))
quit()
\ No newline at end of file