GitLab · Stanislav Yuliyanov / oc-dfg-clustering · Commits

Commit 024b3bbd, authored May 23, 2022 by Lennart Holzenkamp

    performance improvements and removing prints and warnings

Parent: caa54533
Changes: 2 files
code/functions.py
import datetime
from enum import unique
from functools import cache
import itertools
import math
from operator import index
from typing import Any
from matplotlib.cbook import flatten
from matplotlib.pyplot import axis
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import sklearn
from sympy import EX
import constants as c
import scipy.spatial.distance as spd
from pandarallel import pandarallel

def count_non_nans(data):
    count = 0
...
...
@@ -26,94 +35,91 @@ def print_w_ts(c):
    print(datetime.datetime.now())
    print(c)

# uses the given index in the dataframe, duplicates it (key -> keyA, keyB), resulting in a full cross join. every given attribute is replaced by its distance
def df_pairwise_attr_distance(df: pd.DataFrame, attributes: list, ls_use_cache=1):
    types = df_determine_data_types(df)
    attributes = list(set(attributes).intersection(set(df.columns)))  # attributes should be 'real' attributes
    # if the index has no name, set it to 'key', but remember that it should be changed back
    remove_index_name = False
    if df.index.name == None:
        df.index.name = 'key'
        remove_index_name = True
    index_name = df.index.name
    suff_a = '_a'
    suff_b = '_b'
    index_name_a = df.index.name + suff_a
    index_name_b = df.index.name + suff_b
    # keep non_attr_fields for later joining
    non_attr_fields = list(set(df.columns) - set(attributes))
    # has the same keys as the original df; this can possibly be an empty data set, but it will have the keys!
    df_attr_no_distance = df[non_attr_fields]
    # building a table which consists of two columns with all index combinations, but one-way distinct!
    just_index = df[[]].reset_index()
    all_ids = set(just_index[index_name])
    print_w_ts('creating dataframe with distinct id combinations...')
    # the following takes a short while and increases memory usage a lot
    distinct_cross = pd.DataFrame(list(itertools.combinations(all_ids, 2)), columns=[index_name_a, index_name_b])
    distinct_cross.set_index([index_name_a, index_name_b], inplace=True)
    # distinct_cross now consists of just the two index columns index_name_a and index_name_b.
    print_w_ts('double joining attributes...')
    # join the attributes twice (for index_a and index_b) --> a double join, which should run faster thanks to the indexing
    # define a and join
    df_tmp = df[attributes]
    df_tmp.index.name = index_name_a
    df_tmp.rename(mapper=lambda x: x + suff_a, axis=1, inplace=True)
    distinct_cross = distinct_cross.join(df_tmp)  # adds roughly another 200 MB
    # define b and join
    df_tmp = df[attributes]
    df_tmp.index.name = index_name_b
    df_tmp.rename(mapper=lambda x: x + suff_b, axis=1, inplace=True)
    distinct_cross = distinct_cross.join(df_tmp)  # adds roughly another 200 MB
    # the cross table now consists of the two-column index ocel:oid_a and ocel:oid_b; for every attribute there is an '_a' and a '_b' column
    res = distinct_cross
    lev_func = lev_cached if ls_use_cache else lev
    if ls_use_cache:
        print_w_ts('caching active')
    else:
        print_w_ts('caching not active')
    print_w_ts('calculating distances...')
    for attr in attributes:
        attr_a = attr + suff_a
        attr_b = attr + suff_b
        if attr in types.keys():
            df_sub = pd.DataFrame(res.groupby([attr_a, attr_b]).size())  # master line for the speedup: each unique value pair is handled only once
            print_w_ts('calculating for "' + attr + '"...')
            # theoretically these lines could be calculated in parallel, and also vectorized if that is possible for strings or float64s (values may be too long)
            if types[attr] == 'cf':
                # before: 3338 MB
                df_sub[attr] = df_sub.apply(lambda row: lev_func(row.name[0], row.name[1]), axis=1)
            elif types[attr] == 'str':
                df_sub[attr] = df_sub.apply(lambda row: 0 if row.name[0] == row.name[1] else 1, axis=1)
            elif types[attr] == 'float64':
                df_sub[attr] = df_sub.apply(lambda row: abs(row.name[0] - row.name[1]), axis=1)
            else:
                df_sub[attr] = 'Distance calculation undefined for type "' + types[attr] + '".'
            del df_sub[0]  # necessary, apply does not run without a value column.
            res = res.join(df_sub, on=[attr_a, attr_b], how='left')
            res.drop([attr_a, attr_b], inplace=True, axis=1)
        else:
            raise Exception('field "' + attr + '" not in types')
    print_w_ts('done calculating.')
    if remove_index_name:
        res.index.name = None
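The caching branch above relies on lev_cached, which is not shown in this excerpt. Given the from functools import cache at the top of the file, a plausible definition is the following sketch (the decorator and signature are assumptions, not the committed code):

from functools import cache
from Levenshtein import distance as lev

@cache
def lev_cached(a: str, b: str) -> int:
    # memoized Levenshtein distance: repeated comparisons of the same
    # control-flow strings are answered from the cache instead of recomputed
    return lev(a, b)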
# the return value comes with min/max columns
# 10.366051 seconds; count=859
def generate_ordered_combination_df(count: int):
    keys = [x for x in range(count)]
    df = pd.DataFrame([], index=pd.MultiIndex.from_product([keys, keys], names=['left', 'right']))
    df.reset_index(inplace=True)
    # removing rows with left == right
    df = df[df['left'] != df['right']]
    # calculating min
    tmp_min = df.min(axis=1)
    # calculating max
    tmp_max = df.max(axis=1)
    df['min'] = tmp_min
    df['max'] = tmp_max
    del df['left']
    del df['right']
    df.drop_duplicates(inplace=True)  # 6.679982 sec; count=8159
    df.set_index(['min', 'max'], inplace=True)  # 1.849814 sec; count=8159
    return df

    return res  # leftover end of the removed df_pairwise_attr_distance above
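A quick check of generate_ordered_combination_df as reconstructed above: for count=3 the cross product has 9 pairs, filtering left != right leaves 6, and the min/max deduplication leaves the 3 unordered pairs.

pairs = generate_ordered_combination_df(3)
print(pairs.index.tolist())  # [(0, 1), (0, 2), (1, 2)]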
def check_suffixes(suffixes: tuple[str, str]):
    if len(suffixes) != 2:
        raise Exception('length of suffixes needs to be exactly two.')
    if suffixes[0] == suffixes[1]:
        raise Exception('suffixes need to be distinct')
    return True
def df_create_cross_df(df: pd.DataFrame, suffixes=('_x', '_y')) -> pd.DataFrame:
    """
    Creates a full cross join of a dataframe and preserves the index used before.
    """
    check_suffixes(suffixes)  # checking if the suffixes are valid.
    # setting the new list of index columns (the given dataframe could already use a MultiIndex)
    new_index_names = list(map(lambda name: name + suffixes[0], df.index.names)) + list(map(lambda name: name + suffixes[1], df.index.names))
    # resetting the index so it can be preserved later.
    pre = df.reset_index(inplace=False)
    cross = pre.join(pre, how='cross', lsuffix=suffixes[0], rsuffix=suffixes[1])  # 2.5 sec
    cross.set_index(new_index_names, verify_integrity=False, inplace=True)  # 11 sec -> 6.2 sec
    return cross
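A minimal usage sketch of df_create_cross_df, on toy data with the default suffixes:

import pandas as pd

df = pd.DataFrame({'val': [10, 20]}, index=pd.Index(['a', 'b'], name='key'))
cross = df_create_cross_df(df)
# cross is indexed by ('key_x', 'key_y') and has columns 'val_x' and 'val_y',
# one row per ordered pair: (a, a), (a, b), (b, a), (b, b)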
# returns a dataframe based on a cross join of the given dataframe.
# all attributes are replaced by their row-wise distance, which is calculated via the given functions in attribute_func_map
def df_pairwise_attr_distance(df: pd.DataFrame, attribute_func_map: dict[str, Any], suffixes=('_x', '_y')) -> pd.DataFrame:
    check_suffixes(suffixes)
    # getting the relevant attributes
    selected_attributes = list(set(attribute_func_map.keys()).intersection(df.columns))
    # CARTESIAN PRODUCT of all data
    cross = df_create_cross_df(df[selected_attributes], suffixes=suffixes)  # ~8-9 seconds
    # determining data types
    for attr in selected_attributes:
        # creating attribute names for each 'side'
        new_attr_names = {0: attr + suffixes[0], 1: attr + suffixes[1]}
        # getting the unique values
        unique_vals = df[attr].unique()
        # setting the function for the distance calculation per attribute
        func = attribute_func_map[attr]
        if func is None:
            raise Exception('No function defined for attribute "' + attr + '".')
        # reshaping and calculating the distances (pdist only computes them one way, hence squareform is necessary)
        reshaped_vals = unique_vals.reshape(-1, 1)
        # CALCULATING (less than 1s)
        d_matrix = spd.pdist(reshaped_vals, func)
        d_matrix = spd.squareform(d_matrix)
        # creating a matrix-like dataframe of the result, setting index and columns accordingly
        res = pd.DataFrame(d_matrix)
        res.index = unique_vals
        res.columns = unique_vals
        # RESHAPING (less than 10 ms): retransform from matrix to list
        res = res.rename_axis(index=new_attr_names[0], columns=new_attr_names[1]).melt(ignore_index=False)
        res.reset_index(inplace=True)
        # SETTING MULTIINDEX (less than 100 ms)
        res.set_index(list(new_attr_names.values()), verify_integrity=False, inplace=True)
        res.rename({'value': attr}, inplace=True, axis=1)  # 'value' is the automatic name
        # JOINING RESULTS to the cross table (~11-12s); some kind of mapping may be faster...
        cross = cross.join(res, on=list(new_attr_names.values()), how='left')
        # DELETING COLUMNS (less than 1s)
        del cross[new_attr_names[0]]
        del cross[new_attr_names[1]]
        # FILLING NaNs (less than 1s)
        cross[attr] = cross[attr].fillna(0.0)
    return cross
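The core trick of the new df_pairwise_attr_distance is to compute distances only between unique values: pdist evaluates each unordered pair once, squareform expands the condensed result into a full matrix, and melt flattens that matrix back into a pair list that can be joined onto the cross table. A self-contained sketch with toy numeric values (names v_x/v_y are illustrative):

import numpy as np
import pandas as pd
import scipy.spatial.distance as spd

vals = np.array([1.0, 3.0, 6.0])
# pdist computes each unordered pair once; squareform expands to a 3x3 matrix
d = spd.squareform(spd.pdist(vals.reshape(-1, 1), lambda u, v: abs(u[0] - v[0])))
m = pd.DataFrame(d, index=vals, columns=vals)
pairs = (m.rename_axis(index='v_x', columns='v_y')
          .melt(ignore_index=False)
          .reset_index())
# pairs has columns v_x, v_y, value -- one row per (value, value) combination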
# determines column types by the first elements that are not NaN
# becomes: float64, string, list, unknown
def df_determine_data_types(df: pd.DataFrame):                    # removed signature
def df_determine_data_types(df: pd.DataFrame) -> dict[str, str]:  # added signature (this commit adds the return type)
    types = {}
    for column in (set(df.columns) - set(types.keys())):
        first_non_nan = df[column].loc[~df[column].isnull()].iloc[0]
...
...
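The body of df_determine_data_types is truncated in this diff. Based on its comment ('becomes: float64, string, list, unknown') and the visible first line, a hypothetical completion could look like the sketch below; the classification branches are assumptions, and 'str' is used as the key because that is what type_func_dict further down expects.

import pandas as pd

def determine_data_types_sketch(df: pd.DataFrame) -> dict[str, str]:
    # hypothetical completion of the truncated body, not the committed code
    types = {}
    for column in df.columns:
        non_nan = df[column].loc[~df[column].isnull()]
        if non_nan.empty:
            types[column] = 'unknown'
            continue
        first = non_nan.iloc[0]
        if isinstance(first, float):
            types[column] = 'float64'
        elif isinstance(first, str):
            types[column] = 'str'
        elif isinstance(first, (list, tuple)):
            types[column] = 'list'
        else:
            types[column] = 'unknown'
    return types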
@@ -158,14 +164,13 @@ def df_get_object_table_for_type(ocel: OCEL, object_type: str):
def df_get_control_flow_per_object_of_type(ocel: OCEL, object_type: str, activity_letter_map: dict):
    # getting all relations (the next two lines are the removed variant; the commit replaces them with the .copy() version below)
    df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type]
    df_relations['ocel:activity_short'] = df_relations['ocel:activity'].map(activity_letter_map)
    # group data by ocel:oid, get ocel:activity ordered by ocel:timestamp (which is important for the later grouping)
    df_relations = ocel.relations[ocel.relations['ocel:type'] == object_type].copy()  # .copy() for suppressing the warning.
    df_relations['ocel:activity'] = df_relations['ocel:activity'].map(activity_letter_map)
    df_relations = df_relations.sort_values(['ocel:oid', 'ocel:timestamp'], axis=0, ascending=True)
    del df_relations['ocel:eid']
    del df_relations['ocel:type']
    del df_relations['ocel:timestamp']  # only possible because the sorting was already applied!
    del df_relations['ocel:activity']
    # del df_relations['ocel:activity']
    df_relations.rename({'ocel:activity_short': 'ocel:activity'}, axis=1, inplace=True)
    # control flow per object. sorting by timestamp is very important!
    res = df_relations.groupby('ocel:oid')['ocel:activity'].agg(tuple)
...
...
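What df_get_control_flow_per_object_of_type produces, shown on toy data: a Series mapping each object id to its time-ordered tuple of (shortened) activities. A sketch under the assumption that the relations frame has the usual OCEL columns:

import pandas as pd

rel = pd.DataFrame({
    'ocel:oid': ['o1', 'o1', 'o2'],
    'ocel:timestamp': pd.to_datetime(['2022-01-01', '2022-01-02', '2022-01-01']),
    'ocel:activity': ['A', 'B', 'A'],
})
cf = (rel.sort_values(['ocel:oid', 'ocel:timestamp'])
         .groupby('ocel:oid')['ocel:activity']
         .agg(tuple))
# cf: o1 -> ('A', 'B'), o2 -> ('A',)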
@@ -193,89 +198,52 @@ def map_activities_to_letter(unique_activities):
        cur_letter = chr(ord(cur_letter) + 1)
    return activities_dict
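Only two lines of map_activities_to_letter survive in this hunk. A reconstruction consistent with them (assigning consecutive letters starting at 'A'; the loop itself is an assumption):

def map_activities_to_letter(unique_activities):
    # each activity name is mapped to a single letter so that control
    # flows become short strings suitable for Levenshtein comparison
    activities_dict = {}
    cur_letter = 'A'
    for activity in unique_activities:
        activities_dict[activity] = cur_letter
        cur_letter = chr(ord(cur_letter) + 1)
    return activities_dict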
def ocel_get_object_distances(ocel, object_type, weights_per_attribute, show_log=False):  # removed signature
    # retrieving all possible object types (items are unique in the list)
def ocel_get_object_distances(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:  # added signature
    # getting all object-types
    data_object_types = pm4py.ocel_get_object_types(ocel)
    if not object_type in data_object_types:
        raise Exception('selected object-type-name "' + object_type + '" not present in the data.')
    if show_log: print_w_ts('object types in ocel-file:')
    if show_log: print_w_ts(data_object_types)
    assert object_type in data_object_types, 'selected object-type-name not present in the data.'
    # getting all distinct activity names (removed variant):
    activities_names = pm4py.ocel_object_type_activities(ocel)[object_type]
    activities_names = sorted(activities_names)
    activity_letter_map = map_activities_to_letter(activities_names)
    activity_letter_map = map_activities_to_letter(pm4py.ocel_object_type_activities(ocel)[object_type])  # added variant
    # getting the object-information from the ocel
    df_object_data = df_get_object_table_for_type(ocel, object_type)
    series_cf_per_oid = df_get_control_flow_per_object_of_type(ocel, object_type, activity_letter_map)
    # adding control-flow-information to the object data
    df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data['ocel:oid'].map(series_cf_per_oid)
    df_object_data[c.DEFAULT_CF_ATTR_NAME] = df_object_data[c.DEFAULT_CF_ATTR_NAME].fillna('').map(lambda x: ''.join(x))
    # the object table is ready
    if show_log: print_w_ts('Object table ready including control flow:')
    # df_object_data = df_object_data.head(math.floor(df_object_data.shape[0] * test_factor)) if sys_test_mode else df_object_data
    print(df_object_data)  # showing the data
    object_count = len(df_object_data['ocel:oid'].unique())
    if show_log: print_w_ts('Object count (high effect on runtime): ' + str(object_count))
    # checking the datatypes and setting the 'special' datatype "control-flow"
    object_data_type_map = df_determine_data_types(df_object_data)
    object_data_type_map[c.DEFAULT_CF_ATTR_NAME] = 'cf'
    # filling NaNs of attributes (control_flow already filled)
    df_object_data = df_fill_nans(df_object_data, object_data_type_map, c.DEFAULT_VALUES)
    if show_log: print_w_ts('NaNs filled.')
    # setting the index to ocel:oid
    df_object_data = df_object_data.set_index('ocel:oid')
    # all attributes that are used for calculating a distance
    attributes = df_object_data.columns  # written here, as ocel:oid is now part of the index and no longer a column
    if show_log: print_w_ts('used attributes for distances:')
    if show_log: print_w_ts(attributes)
    if show_log: print_w_ts('calculating distances...')
    df_distance_matrix = df_pairwise_attr_distance(df_object_data, set(df_object_data.columns) - set(['ocel:oid']), c.LS_CACHE_ACTIVE)
    if show_log: print_w_ts('distances calculated:')
    if show_log: print_w_ts(df_distance_matrix)
    if show_log: print_w_ts('normalizing...')
    df_distance_matrix = df_normalize_columns(df_distance_matrix, attributes)
    if show_log: print_w_ts('normalized distances calculated.')
    # creating the attribute -> distance_function mapping
    type_func_dict = {
        'cf': lambda x, y: lev(x[0], y[0]),
        'str': lambda x, y: 1 if x != y else 0,
        'float64': lambda x, y: abs(x - y)
    }
    attr_func_map = dict.fromkeys(df_object_data.columns)
    for attr in df_object_data.columns:
        attr_func_map[attr] = type_func_dict[object_data_type_map[attr]]
    # CALCULATING DISTANCES (20-22s)
    df_distance_matrix = df_pairwise_attr_distance(df_object_data, attr_func_map, ('_a', '_b'))
    # NORMALIZING / WEIGHTING / AVERAGING
    df_distance_matrix = df_normalize_columns(df_distance_matrix, list(attr_func_map.keys()))
    df_distance_matrix = df_weight_columns(df_distance_matrix, weights_per_attribute)
    if show_log: print_w_ts('weighted distances calculated.')
    print(df_distance_matrix)
    df_distance_matrix['distance_avg'] = df_distance_matrix[attributes].mean(axis=1)
    df_distance_matrix = df_distance_matrix.drop(attributes, axis=1)
    if show_log: print_w_ts('averaged distances calculated.')
    print(df_distance_matrix)
    if show_log: print_w_ts('Sorting values for generating the matrix...')
    df_distance_matrix = df_distance_matrix.sort_index()  # needs a lot of time...
    if show_log: print_w_ts('sorted.')
    arr_index_id_map = df_distance_matrix.reset_index()['ocel:oid_a'].unique()
    # arr_index_id_map = df_distance_matrix['ocel:oid_a'].unique()
    if show_log: print_w_ts('mapping index->ocel:oid')
    if show_log: print_w_ts(arr_index_id_map)
    distinct_object_count = len(arr_index_id_map)
    # tricky: generating an ordered array and then reshaping it.
    if show_log: print_w_ts('Reshaping column as matrix...')
    arr_distance_list = np.array(df_distance_matrix['distance_avg'])
    res = np.zeros((distinct_object_count, distinct_object_count))
    count = 0
    for i in range(distinct_object_count):
        res[i, i] = 0.0
        for j in range(i + 1, distinct_object_count):
            res[i, j] = arr_distance_list[count]
            res[j, i] = res[i, j]
            count += 1
    return res
\ No newline at end of file
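The removed double loop above fills a symmetric matrix from a condensed, row-major list of pairwise distances. That is exactly what scipy.spatial.distance.squareform does, so an equivalent (and already vectorized) formulation would be:

import numpy as np
import scipy.spatial.distance as spd

condensed = np.array([0.1, 0.2, 0.3])  # distances for the pairs (0,1), (0,2), (1,2)
square = spd.squareform(condensed)     # 3x3 symmetric matrix with a zero diagonal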
    df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1)
    df_distance_matrix = df_distance_matrix.drop(list(attr_func_map.keys()), axis=1)  # the single distances are not relevant anymore
    # creating the matrix-index -> ocel:oid map
    index_to_id_map = list(df_object_data.index)
    index_count = len(index_to_id_map)
    # reshaping the results to a matrix
    df_matrix = df_distance_matrix['distance_avg'].to_numpy().reshape(index_count, index_count)  # TIMING: 0 ms
    return {'distances': df_matrix, 'index': index_to_id_map}
\ No newline at end of file
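The returned dict pairs a square distance matrix with the ocel:oid order of its rows, which is the shape that precomputed-distance clustering APIs expect. A hypothetical downstream use (object type and cluster count are placeholders; metric='precomputed' requires scikit-learn >= 1.2, older releases call the parameter affinity):

from sklearn.cluster import AgglomerativeClustering

out = ocel_get_object_distances(ocel, 'order', {})  # placeholder arguments
model = AgglomerativeClustering(n_clusters=3,       # placeholder cluster count
                                metric='precomputed',
                                linkage='average')   # 'ward' would require euclidean features
labels = model.fit_predict(out['distances'])
oid_to_cluster = dict(zip(out['index'], labels))     # map each ocel:oid to its cluster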
code/main.py
...
...
@@ -2,6 +2,7 @@ import datetime
import math
from os.path import exists
import pm4py
import scipy
import constants as c
import numpy as np
import functions as f
...
...
@@ -44,23 +45,26 @@ p_graph_file_type = 'svg'
# END PARAMETERS
#
f.print_w_ts('Program started...')
print('Program started...')
start_ts = datetime.datetime.now()
f.print_w_ts('Reading inputs...')
f.print_w_ts('Params read.')
# non-data-based assertions
assert p_mode in c.MODES, 'selected mode not possible. Use either "all" or "existence"'
assert exists(p_ocel_file), 'file does not exist'
f.print_w_ts('Reading ocel data...')
# reading ocel data
ocel = pm4py.read_ocel(p_ocel_file)
f.print_w_ts('ocel data read.')
# calculating distances of objects based on the control-flow and all non-NaN attributes of the objects
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
f.print_w_ts('distance matrix calculated:')
f.print_w_ts(res)
f.print_w_ts('duration: ' + str(datetime.datetime.now() - start_ts))
print('-------------------------------------------------------')
print('distances:')
print(res['distances'])
print('first 10 indexes:')
print(res['index'][:10])
print('last 10 indexes:')
print(res['index'][-10:])
print('duration: ' + str(datetime.datetime.now() - start_ts))
quit()
\ No newline at end of file