Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Stanislav Yuliyanov
oc-dfg-clustering
Commits
b9deb382
Commit
b9deb382
authored
May 17, 2022
by
Stanislav Yuliyanov
Browse files
Merge branch 'revert-
f2cd2bdf
' into 'main'
Revert "added distance matrix implementation" See merge request
!2
parents
f2cd2bdf
c6fe8ac7
Changes
2
Hide whitespace changes
Inline
Side-by-side
code/functions.py
View file @
b9deb382
...
...
@@ -278,155 +278,4 @@ def ocel_get_object_distances(ocel, object_type, weights_per_attribute, show_log
res
[
j
,
i
]
=
res
[
i
,
j
]
count
+=
1
return
res
#######################################################################
# Calculating the distance matrix based on stanislav's implementation #
# #
# Save unique activities to a list
def get_unique_activities(activities):
    """Return the distinct activities in first-seen order.

    Args:
        activities: Iterable of activity labels; duplicates are allowed.

    Returns:
        list: Every distinct activity exactly once, ordered by its first
        occurrence in ``activities``.
    """
    # Materialise once so a one-shot iterator survives the fallback below.
    activities = list(activities)
    try:
        # dicts keep insertion order, so this de-duplicates in O(n) instead
        # of re-scanning the result list for every element.
        return list(dict.fromkeys(activities))
    except TypeError:
        # Unhashable labels cannot be dict keys; fall back to the
        # equality-based linear scan.
        unique = []
        for activity in activities:
            if activity not in unique:
                unique.append(activity)
        return unique
# Create hash-table of activities to represent
# each one as a UNICODE-character
def map_activities_to_letter(unique_activities):
    """Assign one character to every activity.

    The first activity is mapped to ``'a'`` and each following activity to
    the character with the next code point, so the sequence simply continues
    past ``'z'`` when there are more than 26 activities.

    Args:
        unique_activities: Iterable of hashable activity labels.

    Returns:
        dict: Activity label -> single-character string.
    """
    base_code_point = ord('a')
    return {
        activity: chr(base_code_point + offset)
        for offset, activity in enumerate(unique_activities)
    }
# Create data frame with either the desired object type or the relations
def get_df(df_type, o_type, df_input):
    """Build a data frame of the objects or the relations of one object type.

    Args:
        df_type: ``"objects"`` or ``"relations"``; selects which attribute of
            ``df_input`` is used.
        o_type: Value of the ``'ocel:type'`` column to keep.
        df_input: Object exposing ``objects`` and ``relations`` attributes
            that ``pd.DataFrame`` accepts.

    Returns:
        pandas.DataFrame: For ``"objects"``, the rows of the requested type
        without the ``'ocel:type'`` column. For ``"relations"``, the columns
        ``'ocel:oid'``, ``'ocel:activity'`` and ``'ocel:timestamp'`` of the
        rows of the requested type, sorted by object id and then timestamp.

    Raises:
        ValueError: If ``df_type`` is neither ``"objects"`` nor
            ``"relations"``.
    """
    if df_type == "objects":
        df = pd.DataFrame(df_input.objects)
        df = df[df['ocel:type'] == o_type]
        # Non-inplace drop: the filtered frame is derived from the caller's
        # data, so return a new frame instead of mutating it in place.
        return df.drop(columns=['ocel:type'])

    if df_type == "relations":
        df = pd.DataFrame(df_input.relations)
        df = df[df['ocel:type'] == o_type]
        df = df.filter(items=['ocel:oid', 'ocel:activity', 'ocel:timestamp'])
        return df.sort_values(by=['ocel:oid', 'ocel:timestamp'],
                              ascending=[True, True])

    # Previously an unknown df_type fell through to `return df` with `df`
    # unbound; fail with an explicit message instead.
    raise ValueError(
        "df_type must be 'objects' or 'relations', got {!r}".format(df_type)
    )
# Store control flow of every object to a list
def find_cflow(relations_df, activity_dict):
    """Encode the control flow of every object as a string of letters.

    Consecutive rows that share the same ``'ocel:oid'`` are treated as one
    object, so ``relations_df`` is expected to be sorted by object id (and by
    timestamp within each object).

    Args:
        relations_df: Data frame with ``'ocel:oid'`` and ``'ocel:activity'``
            columns.
        activity_dict: Mapping from activity label to its one-character code.

    Returns:
        list: One string per run of equal object ids, in row order; each
        string is the concatenation of the codes of that object's activities.
        An empty frame yields an empty list.

    Raises:
        KeyError: If an activity is missing from ``activity_dict``.
    """
    from itertools import groupby

    oids = relations_df['ocel:oid'].tolist()
    activities = relations_df['ocel:activity'].tolist()

    # groupby() groups *adjacent* equal keys, which matches the row-by-row
    # scan without needing a sentinel value for "no previous object".
    return [
        "".join(activity_dict[activity] for _, activity in run)
        for _, run in groupby(zip(oids, activities), key=lambda pair: pair[0])
    ]
# Drop columns with NaN-only values
def drop_nan_columns(df):
    """Remove, in place, every column whose values are all NaN.

    Args:
        df: Data frame to clean; it is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, without its all-NaN columns.
    """
    row_count = len(df)
    all_nan_columns = [
        name for name in df.columns
        if df[name].isna().sum() == row_count
    ]
    if all_nan_columns:
        df.drop(all_nan_columns, axis=1, inplace=True)
    return df
# Create dict of NaN replacements for each column
def create_dict_nan(df):
    """Choose a NaN replacement value for every column of ``df``.

    Args:
        df: Data frame whose column dtypes are inspected.

    Returns:
        dict: Column name -> ``""`` for columns of dtype ``object`` and
        ``0.0`` for every other dtype.
    """
    return {
        column: ("" if dtype == "object" else 0.0)
        for column, dtype in df.dtypes.items()
    }
# Calculates the distance matrix
def distance_matrix(df, weights):
    """Average the weighted per-column distance matrices of ``df``.

    Each column is passed to ``calc_distance`` and the resulting matrices are
    summed and divided by the number of columns.

    Args:
        df: Data frame with one row per object and one column per attribute.
        weights: Sequence of column weights. Column ``i`` uses ``weights[i]``
            when that entry exists and ``weights[0]`` otherwise, so a
            single-element list applies the same weight to every column.

    Returns:
        numpy.ndarray: ``n x n`` float matrix, where ``n`` is the number of
        rows of ``df``. A frame without columns yields an all-zero matrix.
    """
    n = int(df.shape[0])
    column_count = int(df.shape[1])
    matrix = np.zeros((n, n), dtype=float)

    # No attributes to compare: avoid dividing by zero below.
    if column_count == 0:
        return matrix

    for i in range(column_count):
        # Per-column weight when one was supplied; otherwise fall back to
        # the first weight so callers passing a single weight keep working.
        weight_index = i if i < len(weights) else 0
        # Positional access always yields a Series, even if column names
        # repeat.
        single_column = df.iloc[:, i]
        matrix = matrix + calc_distance(single_column, weight_index, n, weights)

    return matrix / column_count
# Helper function for the distance matrix calculation
def calc_distance(df, weight_index, n, weights):
    """Compute the weighted pairwise distance matrix of a single column.

    The kind of distance is chosen from the first value of the column:

    * string values in a column not named ``"cflow"``: 0 for equal values,
      1 for different values;
    * numeric values: absolute difference, scaled so the largest is 1;
    * anything else (including the ``"cflow"`` column): ``lev`` of the two
      values, scaled so the largest is 1. ``lev`` is not defined in this
      function; presumably an edit-distance function imported elsewhere in
      the file -- TODO confirm.

    Args:
        df: A single column (pandas Series) exposing ``values`` and ``name``.
        weight_index: Index into ``weights`` of the weight for this column.
        n: Number of values to compare; expected to equal ``len(df)``.
        weights: Sequence of column weights.

    Returns:
        numpy.ndarray: Symmetric ``n x n`` float matrix with a zero diagonal,
        multiplied by ``weights[weight_index]``.
    """
    # Float matrix: an integer matrix would truncate fractional distances.
    matrix = np.zeros((n, n), dtype=float)
    if n == 0:
        # Nothing to compare; also avoids indexing an empty column below.
        return matrix

    npm = df.values
    first = npm[0]
    scale_to_unit = False

    if isinstance(first, str) and df.name != "cflow":
        # Categorical attribute: distance 1 when the values differ,
        # 0 when they are equal (the diagonal is therefore 0 as well).
        labels = np.asarray(npm[:n], dtype=object)
        matrix = (labels[:, None] != labels[None, :]).astype(float)
    elif isinstance(first, (int, float, np.integer, np.floating)):
        # Numeric attribute: absolute difference between every pair.
        numbers = np.asarray(npm[:n], dtype=float)
        matrix = np.abs(numbers[:, None] - numbers[None, :])
        scale_to_unit = True
    else:
        for j in range(n):
            first_elem = npm[j]
            # Progress output for long-running computations.
            if j % 100 == 0:
                print(j)
            for k in range(j + 1, n):
                matrix[j, k] = lev(first_elem, npm[k])
                matrix[k, j] = matrix[j, k]
        scale_to_unit = True

    if scale_to_unit:
        largest = matrix.max()
        # When all values are equal the largest distance is 0; dividing
        # would fill the matrix with NaN, so leave the zeros untouched.
        if largest > 0:
            matrix = matrix / largest

    return matrix * weights[weight_index]
# #
# End of stanislav's implementation #
#################################################################
\ No newline at end of file
return
res
\ No newline at end of file
code/main.py
View file @
b9deb382
...
...
@@ -57,61 +57,10 @@ f.print_w_ts('Reading ocel data...')
ocel = pm4py.read_ocel(p_ocel_file)
f.print_w_ts('
ocel
data
read
.
')
# res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
##### CALCULATING DISTANCE MATRIX #####
# Setting object type
obj_type = "items"
# List of all weights
weights = [1]
# Save unique activities to a list
unique_activities = f.get_unique_activities(sorted(pm4py.ocel_object_type_activities(ocel)[obj_type], reverse=False))
# Create hash-table of activities to represent each one as a UNICODE-character
activity_dict = f.map_activities_to_letter(unique_activities)
# Create data frame with desired object type
oid_filtered = f.get_df("objects", obj_type, ocel)
# Create data frame with all relations
relations_df = f.get_df("relations", obj_type, ocel)
# Store control flow of every object to a list
cflow_list = f.find_cflow(relations_df, activity_dict)
# Sorts data frame and appends corresponding control flow
oid_filtered.sort_values(by=['
ocel
:
oid
'], ascending=[True], inplace=True)
oid_filtered = oid_filtered.assign(cflow=cflow_list)
# Create list of NaN replacements for each column
na_values = f.create_dict_nan(oid_filtered)
# Drop columns with NaN-only values
oid_filtered = f.drop_nan_columns(oid_filtered)
# Replace the remaining NaN with empty string or 0
oid_filtered = oid_filtered.fillna(value=na_values)
# Create list of oids
list_oid = oid_filtered.filter(items=['
ocel
:
oid
']).values
# Drop oid because it is not needed for the matrix calculation
oid_filtered.drop(['
ocel
:
oid
'], axis = 1, inplace = True)
# Calculate distance matrix
res = f.distance_matrix(oid_filtered, weights)
#############################################
# res - contains distance matrix
# list_oid - contains list of oid
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
f.print_w_ts('
distance
matrix
calculated
:
')
f.print_w_ts(res)
f.print_w_ts('
duration
:
' + str(datetime.datetime.now() - start_ts))
print('
oid
list
:
')
print(list_oid)
quit()
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment