Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Stanislav Yuliyanov
oc-dfg-clustering
Commits
8e367827
Commit
8e367827
authored
May 24, 2022
by
Stanislav
Browse files
clustering algorithm, empty oid:id fixed
parent
2f351671
Changes
2
Hide whitespace changes
Inline
Side-by-side
code/functions.py
View file @
8e367827
...
...
@@ -388,6 +388,7 @@ def ocel_get_object_distances_sy(ocel, object_type, weights_per_attribute) -> di
cflow_list
=
find_cflow
(
relations_df
,
activity_dict
)
# Sorts data frame and appends corresponding control flow
oid_filtered
=
oid_filtered
.
loc
[
oid_filtered
[
'ocel:oid'
]
!=
''
]
oid_filtered
.
sort_values
(
by
=
[
'ocel:oid'
],
ascending
=
[
True
],
inplace
=
True
)
oid_filtered
=
oid_filtered
.
assign
(
cflow
=
cflow_list
)
...
...
code/main.py
View file @
8e367827
...
...
@@ -10,16 +10,19 @@ import functions as f
import
cluster
from
sklearn
import
cluster
as
skcluster
from
sklearn_extra
import
cluster
as
cluster_extra
from
clusteval
import
clusteval
from
scipy.spatial
import
distance
#
# PARAMETERS
p_ocel_file
=
'./data/ocel.jsonocel'
p_mode
=
'existence'
# all | existence
p_object_type
=
'
item
s'
# object-type-name, options depend on the data
p_object_type
=
'
package
s'
# object-type-name, options depend on the data
p_attr_weights
=
{
}
# attributes that are not given in the data are not used
p_clusteval_mode
=
'silhouette'
p_clustering_mode
=
'kmeans'
# optional, default: kmeans
p_max_cluster_count
=
10
# np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
#
p_max_cluster_count =
25
# np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
p_ocel_file_type
=
'json'
# json|xml
p_graph_file_type
=
'svg'
# svg|png
# END PARAMETERS
...
...
@@ -33,8 +36,9 @@ print('p_mode: "' + str(p_mode) + '".')
print
(
'p_object_type: "'
+
str
(
p_object_type
)
+
'".'
)
print
(
'p_attr_weights:'
)
print
(
p_attr_weights
)
print
(
'p_clusteval_mode: "'
+
str
(
p_clusteval_mode
)
+
'".'
)
print
(
'p_clustering_mode: "'
+
str
(
p_clustering_mode
)
+
'".'
)
print
(
'p_cluster_count: "'
+
str
(
p_max_cluster_count
)
+
'".'
)
#
print('p_
max_
cluster_count: "' + str(p_max_cluster_count) + '".')
print
(
'p_ocel_file_type: "'
+
str
(
p_ocel_file_type
)
+
'".'
)
print
(
'p_graph_file_type: "'
+
str
(
p_graph_file_type
)
+
'".'
)
print
(
'-------------------------------------------------------'
)
...
...
@@ -78,28 +82,30 @@ index_to_oid_map = res['index']
algo = algorithms[p_clustering_mode]
try:
max_cluster_count = int(p_max_cluster_count)
except:
max_cluster_count =
10
assert max_cluster_count >= 2, '
cluster_count
needs
to
be
at
least
2
'
assert max_cluster_count < len(index_to_oid_map), '
cluster_count
needs
to
be
less
than
the
count
of
distinct
objects
in
the
ocel
-
data
.
'
#
try:
#
max_cluster_count = int(p_max_cluster_count)
#
except:
#
max_cluster_count =
25
#
assert max_cluster_count >= 2, '
cluster_count
needs
to
be
at
least
2
'
#
assert max_cluster_count < len(index_to_oid_map), '
cluster_count
needs
to
be
less
than
the
count
of
distinct
objects
in
the
ocel
-
data
.
'
cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=max_cluster_count)
# cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=max_cluster_count)
results = clusteval(evaluate = p_clusteval_mode).fit(distance_matrix)
cluster_count = results['
score
']['
clusters
'].iloc[np.where(results['
score
']['
score
'] == results['
score
']['
score
'].max())[0][0]]
algo.set_params(n_clusters=cluster_count)
cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
df_clusters = pd.DataFrame({'
cluster
': cluster_res[1]}, index=index_to_oid_map) # creating dataframe
df_clusters.index.name='
ocel
:
oid
' # setting index name for joining
duration = datetime.datetime.now() - start_ts
print('
-------------------------------------------------------
')
print('
OBJECT
-
CLUSTERING
')
print('
clustering
-
technique
:
')
print(p_clustering_mode)
print('
max
cluster
count
given
:
')
print(max_cluster_count)
#
print('
max
cluster
count
given
:
')
#
print(max_cluster_count)
print('
cluster
-
count
used
:
')
print(cluster_count)
print('
object
-
cluster
-
dataframe
:
')
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment