Commit 8e367827 authored by Stanislav
Browse files

clustering algorithm, empty ocel:oid fixed

parent 2f351671
......@@ -388,6 +388,7 @@ def ocel_get_object_distances_sy(ocel, object_type, weights_per_attribute) -> di
cflow_list = find_cflow(relations_df, activity_dict)
# Sorts data frame and appends corresponding control flow
oid_filtered = oid_filtered.loc[oid_filtered['ocel:oid'] != '']
oid_filtered.sort_values(by=['ocel:oid'], ascending=[True], inplace=True)
oid_filtered = oid_filtered.assign(cflow=cflow_list)
......
......@@ -10,16 +10,19 @@ import functions as f
import cluster
from sklearn import cluster as skcluster
from sklearn_extra import cluster as cluster_extra
from clusteval import clusteval
from scipy.spatial import distance
#
# PARAMETERS
p_ocel_file = './data/ocel.jsonocel'
p_mode = 'existence' # all | existence
p_object_type = 'items' # object-type-name, options depend on the data
p_object_type = 'packages' # object-type-name, options depend on the data
p_attr_weights = {
} # attributes that are not given in the data are not used
p_clusteval_mode = 'silhouette'
p_clustering_mode = 'kmeans' # optional, default: kmeans
p_max_cluster_count = 10 # np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
# p_max_cluster_count = 25 # np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
p_ocel_file_type = 'json' # json|xml
p_graph_file_type = 'svg' # svg|png
# END PARAMETERS
......@@ -33,8 +36,9 @@ print('p_mode: "' + str(p_mode) + '".')
print('p_object_type: "' + str(p_object_type) + '".')
print('p_attr_weights:')
print(p_attr_weights)
print('p_clusteval_mode: "' + str(p_clusteval_mode) + '".')
print('p_clustering_mode: "' + str(p_clustering_mode) + '".')
print('p_cluster_count: "' + str(p_max_cluster_count) + '".')
# print('p_max_cluster_count: "' + str(p_max_cluster_count) + '".')
print('p_ocel_file_type: "' + str(p_ocel_file_type) + '".')
print('p_graph_file_type: "' + str(p_graph_file_type) + '".')
print('-------------------------------------------------------')
......@@ -78,28 +82,30 @@ index_to_oid_map = res['index']
algo = algorithms[p_clustering_mode]
try:
max_cluster_count = int(p_max_cluster_count)
except:
max_cluster_count = 10
assert max_cluster_count >= 2, 'cluster_count needs to be at least 2'
assert max_cluster_count < len(index_to_oid_map), 'cluster_count needs to be less than the count of distinct objects in the ocel-data.'
# try:
# max_cluster_count = int(p_max_cluster_count)
# except:
# max_cluster_count = 25
# assert max_cluster_count >= 2, 'cluster_count needs to be at least 2'
# assert max_cluster_count < len(index_to_oid_map), 'cluster_count needs to be less than the count of distinct objects in the ocel-data.'
cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=max_cluster_count)
# cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=max_cluster_count)
results = clusteval(evaluate = p_clusteval_mode).fit(distance_matrix)
cluster_count = results['score']['clusters'].iloc[np.where(results['score']['score'] == results['score']['score'].max())[0][0]]
algo.set_params(n_clusters=cluster_count)
cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
df_clusters = pd.DataFrame({'cluster': cluster_res[1]}, index=index_to_oid_map) # creating dataframe
df_clusters.index.name='ocel:oid' # setting index name for joining
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
print(' OBJECT-CLUSTERING ')
print('clustering-technique:')
print(p_clustering_mode)
print('max cluster count given:')
print(max_cluster_count)
# print('max cluster count given:')
# print(max_cluster_count)
print('cluster-count used:')
print(cluster_count)
print('object-cluster-dataframe:')
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment