Commit 519abbb4 authored by SimonGlomb's avatar SimonGlomb
Browse files

temp state

parent 94e037c6
from flask import Flask, render_template, request
import os
from sympy import arg
import model
import main
# Flask application object; the @app.route decorators in this module register handlers on it.
app = Flask(__name__)
# Module-level list of parameter values: rebound by the /set_params handler
# and joined into a command line by run_code.
params = []
@app.route("/")
def hello_world():
form = model.MyForm()
......@@ -15,9 +14,6 @@ def hello_world():
@app.route("/run_code", methods=["GET"])
def run_code():
    """Run ``./code/main.py`` with the stored parameters and return a short reply.

    Every entry of the module-level ``params`` list is passed to the script
    as one separate command-line argument. The script is started without a
    shell, so characters such as ``;``, ``&`` or backticks inside a form
    value are handed over literally instead of being interpreted as shell
    syntax.

    Returns:
        A static HTML snippet. The exit status of the script is not
        inspected, matching the previous ``os.system`` call whose return
        value was discarded.
    """
    # Function-scope import keeps this handler self-contained.
    import subprocess

    # str() tolerates non-string entries (for example None for a form field
    # that was not submitted), which " ".join(params) would reject.
    command = ["python", "./code/main.py"]
    command.extend(str(value) for value in params)

    # check=False: a failing script must not turn into an HTTP 500 here.
    subprocess.run(command, check=False)
    return "<p>Back to Main Page</p>"
@app.route("/set_params", methods=["POST"])
......@@ -25,14 +21,15 @@ def set_params():
# Body of the /set_params POST handler: stores the submitted form values and
# answers with a confirmation or the re-rendered form.
global params
# NOTE(review): `ocel_file` is assigned here but never read in the visible code.
ocel_file = request.form.get("ocel_file")
form = model.MyForm()
# Values kept for run_code, which passes them on the command line.
# NOTE(review): attr_weights comes before object_type here, while the argparse
# parser in main.py declares p_object_type before p_attr_weights - confirm the order.
params = [form.ocel_file.data,
form.mode.data,
form.attr_weights.data,
form.object_type.data,
form.clustering_mode.data,
form.cluster_count.data,
form.ocel_file_type.data,
form.graph_file_type.data]
# The same field values are written directly onto the parameter globals of the `main` module.
main.p_ocel_file = form.ocel_file.data
main.p_mode = form.mode.data
main.p_attr_weights = form.attr_weights.data
main.p_object_type = form.object_type.data
main.p_clusteval_mode = form.clusteval_mode.data
main.p_clustering_mode = form.clustering_mode.data
main.p_cluster_count = form.cluster_count.data
main.p_ocel_file_type = form.ocel_file_type.data
main.p_graph_file_type = form.graph_file_type.data
# NOTE(review): the values above are stored before validate_on_submit() is
# consulted, so unvalidated input is kept as well - confirm this is intended.
if form.validate_on_submit():
return "<p>Parameters where set successfully.</p>"
# Reached when validate_on_submit() is falsy: show the form page again.
return render_template('start_page.html', form=form)
......
......@@ -10,230 +10,202 @@ import functions as f
import cluster
from sklearn import cluster as skcluster
import argparse
def _parse_attr_weights(text):
    """Convert a JSON object string such as '{"producer": 2}' into a dict.

    Raises:
        ValueError: if *text* is not valid JSON or is not a JSON object;
            argparse reports this as a usage error.
    """
    import json

    weights = json.loads(text)
    if not isinstance(weights, dict):
        raise ValueError("attribute weights must be a JSON object")
    return weights


# Command-line interface of the clustering script.
#
# Every parameter is an optional positional (nargs="?"): without nargs a
# positional argument is required and its default is never used. The defaults
# are plain scalars (not one-element lists) so that a defaulted value has the
# same shape as a value given on the command line.
CLI = argparse.ArgumentParser()
CLI.add_argument(
    "p_ocel_file",
    nargs="?",
    type=str,
    default="./data/ocel.jsonocel",
)
CLI.add_argument(
    "p_mode",
    nargs="?",
    type=str,
    default="existence",
)
CLI.add_argument(
    "p_object_type",
    nargs="?",
    type=str,
    default="products",
)
CLI.add_argument(
    "p_attr_weights",
    nargs="?",
    # dict() cannot be built from a command-line string; parse JSON instead.
    type=_parse_attr_weights,
    default={"producer": 2},
)
CLI.add_argument(
    "p_clustering_mode",
    nargs="?",
    type=str,
    default="kmeans",
)
CLI.add_argument(
    "p_cluster_count",
    nargs="?",
    type=int,
    default=3,
)
CLI.add_argument(
    "p_ocel_file_type",
    nargs="?",
    type=str,
    default="json",
)
CLI.add_argument(
    "p_graph_file_type",
    nargs="?",
    type=str,
    default="svg",  # default if nothing is provided
)
# Parse the command line of the running process with the CLI parser.
args = CLI.parse_args()
#
# PARAMETERS
# First group: values taken from the parsed command line.
p_ocel_file = args.p_ocel_file # "./data/ocel.jsonocel"
p_mode = args.p_mode # 'existence' # all | existence
p_object_type = args.p_object_type # 'products' # object-type-name, options depend on the data
p_attr_weights = args.p_attr_weights # {'producer': 2} # attributes that are not given in the data are not used
p_clustering_mode = args.p_clustering_mode # 'kmeans' # optional, default: kmeans
p_cluster_count = args.p_cluster_count # 3 # np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
p_ocel_file_type = args.p_ocel_file_type # 'json' # json|xml
p_graph_file_type = args.p_graph_file_type # 'svg' # svg|png
# Second group: literal values bound to the same names.
# NOTE(review): where both groups are present, these assignments overwrite the
# command-line values assigned above.
p_ocel_file = './data/ocel.jsonocel'
p_mode = 'existence' # all | existence
p_object_type = 'packages' # object-type-name, options depend on the data
p_attr_weights = {
} # attributes that are not given in the data are not used
# NOTE(review): p_clusteval_mode is assigned here and declared global in main(),
# but no visible pipeline code reads it - confirm where it is used.
p_clusteval_mode = 'silhouette'
p_clustering_mode = 'kmeans' # optional, default: kmeans
p_cluster_count = 3 # np.NaN # np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
p_ocel_file_type = 'json' # json|xml
p_graph_file_type = 'svg' # svg|png
# END PARAMETERS
#
# ---------------------------------------------------------------------------
# Script stage 1: echo the settings, validate them, read the log, compute the
# object distances and cluster the objects.
# ---------------------------------------------------------------------------
print('Program startet...')
print('-------------------------------------------------------')
print(' SETTINGS ')
print('p_ocel_file: "' + str(p_ocel_file) + '".')
print('p_mode: "' + str(p_mode) + '".')
print('p_object_type: "' + str(p_object_type) + '".')
print('p_attr_weights:')
print(p_attr_weights)
print('p_clustering_mode: "' + str(p_clustering_mode) + '".')
print('p_cluster_count: "' + str(p_cluster_count) + '".')
print('p_ocel_file_type: "' + str(p_ocel_file_type) + '".')
print('p_graph_file_type: "' + str(p_graph_file_type) + '".')
print('-------------------------------------------------------')

start_ts = datetime.datetime.now()

# non-data-based assertions
assert p_mode in c.MODES, 'selected mode not possible. Use either ''all'' or ''existence'''
assert exists(p_ocel_file), 'file does not exists'
algorithms = {
    'kmeans': skcluster.KMeans(),
    'spectral': skcluster.SpectralClustering(),
    'agglomerative': skcluster.AgglomerativeClustering(),
}
# str() keeps the message buildable even when the mode is not a string.
assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + str(p_clustering_mode) + '" is not availabel. Use on of: ' + ', '.join(algorithms.keys()) + '.'

ocel = pm4py.read_ocel(p_ocel_file)
# The result is used below as a mapping with the keys 'distances' and 'index'.
res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
# res_lh = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
# print(res['distances']-res_lh['distances'])
duration = datetime.datetime.now() - start_ts

print('-------------------------------------------------------')
print(' DISTANCE CALCULATION ')
print('distances:')
print(res['distances'])
print('first 10 indexes of map:')
print(res['index'][:10])
print('last 10 indexes of map:')
print(res['index'][-10:])
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')

start_ts = datetime.datetime.now()
distance_matrix = res['distances']
index_to_oid_map = res['index']
algo = algorithms[p_clustering_mode]

# A usable cluster count is an integer k with 2 <= k < number of objects.
# Anything else (NaN, None, text, out of range) falls back to automatic k
# determination. Only conversion errors are caught, so unrelated failures
# and KeyboardInterrupt are no longer swallowed, and the range check does
# not depend on `assert`, which is stripped under `python -O`.
try:
    cluster_count = int(p_cluster_count)
except (TypeError, ValueError, OverflowError):
    cluster_count = None
if cluster_count is None or not 2 <= cluster_count < len(index_to_oid_map):
    cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=math.floor(len(index_to_oid_map) / 2)+1)

algo.set_params(n_clusters=cluster_count)
cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
# cluster_res[1] is used as the per-object cluster label, aligned with index_to_oid_map.
df_clusters = pd.DataFrame({'cluster': cluster_res[1]}, index=index_to_oid_map)  # creating dataframe
df_clusters.index.name = 'ocel:oid'  # setting index name for joining
duration = datetime.datetime.now() - start_ts

print('-------------------------------------------------------')
print(' OBJECT-CLUSTERING ')
print('clustering-technique:')
print(p_clustering_mode)
print('given cluster-count:')
print(p_cluster_count)
print('cluster-count used:')
print(cluster_count)
print('object-cluster-dataframe:')
print(df_clusters)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')

start_ts = datetime.datetime.now()
# Relation rows of the selected object type, indexed by object id for the
# join; only the ocel:eid column is kept.
df_object_data = ocel.relations[ocel.relations['ocel:type'] == p_object_type].set_index('ocel:oid')[['ocel:eid']]
# Join object->cluster with object->event data, so every event gets its
# potential cluster candidates assigned.
df_event_object_clusters = df_clusters.join(df_object_data, how='right').reset_index()
# All ocel:eid->cluster combinations; a mode-specific aggregation
# (existence/all approach) follows.
df_cluster_candidates = df_event_object_clusters[['ocel:eid', 'cluster']].drop_duplicates()
# Aggregation function for the 'all' approach.
def get_single_value(series: pd.Series) -> int:
    """Collapse the cluster candidates of one event into a single cluster id.

    Args:
        series: Cluster labels of the objects related to one event.

    Returns:
        The cluster number (0..n) when all entries agree, -1 when more than
        one distinct value occurs, and -2 when there is no usable value
        (the series is empty or holds only a missing label).
    """
    distinct = series.unique()
    if len(distinct) > 1:
        return -1
    # A lone missing label (NaN, e.g. produced by a right join for an object
    # without a cluster) cannot be converted with int(); report it with the
    # same "no cluster" code as an empty series instead of raising ValueError.
    if len(distinct) == 0 or pd.isna(distinct[0]):
        return -2  # error, should not occur
    return int(distinct[0])  # first element (all are the same)
# ---------------------------------------------------------------------------
# Script stage 2: assign events to clusters, build one filtered log per
# cluster and store the logs next to the input file.
# ---------------------------------------------------------------------------
if p_mode == 'all':
    clusters = df_cluster_candidates.groupby('ocel:eid').agg({'cluster': get_single_value})
    clusters = clusters[clusters['cluster'] >= 0]  # drop the negative "no unique cluster" codes
elif p_mode == 'existence':
    clusters = df_cluster_candidates.set_index('ocel:eid')
# clusters now contains an event->cluster mapping.
# All-approach: clusters contains less or equal event ids compared to all events that have a relation to an instance of the specified object type.
#   Cause: one event can have relations to multiple instances (shopping multiple products) and they can occur in different clusters, hence the event is assigned to no cluster.
# Existence-approach: clusters contains more or equal event ids compared to all events that have a relation to an instance of the specified object type.
#   Cause: one event can have relations to multiple instances (shopping multiple products) and they can occur in different clusters, hence the event is assigned to multiple clusters.
clusters.reset_index(inplace=True)

ocel_clusters = []
groups = clusters.groupby('cluster')['ocel:eid'].apply(list).reset_index(name='event_list').set_index('cluster')
groups = groups.to_dict()['event_list']  # now: {<cluster_0>: [event_01, ..., event_m], ..., <cluster_n>: [event_234, ..., event_8992]}
duration = datetime.datetime.now() - start_ts

print('-------------------------------------------------------')
print(' EVENT-ASSIGNING ')
print('selected mode: ' + p_mode)
print('Event groups:')
for dict_key, group in groups.items():
    print('cluster "' + str(dict_key) + '":')
    len_group = len(group)
    print('-- element count: ' + str(len_group))
    print('-- first 5 elements:')
    for event_id in group[:5]:
        print('-- -- ' + event_id)
    print('-- last 5 elements:')
    for event_id in group[max(0, len_group - 5):]:
        print('-- -- ' + event_id)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')

start_ts = datetime.datetime.now()
# NOTE(review): groups[ii] raises KeyError when a cluster ended up without
# any event - confirm whether that can happen for the data in use.
res = [f.ocel_filter_by_events(ocel, groups[ii]) for ii in range(cluster_count)]
duration = datetime.datetime.now() - start_ts

print('-------------------------------------------------------')
print(' CREATING SEPARATE OCELs ')
print('ocels:')
print(res)
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')

start_ts = datetime.datetime.now()
print('-------------------------------------------------------')
print(' STORING CLUSTERED-OCEL-FILES ')
# os.path.join keeps the target relative when the input path has no directory
# part; plain concatenation would yield '/clustered_ocel_files' at the
# filesystem root in that case.
directory = os.path.join(os.path.dirname(p_ocel_file), 'clustered_ocel_files')
os.makedirs(directory, exist_ok=True)
appendix_len = len(str(cluster_count))
for number, cluster_ocel in enumerate(res, start=1):
    label = str(number).rjust(appendix_len)
    filename = os.path.join(directory, 'cluster_' + label + '.' + p_ocel_file_type + 'ocel')
    # Write the filtered log of this cluster; the unfiltered `ocel` was
    # written here before, which made every output file identical.
    pm4py.write_ocel(cluster_ocel, filename)
    print(label + '/' + str(cluster_count) + '"' + filename + '" stored.')
duration = datetime.datetime.now() - start_ts
print('--------------------------')
print('duration: ' + str(duration))
print('-------------------------------------------------------')
# NOTE(review): quit() ends the interpreter here, so no statement after this
# point in the module is reached when the script part runs.
quit()
def main():
    """Cluster the objects of one type in an OCEL log and store one log per cluster.

    The function reads the module-level ``p_*`` parameters and runs these
    stages, printing a timed report after each one:

    1. Validate the parameters and read the log with ``pm4py.read_ocel``.
    2. Compute the object distances (``f.ocel_get_object_distances_sy``).
    3. Cluster the objects with the selected scikit-learn algorithm; when
       ``p_cluster_count`` is not an integer k with 2 <= k < number of
       objects, k is chosen by ``cluster.determine_optimal_k``.
    4. Assign events to clusters according to ``p_mode`` ('all' or
       'existence').
    5. Build one filtered log per cluster and write it to the directory
       ``clustered_ocel_files`` next to the input file.

    Raises:
        AssertionError: if the mode, the input file or the clustering mode
            is not valid.
        KeyError: if a cluster number has no events assigned (lookup in the
            event groups).
    """
    global p_ocel_file
    global p_mode
    global p_object_type
    global p_attr_weights
    global p_clusteval_mode
    global p_clustering_mode
    global p_cluster_count
    global p_ocel_file_type
    global p_graph_file_type

    print('Program startet...')
    print('-------------------------------------------------------')
    print(' SETTINGS ')
    print('p_ocel_file: "' + str(p_ocel_file) + '".')
    print('p_mode: "' + str(p_mode) + '".')
    print('p_object_type: "' + str(p_object_type) + '".')
    print('p_attr_weights:')
    print(p_attr_weights)
    print('p_clustering_mode: "' + str(p_clustering_mode) + '".')
    print('p_cluster_count: "' + str(p_cluster_count) + '".')
    print('p_ocel_file_type: "' + str(p_ocel_file_type) + '".')
    print('p_graph_file_type: "' + str(p_graph_file_type) + '".')
    print('-------------------------------------------------------')

    start_ts = datetime.datetime.now()

    # non-data-based assertions
    assert p_mode in c.MODES, 'selected mode not possible. Use either ''all'' or ''existence'''
    assert exists(p_ocel_file), 'file does not exists'
    algorithms = {
        'kmeans': skcluster.KMeans(),
        'spectral': skcluster.SpectralClustering(),
        'agglomerative': skcluster.AgglomerativeClustering(),
    }
    # str() keeps the message buildable even when the mode is not a string.
    assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + str(p_clustering_mode) + '" is not availabel. Use on of: ' + ', '.join(algorithms.keys()) + '.'

    ocel = pm4py.read_ocel(p_ocel_file)
    # The result is used below as a mapping with the keys 'distances' and 'index'.
    res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
    duration = datetime.datetime.now() - start_ts

    print('-------------------------------------------------------')
    print(' DISTANCE CALCULATION ')
    print('distances:')
    print(res['distances'])
    print('first 10 indexes of map:')
    print(res['index'][:10])
    print('last 10 indexes of map:')
    print(res['index'][-10:])
    print('--------------------------')
    print('duration: ' + str(duration))
    print('-------------------------------------------------------')

    start_ts = datetime.datetime.now()
    distance_matrix = res['distances']
    index_to_oid_map = res['index']
    algo = algorithms[p_clustering_mode]

    # Only conversion errors trigger the fallback to automatic k
    # determination; a bare `except:` would also hide unrelated failures and
    # KeyboardInterrupt. The explicit range check replaces `assert`, which
    # is stripped under `python -O`.
    try:
        cluster_count = int(p_cluster_count)
    except (TypeError, ValueError, OverflowError):
        cluster_count = None
    if cluster_count is None or not 2 <= cluster_count < len(index_to_oid_map):
        cluster_count = cluster.determine_optimal_k(distance_matrix, algorithm=algo, k_max=math.floor(len(index_to_oid_map) / 2)+1)

    algo.set_params(n_clusters=cluster_count)
    cluster_res = cluster.cluster_matrix(distance_matrix, algorithm=algo)
    # cluster_res[1] is used as the per-object cluster label, aligned with index_to_oid_map.
    df_clusters = pd.DataFrame({'cluster': cluster_res[1]}, index=index_to_oid_map)  # creating dataframe
    df_clusters.index.name = 'ocel:oid'  # setting index name for joining
    duration = datetime.datetime.now() - start_ts

    print('-------------------------------------------------------')
    print(' OBJECT-CLUSTERING ')
    print('clustering-technique:')
    print(p_clustering_mode)
    print('given cluster-count:')
    print(p_cluster_count)
    print('cluster-count used:')
    print(cluster_count)
    print('object-cluster-dataframe:')
    print(df_clusters)
    print('--------------------------')
    print('duration: ' + str(duration))
    print('-------------------------------------------------------')

    start_ts = datetime.datetime.now()
    # Relation rows of the selected object type, indexed by object id for
    # the join; only the ocel:eid column is kept.
    df_object_data = ocel.relations[ocel.relations['ocel:type'] == p_object_type].set_index('ocel:oid')[['ocel:eid']]
    # Join object->cluster with object->event data, so every event gets its
    # potential cluster candidates assigned.
    df_event_object_clusters = df_clusters.join(df_object_data, how='right').reset_index()
    # All ocel:eid->cluster combinations; the mode-specific aggregation
    # (existence/all approach) follows.
    df_cluster_candidates = df_event_object_clusters[['ocel:eid', 'cluster']].drop_duplicates()

    def get_single_value(series: pd.Series) -> int:
        """Aggregate for the 'all' approach.

        Returns the cluster number (0..n), -1 if the cluster is not unique,
        and -2 if there is no usable value (empty or only a missing label).
        """
        distinct = series.unique()
        if len(distinct) > 1:
            return -1
        # int(NaN) would raise ValueError; a lone missing label (possible
        # after the right join above) is reported like an empty series.
        if len(distinct) == 0 or pd.isna(distinct[0]):
            return -2  # error, should not occur
        return int(distinct[0])  # first element (all are the same)

    if p_mode == 'all':
        clusters = df_cluster_candidates.groupby('ocel:eid').agg({'cluster': get_single_value})
        clusters = clusters[clusters['cluster'] >= 0]  # drop the negative "no unique cluster" codes
    elif p_mode == 'existence':
        clusters = df_cluster_candidates.set_index('ocel:eid')
    # clusters now contains an event->cluster mapping.
    # All-approach: clusters contains less or equal event ids compared to all events that have a relation to an instance of the specified object type.
    #   Cause: one event can have relations to multiple instances (shopping multiple products) and they can occur in different clusters, hence the event is assigned to no cluster.
    # Existence-approach: clusters contains more or equal event ids compared to all events that have a relation to an instance of the specified object type.
    #   Cause: one event can have relations to multiple instances (shopping multiple products) and they can occur in different clusters, hence the event is assigned to multiple clusters.
    clusters.reset_index(inplace=True)

    groups = clusters.groupby('cluster')['ocel:eid'].apply(list).reset_index(name='event_list').set_index('cluster')
    groups = groups.to_dict()['event_list']  # now: {<cluster_0>: [event_01, ..., event_m], ..., <cluster_n>: [event_234, ..., event_8992]}
    duration = datetime.datetime.now() - start_ts

    print('-------------------------------------------------------')
    print(' EVENT-ASSIGNING ')
    print('selected mode: ' + p_mode)
    print('Event groups:')
    for dict_key, group in groups.items():
        print('cluster "' + str(dict_key) + '":')
        len_group = len(group)
        print('-- element count: ' + str(len_group))
        print('-- first 5 elements:')
        for event_id in group[:5]:
            print('-- -- ' + event_id)
        print('-- last 5 elements:')
        for event_id in group[max(0, len_group - 5):]:
            print('-- -- ' + event_id)
    print('--------------------------')
    print('duration: ' + str(duration))
    print('-------------------------------------------------------')

    start_ts = datetime.datetime.now()
    # NOTE(review): groups[ii] raises KeyError when a cluster ended up
    # without any event - confirm whether that can happen for the data in use.
    res = [f.ocel_filter_by_events(ocel, groups[ii]) for ii in range(cluster_count)]
    duration = datetime.datetime.now() - start_ts

    print('-------------------------------------------------------')
    print(' CREATING SEPARATE OCELs ')
    print('ocels:')
    print(res)
    print('--------------------------')
    print('duration: ' + str(duration))
    print('-------------------------------------------------------')

    start_ts = datetime.datetime.now()
    print('-------------------------------------------------------')
    print(' STORING CLUSTERED-OCEL-FILES ')
    # os.path.join keeps the target relative when the input path has no
    # directory part; plain concatenation would yield
    # '/clustered_ocel_files' at the filesystem root in that case.
    directory = os.path.join(os.path.dirname(p_ocel_file), 'clustered_ocel_files')
    os.makedirs(directory, exist_ok=True)
    appendix_len = len(str(cluster_count))
    for number, cluster_ocel in enumerate(res, start=1):
        label = str(number).rjust(appendix_len)
        filename = os.path.join(directory, 'cluster_' + label + '.' + p_ocel_file_type + 'ocel')
        # Write the filtered log of this cluster; the unfiltered `ocel` was
        # written here before, which made every output file identical.
        pm4py.write_ocel(cluster_ocel, filename)
        print(label + '/' + str(cluster_count) + '"' + filename + '" stored.')
    duration = datetime.datetime.now() - start_ts
    print('--------------------------')
    print('duration: ' + str(duration))
    print('-------------------------------------------------------')
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ =="__main__":
main()
\ No newline at end of file
......@@ -7,6 +7,7 @@ class MyForm(FlaskForm):
mode = StringField('mode', validators=[DataRequired()])
object_type = StringField('object_type', validators=[DataRequired()])
attr_weights = StringField('attr_weights', validators=[DataRequired()])
clusteval_mode = StringField('clusteval_mode')
clustering_mode = StringField('clustering_mode')
cluster_count = StringField('cluster_count')
ocel_file_type = StringField('ocel_file_type')
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment