Commit 7dd9701e authored by Stanislav Yuliyanov's avatar Stanislav Yuliyanov
Browse files

Merge branch 'flask-only-api' into 'main'

Flask only api

See merge request !3
parents 45926e5a 0966d9d7
......@@ -12,4 +12,6 @@ documentation/*/*.blg
data/*
legacy.md
**/__pycache__
data/clustered_ocel_files/*
\ No newline at end of file
data/clustered_ocel_files/*
venv
uploads
\ No newline at end of file
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
This diff is collapsed.
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (oc-dfg-clustering)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/oc-dfg-clustering.iml" filepath="$PROJECT_DIR$/.idea/oc-dfg-clustering.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="Flask">
<option name="enabled" value="true" />
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/code/templates" />
</list>
</option>
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
# oc-dfg-clustering
Implementation of two clustering techniques, 'Existential' and 'All', to Object-Centric Event Logs (OCEL) to reduce the complexity of Object-Centric Direct Follow Graphs (OC-DFG).
Small library to cluster objects in OCEL files by their control flow and their attributes. To do so, pairwise distances are calculated, and clustering is then applied to the corresponding distance matrix.
# Install the App
# Library
For using the library import the file from code/ocel_clustering/main. There you may use the method ocel_cluster_by_objects. Fill in the arguments according to the in-code documentation. If you would like to play around with the code try the CLI version.
# CLI
For more information execute the following as the cli-version is self documenting:
For examples how to use the CLI check out the file code/sample-cli-execution.sh, there are some examples.
```powershell
python ./code/cli.py --help
```
# REST-API
For using the REST-API please execute the file json_api.py.
Then a local server starts and you can use the following API-Endpoints. You may use tools like Postman to test the API. Many functions are also easily accessible via your browser.
## ocel-upload
### GET
```http
GET http://127.0.0.1:5000/ocel-upload
```
Returns a json with a list of already uploaded files.
### POST
```http
POST http://127.0.0.1:5000/ocel-upload
```
Using the key field "file" (the default key used by an HTML file-upload form), a new file is uploaded to the system. If the file already exists an error occurs.
## ocel-upload/\<filename\>
### DELETE
```http
DELETE http://127.0.0.1:5000/ocel-upload/<filename>
```
Deletes the file with the name 'filename'.
## ocel-cluster/\<filename\>
### POST
```http
POST http://127.0.0.1:5000/ocel-cluster/<filename>
```
Applies the core clustering algorithm for the file described via the filename.
Parameters of the algorithm can be described in the body of the request as json. The following is an example.
```json
{
"data": {
"object-type": "customers",
"mode": "all",
"max-cluster-count": 4,
"attr-weights": {
"cflow": 2
}
}
}
```
A list of all possible parameters:
- _object-type_: type of object to apply the clustering algorithm on.
- _mode_: either "all" or "existence" which defines how the assignment of events to object clusters is done.
- _attr-weights_: A JSON object to define the weights of attributes (including the pseudo-attribute "cflow", which describes the control flow of events corresponding to an object in the OCEL file).
- _cluster-algorithm_: Defines the clustering technique to be used. Default is "kmeans"; "agglomerative" and "spectral" are also possible.
- _max-cluster-count_: Defines the maximum cluster count. This value can be -1 for auto-maximum (which is the default value) or anything greater or equal to two. ATTENTION: This parameter highly affects the runtime! Be careful/patient with using high values.
- _cluster-count_: If it is set to 2 or greater (default is -1, which means automatic), the cluster count is fixed and max-cluster-count is ignored.
## ocel-cluster/\<filename\>/clusters
### GET
```http
GET http://127.0.0.1:5000/ocel-cluster/<filename>/clusters
```
Returns a list of the found cluster-ids.
## ocel-cluster/\<filename\>/clusters/\<number\>
### GET
```http
GET http://127.0.0.1:5000/ocel-cluster/<filename>/clusters/<number>
```
Returns the ocel-file corresponding to the cluster with the index "number".
## ocel-cluster/\<filename\>/clusters/\<number\>/graph
### GET
```http
GET http://127.0.0.1:5000/ocel-cluster/<filename>/clusters/<number>/graph?activity_threshold=0&edge_threshold=0
```
Creates a graph based on the OCEL file and returns it as a download. The parameters define minimum thresholds for activities and edges.
If the parameters are omitted, their values are interpreted as 0.
# Docker
## Install the Docker-App
This command will load the necessary docker-image, mount your directories and start the container.
- run .\install_app.bat from the oc-dfg-clustering directory
# Start the App
## Start the Docker-App
This command will only work if you already installed the app. Then it will just start the container and bind the CLI to the program.
- Start your docker daemon.
- run .\start_app.bat from the oc-dfg-clustering directory
\ No newline at end of file
import datetime
import os
from os.path import exists
import pm4py
import ocel_clustering.main as ocel_clustering
import json
import argparse
##########################################################################################################
# Module for executing the ocel_clustering algorithm as a CLI version.
# Parameters are documented; execute with --help for detailed information on these parameters.
##########################################################################################################
###############
## CONSTANTS ##
###############
DEFAULT_TARGET_DIR = 'ocels_clustered' # default target directory for resulting ocel-files
CLUSTER_MODES = ['existence', 'all']  # allowed values for the positional "mode" argument
####################################################
## DEFINING, VALIDATING AND READING CLI-ARGUMENTS ##
####################################################
CLI = argparse.ArgumentParser()
# Positional arguments: input file, object type to cluster, event-assignment mode.
CLI.add_argument(
    "ocel_file",
    help='The ocel-file to apply the clustering on. File type and suffix has to be either ".jsonocel" or ".xmlocel".',
    type=str
)
CLI.add_argument(
    "object_type",
    help='The type of objects from the ocel file which should be clustered. The object type needs to occur in the given ocel file.',
    type=str
)
CLI.add_argument(
    "mode",
    choices=CLUSTER_MODES,
    help='The mode to use for event assignments (either "all" or "existence").',
    type=str
)
# Optional: per-attribute weights as a flat JSON object string (parsed later).
CLI.add_argument(
    "--attr_weights",
    type=str,
    default="{}",
    help='The weights for every attribute of the object instances to use for the internal distance calculation. Needs to be a flat attribute->weight json-string. If you would like to set the weight of the internally computed control-flow distance use the following attribute-name: "' + ocel_clustering.OCEL_COL_NAME_CONTROL_FLOW + '". If no weights are set all weights are set equally.',
    required=False
)
CLI.add_argument(
    "--clustering_mode",
    type=str,
    choices=['kmeans', 'spectral', 'agglomerative'],
    default='kmeans',
    help='The algorithm to use to cluster the data internally. Default value is kmeans.',
    required=False
)
def validate_cluster_count(val: str) -> int:
    """argparse type-callable for cluster-count options.

    'auto'/'automatic' map to -1 (the internal marker for automatic
    selection); every other value must parse as an integer >= 2.
    """
    if val in ('auto', 'automatic'):
        # -1 is understood downstream as "choose the count automatically"
        return -1
    count = int(val)
    if count < 2:
        raise argparse.ArgumentTypeError('Parameter needs to be at least 2. Less clusters are not possible. You can also use "auto" or "automatic".')
    return count
CLI.add_argument(
    "--max_cluster_count",
    type=validate_cluster_count,
    default='auto',
    help='The max count of clusters to create. The software uses the optimum between 2 and the given max_cluster_count. Default is the number of distinct items of the given type (attention!). Default value is auto.',
    required=False
)
CLI.add_argument(
    "--cluster_count",
    type=validate_cluster_count,
    default='auto',
    # required=False added for consistency with the sibling optional arguments
    # (optionals are not required by default, so behavior is unchanged).
    required=False,
    help='The count of clusters to create. If the value is "auto" or "automatic" the software uses the optimum between 2 and the given max_cluster_count. Default value is auto.'
)
# Output-format options.
CLI.add_argument(
    "--ocel_file_type",
    type=str,
    choices=['json', 'xml'],
    default='json',
    help='Defines the type of output file. Default value is json.',
    required=False
)
CLI.add_argument(
    "--graph_file_type",
    type=str,
    choices=['svg', 'png', 'none'],
    default='none',
    help='Defines the type of the exported graph file type. If not given or set to none this file is not generated. Default value is none',
    required=False
)
# Thresholds forwarded to graph rendering (only relevant when a graph is exported).
CLI.add_argument(
    "--graph_activity_threshold",
    type=int,
    default=0,
    help='Defines the activity threshold for a generated graph (only if --graph_file_type is not "none"). Default value is 0.',
    required=False
)
CLI.add_argument(
    "--graph_edge_threshold",
    type=int,
    default=0,
    help='Defines the edge threshold for a generated graph (only if --graph_file_type is not "none"). Default value is 0.',
    required=False
)
def validate_directory(dir: str) -> str:
    """argparse type-callable: resolve the target directory to an absolute path.

    An absolute path is returned unchanged; a relative path (with leading and
    trailing '/' stripped) is resolved against the current working directory.

    BUGFIX: previously an absolute path had its leading '/' stripped and was
    re-rooted under the CWD, contradicting the CLI help text which promises
    support for "a full path".
    """
    if os.path.isabs(dir):
        return dir
    # strip('/') removes leading and trailing slashes, matching the old
    # while-loop trimming for relative inputs
    res = dir.strip('/')
    return str(os.getcwd() + '/' + res)
CLI.add_argument(
    "--target_dir",
    type=validate_directory,
    default=DEFAULT_TARGET_DIR,
    required=False,
    help='Defines the target directory relative to the current directory or a full path. Default value is: ' + DEFAULT_TARGET_DIR
)
args = CLI.parse_args()
########################################
## DATA-BASED-VALIDATING OF ARGUMENTS ##
########################################
# Explicit checks instead of assert: asserts are stripped when Python runs
# with -O, which would silently skip input validation.
if not exists(args.ocel_file):
    raise SystemExit('Given file does not exist.')
# Reading file to OCEL-structure and possible object types
ocel = pm4py.read_ocel(args.ocel_file)
ocel_object_types = list(ocel.objects['ocel:type'].unique())
# Validating if the given args.object_type occurs in the data
if args.object_type not in ocel_object_types:
    raise SystemExit('Given type "' + args.object_type + '" is not present in the data please use one of: "' + '", "'.join(ocel_object_types) + '".')
# Parsing args.attr_weights as JSON; a json.JSONDecodeError simply propagates
# to the user (the previous try/except only re-raised the same exception).
args.attr_weights = json.loads(args.attr_weights)
# Checking if the clustering mode is supported by the library
if args.clustering_mode not in ocel_clustering.OCEL_CLUSTER_ALGORITHMS:
    raise SystemExit('The given clustering mode "' + args.clustering_mode + '" is not available. Use one of: ' + ', '.join(ocel_clustering.OCEL_CLUSTER_ALGORITHMS) + '.')
# Echo the effective settings so a run is reproducible from its log output.
print('-------------------------------------------------------')
print(' SETTINGS ')
print('ocel_file: ' + str(args.ocel_file))
print('mode: ' + str(args.mode))
print('object_type: ' + str(args.object_type))
print('attr_weights:')
print(args.attr_weights)
print('clustering_mode: ' + str(args.clustering_mode))
print('cluster_count: ' + str(args.cluster_count))
print('max_cluster_count: ' + str(args.max_cluster_count))
print('target_dir: ' + str(args.target_dir))
print('ocel_file_type: ' + str(args.ocel_file_type))
print('graph_file_type: ' + str(args.graph_file_type))
print('graph_activity_threshold: ' + str(args.graph_activity_threshold))
print('graph_edge_threshold: ' + str(args.graph_edge_threshold))
print('-------------------------------------------------------')
#################################
## PREPARING ATTRIBUTE WEIGHTS ##
#################################
# getting the default full attribute definition (one dict per attribute)
attr_def = ocel_clustering.ocel_get_attr_def(ocel)
def set_weights_function(attr_def: dict) -> dict:
    """Overwrite the 'weight' entry of one attribute definition with the
    user-supplied weight from args.attr_weights, when one was given.

    Mutates and returns the given dict (intended for map() over all defs).
    """
    if attr_def['name'] in args.attr_weights:
        attr_def['weight'] = args.attr_weights[attr_def['name']]
    return attr_def
# Apply the user-supplied weights onto the default attribute definitions.
attr_def = list(map(set_weights_function, attr_def))
# Weight of the pseudo-attribute "control flow"; defaults to 1.0 when not given.
control_flow_weight = args.attr_weights.get(ocel_clustering.OCEL_COL_NAME_CONTROL_FLOW, 1.0)
##############################
## EXECUTING CORE ALGORITHM ##
##############################
print('calculating...')
start_ts = datetime.datetime.now()
# Core call: clusters the objects of the chosen type and returns a list of
# OCELs, one per cluster.
res = ocel_clustering.ocel_cluster_by_objects(
    ocel=ocel,
    object_type = args.object_type,
    event_assignment_mode = args.mode,
    attr_def = attr_def,
    clustering_algorithm = args.clustering_mode,
    max_cluster_count = args.max_cluster_count,
    cluster_count = args.cluster_count,
    control_flow_active = control_flow_weight != 0.0,  # weight 0 disables the control-flow distance
    control_flow_weight = control_flow_weight
)
print('duration: ' + str(datetime.datetime.now() - start_ts))
##########################
## STORING OUTPUT FILES ##
##########################
print('-------------------------------------------------------')
print(' STORING CLUSTERED-OCEL-FILES ')
# Create the target directory on first use.
if not os.path.exists(args.target_dir):
    os.makedirs(args.target_dir)
# Zero-pad the file counter so filenames sort naturally.
appendix_len = len(str(len(res)))
for ii in range(0, len(res)):
    filename = args.target_dir + '/cluster_' + str(ii+1).rjust(appendix_len, '0') + '.' + args.ocel_file_type + 'ocel'
    pm4py.write_ocel(res[ii], filename)
    print(str(ii+1).rjust(appendix_len, '0') + '/' + str(len(res)) + ' "' + filename + '" stored.')
####################################################
## OPTIONAL DISCOVERING OCDFGS AND STORING IMAGES ##
####################################################
if args.graph_file_type != 'none':
    print('-------------------------------------------------------')
    print(' GENERATING AND STORING GRAPHS ')
    for ii in range(0, len(res)):
        filename = args.target_dir + '/cluster_' + str(ii+1).rjust(appendix_len, '0') + '_ocdfg.' + args.graph_file_type
        # Discover the object-centric DFG per cluster and render it with the
        # user-given activity/edge thresholds.
        ocdfg = pm4py.discover_ocdfg(res[ii])
        pm4py.save_vis_ocdfg(ocdfg, filename, act_threshold=args.graph_activity_threshold, edge_threshold=args.graph_edge_threshold)
        print(str(ii+1).rjust(appendix_len, '0') + '/' + str(len(res)) + ' "' + filename + '" stored.')
quit()  # NOTE(review): quit() depends on the site module; sys.exit() would be more robust
from bisect import bisect_left
from sklearn import cluster
from sklearn_extra import cluster as cluster_extra
from sklearn.metrics import silhouette_score
#matrix = np.array([[0,0.7,0.9], [0.7,0,0.2], [0.9,0.2,0]])
# The list_of_clusters elements which are returned in a list are ordered by index.
def cluster_matrix(matrix, algorithm=None):
    """Fit a clustering estimator on a (distance) matrix and group row indices
    by cluster label.

    :param matrix: 2d array-like handed to the estimator's fit().
    :param algorithm: sklearn-style estimator exposing fit(), labels_ and
        n_clusters. Defaults to KMeans(n_clusters=2, random_state=0).
    :return: (list_of_clusters, labels) where list_of_clusters[c] holds the
        row indices assigned to cluster c, in ascending order.

    BUGFIX: the default was previously a single shared KMeans *instance*
    (mutable default argument); fit() mutates the estimator, so all callers
    relying on the default shared state. A fresh instance is now created per
    call.
    """
    if algorithm is None:
        algorithm = cluster.KMeans(n_clusters=2, random_state=0)
    model = algorithm.fit(matrix)
    labels = model.labels_
    # one (initially empty) index bucket per cluster
    list_of_clusters = [[] for _ in range(model.n_clusters)]
    for row_index, label in enumerate(labels):
        list_of_clusters[label].append(row_index)
    return list_of_clusters, labels
# k = number of clusters
# m = len(list_of_clusters[i]) for all i in [k]
# n = len(items)
# creating a set: O(n), thus not worth creating it for lookup
# O(k*n*m)
# For all-assigning, only one cluster is possible
def assign_event_to_cluster_all(items: list[int], list_of_clusters):
    """Return the first cluster that contains every given item, or False
    when no cluster contains them all."""
    for candidate in list_of_clusters:
        if all(item in candidate for item in items):
            return candidate
    return False
# We use the property that the cluster-lists are ordered
# Thus O(k*n*log(m))
# For exist-assigning, we will return all suitable clusters
# time complexity of "cluster not in suitable_clusters" is negligible, because size of suitable_clusters is assumed to be small ( <k )
def assign_event_to_cluster_exists(items: list[int], list_of_clusters):
    """Return every cluster containing at least one of the given items, or
    False when no cluster matches. Cluster lists must be sorted, because
    membership is tested via binary search (bin_search)."""
    matches = []
    for candidate in list_of_clusters:
        if candidate in matches:
            continue  # already collected; keep the result duplicate-free
        if any(bin_search(candidate, item) for item in items):
            matches.append(candidate)
    return matches if matches else False
# O(log(n))
def bin_search(list, x):
    """Return True iff x occurs in the sorted sequence, using a leftmost
    bisection probe."""
    pos = bisect_left(list, x)
    return pos != len(list) and list[pos] == x
def determine_optimal_k(X, algorithm, k_max=20):
    """Pick the cluster count k in [2, k_max] maximising the silhouette score
    of the given sklearn-style estimator on X.

    :param X: feature/distance matrix passed to fit().
    :param algorithm: estimator exposing set_params(n_clusters=...), fit() and
        labels_. NOTE: set_params mutates the passed estimator in place.
    :param k_max: largest cluster count to try, inclusive.
        BUGFIX: range(2, k_max) previously skipped k_max itself, although the
        CLI documents the optimum "between 2 and the given max_cluster_count".
    :return: the best k found; 2 when the loop cannot run (k_max < 2).
    """
    best_k = 2
    # Start below any possible score: silhouette values lie in [-1, 1], and the
    # previous 0.0 initialisation ignored runs where all scores were negative.
    score_max = float('-inf')
    for k in range(2, k_max + 1):
        alg = algorithm.set_params(n_clusters=k)
        alg.fit(X)
        score = silhouette_score(X, alg.labels_, metric='euclidean')
        if score > score_max:
            best_k = k
            score_max = score
    return best_k
# cluster_matrix(matrix, algorithm=cluster_extra.KMedoids(n_clusters=3, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, cluster.KMeans(n_clusters=2, random_state=0))
# list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.SpectralClustering(n_clusters=3, random_state=0))
#list_of_clusters, labels = cluster_matrix(matrix, algorithm=cluster.AgglomerativeClustering(n_clusters=3))
# print(determine_optimal_k(matrix, cluster.KMeans(random_state=0)))
# assign_event_to_cluster_exists([1,4], list_of_clusters)
\ No newline at end of file
# Event-to-cluster assignment modes accepted by the clustering code.
# NOTE(review): semantics ('all' vs 'existence') are defined by the consumer
# of this constant — confirm against the clustering module.
MODES = ['all', 'existence']
# Fallback values keyed by type name (as produced by type(x).__name__), plus
# the pseudo-type 'cf' used for the control-flow string.
# NOTE(review): the list/tuple defaults are shared mutable objects — confirm
# that consumers copy them instead of mutating in place.
DEFAULT_VALUES = {
    'float64': 0.0,
    'str': '',
    'list': [],
    'tuple': (),
    'cf': ''
}
# Name of the pseudo-attribute that holds an object's control flow.
DEFAULT_CF_ATTR_NAME = 'control_flow'
LS_CACHE_ACTIVE = 0 # useless since library function uses cache (it seems like that)
\ No newline at end of file
from tkinter.tix import InputOnly  # NOTE(review): appears unused in this module; tkinter.tix is deprecated (removed in Python 3.13) — candidate for deletion
from typing import Any
import numpy as np
import pandas as pd
from pm4py.objects.ocel.obj import OCEL
from Levenshtein import distance as lev
import pm4py
import constants as c
import scipy.spatial.distance as spd
def count_non_nans(data):
    """Count the entries of data that are not null per pandas.isnull
    (i.e. not NaN/None/NaT)."""
    return sum(1 for entry in data if not pd.isnull(entry))
def check_suffixes(suffixes: tuple[str, str]) -> bool:
    """Validate that suffixes is a pair of two distinct suffix strings.

    :param suffixes: the (left, right) suffix pair used for cross joins.
    :return: True when valid.
    :raises Exception: when not exactly two entries or when they are equal.

    BUGFIX: the signature previously read `suffixes=tuple[str, str]` — a `=`
    instead of `:` — which made the typing object the *default value* rather
    than an annotation (and calling without arguments would crash in len()).
    """
    if len(suffixes) != 2:
        raise Exception('length of suffixes need to be exact two.')
    if suffixes[0] == suffixes[1]:
        raise Exception('suffixes need to be distinct')
    return True
def df_create_cross_df(df: pd.DataFrame, suffixes=('_x', '_y')) -> pd.DataFrame:
    """
    Creates a full cross join of a dataframe and preserves the index used before.

    The result is indexed by a MultiIndex built from the original index
    level(s), once per join side, each suffixed with the given suffix.
    NOTE(review): this assumes the index levels are *named* — an unnamed index
    (names == [None]) would make `name + suffix` fail. TODO confirm callers.
    """
    check_suffixes(suffixes) # Checking if suffixes are valid.
    # setting the new list of index columns (given dataframe could already use a MultiIndex)
    new_index_names = list(map(lambda name: name+suffixes[0], df.index.names)) + list(map(lambda name: name+suffixes[1], df.index.names))
    # Resetting the index for preserving the index later.
    pre = df.reset_index(inplace=False)
    cross = pre.join(pre, how='cross', lsuffix=suffixes[0], rsuffix=suffixes[1]) # 2.5 sec
    cross.set_index(new_index_names, verify_integrity=False, inplace=True) # 11 sec -> 6.2 sec
    return cross
# returns a dataframe based on a cross join of the given dataframe.
# All attributes are replaced by their row wise distance which is calculated via the given functions in attribute_func_map
def df_pairwise_attr_distance(df: pd.DataFrame, attribute_func_map: dict[str, Any], suffixes=('_x', '_y')) -> pd.DataFrame:
    """Cross-join df with itself and replace each mapped attribute by the
    pairwise distance of its two join sides.

    :param df: input frame; only columns listed in attribute_func_map are used.
    :param attribute_func_map: attribute name -> binary distance function
        (signature as expected by scipy.spatial.distance.pdist).
    :param suffixes: pair of distinct suffixes identifying the two join sides.
    :raises Exception: via check_suffixes, or when an attribute has no function.
    """
    check_suffixes(suffixes)
    # getting relevant attributes
    selected_attributes = list(set(attribute_func_map.keys()).intersection(df.columns))
    # CARTESIAN PRODUCT of all data
    cross = df_create_cross_df(df[selected_attributes], suffixes=suffixes) # ~8-9 seconds
    for attr in selected_attributes:
        # creating attribute names for each 'side'
        new_attr_names = {0: attr+suffixes[0], 1: attr+suffixes[1]}
        # distances are computed on the unique values only and joined back later,
        # which avoids recomputing the metric for repeated values
        unique_vals = df[attr].unique()
        # setting function for distance calculation per attribute
        func = attribute_func_map[attr]
        # BUGFIX: the original guard was `type(func) == None`, which is never
        # true (type() never returns None), so a missing function slipped
        # through to pdist and failed there with a confusing error.
        if func is None:
            raise Exception('No function defined for attribute "' + attr + '".')
        # reshaping and calculating the distances (its only done one ways by pdist, hence squareform is necessary)
        reshaped_vals = unique_vals.reshape(-1,1)
        # CALCULATING (less than 1s)
        d_matrix = spd.pdist(reshaped_vals, func)
        d_matrix = spd.squareform(d_matrix)
        # creating dataframe (matrix like) of the result setting index and columns accordingly
        res = pd.DataFrame(d_matrix)
        res.index = unique_vals
        res.columns = unique_vals
        # RESHAPING (less than 10 ms)
        res = res.rename_axis(index=new_attr_names[0], columns=new_attr_names[1]).melt(ignore_index=False) # retransfrom from matrix to list
        res.reset_index(inplace=True)
        # SETTING MULTIINDEX (less than 100 ms)
        res.set_index(list(new_attr_names.values()), verify_integrity=False, inplace=True)
        res.rename({'value': attr}, inplace=True, axis=1) # 'value' is the automatic name
        # JOINING RESULTS to cross table (~11-12s)
        cross = cross.join(res, on=list(new_attr_names.values()), how='left') # some kind of mapping may be faster...
        # DELETING COLUMNS (less than 1s)
        del cross[new_attr_names[0]]
        del cross[new_attr_names[1]]
        # FILLING NaNs (less than 1s)
        cross[attr] = cross[attr].fillna(0.0)
    return cross
# Determines column types by first elements that are not NaN
# becomes: float64, string, list, unknown
def df_determine_data_types(df: pd.DataFrame) -> dict[str, str]:
types = {}
for column in (set(df.columns) - set(types.keys())):
first_non_nan = df[column].loc[~df[column].isnull()].iloc[0]
types[column] = type(first_non_nan).__name__
types['control_flow'] = 'cf' # dirty fix!
# fix that also the dtypes thing gives a string
return types