Commit af8f7801 authored by Lennart Holzenkamp's avatar Lennart Holzenkamp
Browse files

merging and externalising stanislavs code to a function.

parents fc0d6360 015aad52
......@@ -190,7 +190,7 @@ def map_activities_to_letter(unique_activities):
cur_letter = chr(ord(cur_letter) + 1)
return activities_dict
def ocel_get_object_distances(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
def ocel_get_object_distances_lh(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
# getting all object-types
data_object_types = pm4py.ocel_get_object_types(ocel)
if not object_type in data_object_types: raise Exception('selected object-type-name "' + object_type + '" not present in the data.')
......@@ -257,4 +257,202 @@ def ocel_filter_by_events(ocel: OCEL, events: np.array) -> OCEL:
# assembling ocel
res = OCEL(res_events, res_objects, res_relations, ocel.globals, ocel.parameters)
return res
\ No newline at end of file
return res
#######################################################################
# Calculating the distance matrix based on stanislav's implementation #
# #
# Save unique activities to a list
def get_unique_activities(activities):
    """Return the distinct activities in order of first appearance.

    Args:
        activities: Iterable of activity labels. The labels must be hashable;
            they are used as dictionary keys by ``map_activities_to_letter``.

    Returns:
        A new list holding each label once, ordered by first occurrence.
    """
    # dict preserves insertion order, so this de-duplicates in O(n) instead of
    # the O(n^2) cost of repeated list-membership tests.
    return list(dict.fromkeys(activities))
# Create hash-table of activities to represent
# each one as a UNICODE-character
def map_activities_to_letter(unique_activities):
    """Assign one character to every activity, counting upwards from 'a'.

    The n-th activity (0-based) is mapped to ``chr(ord('a') + n)``, so each
    activity can be represented as a single symbol in a string.

    Args:
        unique_activities: Iterable of hashable activity labels.

    Returns:
        Dictionary mapping each activity to its character.
    """
    first_code = ord('a')
    return {
        activity: chr(first_code + offset)
        for offset, activity in enumerate(unique_activities)
    }
# Create data frame with either the desired object type or the relations
def get_df(df_type, o_type, df_input):
    """Build a data frame of the objects or the relations of one object type.

    Args:
        df_type: ``"objects"`` to read ``df_input.objects`` or ``"relations"``
            to read ``df_input.relations``.
        o_type: Value of the ``'ocel:type'`` column to keep.
        df_input: Object exposing ``objects`` and ``relations`` tables.

    Returns:
        For ``"objects"``: the matching rows without the ``'ocel:type'`` column.
        For ``"relations"``: the matching rows reduced to the columns
        ``'ocel:oid'``, ``'ocel:activity'`` and ``'ocel:timestamp'``, sorted
        ascending by object id and then by timestamp.

    Raises:
        ValueError: If ``df_type`` is neither ``"objects"`` nor ``"relations"``.
    """
    if df_type == "objects":
        df = pd.DataFrame(df_input.objects)
        df = df[df['ocel:type'] == o_type]
        # Non-inplace drop: mutating a filtered frame in place triggers
        # pandas' chained-assignment warning.
        return df.drop(columns=['ocel:type'])
    if df_type == "relations":
        df = pd.DataFrame(df_input.relations)
        df = df[df['ocel:type'] == o_type]
        df = df.filter(items=['ocel:oid', 'ocel:activity', 'ocel:timestamp'])
        return df.sort_values(by=['ocel:oid', 'ocel:timestamp'], ascending=[True, True])
    # Previously an unknown type fell through to `return df` and failed with
    # an opaque UnboundLocalError.
    raise ValueError('unknown df_type "' + str(df_type) + '"; expected "objects" or "relations".')
# Store control flow of every object to a list
def find_cflow(relations_df, activity_dict):
    """Encode the control flow of every object as a string of symbols.

    Rows are read in their given order; each maximal run of consecutive rows
    sharing the same ``'ocel:oid'`` forms one control flow, written as the
    concatenation of the symbols of the activities in that run.

    Args:
        relations_df: Data frame with the columns ``'ocel:oid'`` and
            ``'ocel:activity'``. Rows of one object must be adjacent, otherwise
            the object yields several entries.
        activity_dict: Mapping from activity label to its symbol.

    Returns:
        List with one control-flow string per run, in order of appearance.
        An empty frame yields an empty list.

    Raises:
        KeyError: If an activity is missing from ``activity_dict``.
    """
    from itertools import groupby

    # No rows means no objects; previously this returned [''] and thereby
    # invented a control flow for a non-existent object.
    if relations_df.empty:
        return []

    pairs = zip(relations_df['ocel:oid'], relations_df['ocel:activity'])
    # groupby only merges adjacent equal keys, which is exactly the
    # "consecutive rows with the same oid" grouping, and it needs no sentinel
    # value that could collide with a real object id.
    return [
        ''.join(activity_dict[activity] for _, activity in run)
        for _, run in groupby(pairs, key=lambda pair: pair[0])
    ]
# Drop columns with NaN-only values
def drop_nan_columns(df):
    """Remove every column that contains nothing but missing values.

    The frame is modified in place and also returned.

    Args:
        df: Data frame to clean up.

    Returns:
        The same ``df`` object, without its all-NaN columns.
    """
    row_count = len(df)
    # Collect first, then drop once; a column qualifies when its number of
    # missing entries equals the number of rows.
    all_nan_columns = [
        name for name in df.columns
        if df[name].isna().sum() == row_count
    ]
    df.drop(all_nan_columns, axis=1, inplace=True)
    return df
# Create list of NaN replacements for each column
def create_dict_nan(df):
    """Choose a NaN replacement value for each column of ``df``.

    Columns whose dtype compares equal to ``"object"`` get the empty string;
    every other column gets ``0.0``.

    Args:
        df: Data frame whose column dtypes are inspected.

    Returns:
        Dictionary mapping column label to its replacement value.
    """
    return {
        column: ("" if dtype == "object" else 0.0)
        for column, dtype in df.dtypes.items()
    }
# Calculates the distance matrix
def distance_matrix(df, weights):
    """Combine the per-column distance matrices into one weighted average.

    For every column the weighted distance matrix from ``calc_distance`` is
    added up; the sum is then divided by the sum of all weights.

    Args:
        df: Data frame with one row per object and one column per attribute.
        weights: Sequence of weights; ``weights[i]`` belongs to the i-th column.

    Returns:
        Square array with one row and one column per row of ``df``.
    """
    row_count = int(df.shape[0])
    total = np.zeros([row_count, row_count], dtype=float)
    for position, name in enumerate(df.columns):
        total = total + calc_distance(df[name], position, row_count, weights)
    return total / sum(weights)
# Helper function for the distance matrix calculation
def calc_distance(df, weight_index, n, weights):
    """Compute the weighted pairwise distance matrix of a single column.

    The kind of distance is chosen from the first value of the column:

    * strings in a column not named ``"cflow"``: 0 for equal values, 1 for
      different values;
    * numbers: absolute difference;
    * anything else (in particular the ``"cflow"`` strings): ``lev`` of the
      two values (presumably a Levenshtein distance imported elsewhere in
      this file - confirm).

    The matrix is scaled to the range [0, 1] by its maximum and then
    multiplied by ``weights[weight_index]``.

    Args:
        df: A single column (pandas Series) holding one value per object.
        weight_index: Position of this column's weight in ``weights``.
        n: Number of objects, i.e. the side length of the result.
        weights: Sequence of per-column weights.

    Returns:
        Symmetric ``n`` x ``n`` float array with a zero diagonal.
    """
    # An empty column has no pairs to compare; indexing values[0] would fail.
    if n == 0:
        return np.zeros([0, 0], dtype=float)

    values = df.values[:n]
    first = values[0]

    if isinstance(first, str) and df.name != "cflow":
        labels = np.asarray(values, dtype=object)
        # Distance is 1 for *different* labels. The former `==` comparison was
        # inverted: it put identical values at distance 1 and distinct values
        # at distance 0.
        matrix = (labels[:, None] != labels[None, :]).astype(float)
    elif isinstance(first, (int, float, np.integer, np.floating)):
        # Integers are numeric too; previously they were handed to lev().
        numbers = np.asarray(values, dtype=float)
        matrix = np.abs(numbers[:, None] - numbers[None, :])
    else:
        matrix = np.zeros([n, n], dtype=float)
        for j in range(n):
            for k in range(j + 1, n):
                matrix[j, k] = matrix[k, j] = lev(values[j], values[k])

    # Normalise only when there is a non-zero distance; if all values are
    # identical the maximum is 0 and dividing would turn the matrix into NaN.
    peak = matrix.max()
    if peak > 0:
        matrix = matrix / peak
    return matrix * weights[weight_index]
# #
# End of stanislav's implementation #
#################################################################
def ocel_get_object_distances_sy(ocel, object_type, weights_per_attribute) -> dict['index': list, 'distances': np.matrix]:
    """Compute pairwise distances between all objects of one object type.

    Every object is described by its attributes plus a ``cflow`` string that
    encodes its sequence of activities; the per-column distances are combined
    by ``distance_matrix``.

    Args:
        ocel: Object-centric event log exposing ``objects`` and ``relations``
            and accepted by ``pm4py.ocel_object_type_activities``.
        object_type: Name of the object type to analyse.
        weights_per_attribute: Mapping from column name to weight; columns not
            listed get the weight ``1.0``.

    Returns:
        Dictionary with ``'distances'`` (the distance matrix) and ``'index'``
        (the object ids, in the row/column order of the matrix).
    """
    # Unique activities of the object type, in sorted order
    unique_activities = get_unique_activities(sorted(pm4py.ocel_object_type_activities(ocel)[object_type], reverse=False))
    # Represent each activity as a single character
    activity_dict = map_activities_to_letter(unique_activities)
    # Objects of the desired type and the relations belonging to them
    oid_filtered = get_df("objects", object_type, ocel)
    relations_df = get_df("relations", object_type, ocel)
    # One control-flow string per object that occurs in the relations
    cflow_list = find_cflow(relations_df, activity_dict)

    oid_filtered = oid_filtered.sort_values(by=['ocel:oid'], ascending=[True])
    # Attach control flows by object id rather than by position: the relations
    # are sorted by oid, so the i-th control flow belongs to the i-th distinct
    # oid. Objects without any relation get an empty control flow instead of
    # causing a length mismatch.
    cflow_by_oid = dict(zip(relations_df['ocel:oid'].drop_duplicates(), cflow_list))
    oid_filtered = oid_filtered.assign(
        cflow=[cflow_by_oid.get(oid, '') for oid in oid_filtered['ocel:oid']]
    )

    # NaN replacement per column ("" or 0.0)
    na_values = create_dict_nan(oid_filtered)
    # Drop columns with NaN-only values, then fill the remaining gaps
    oid_filtered = drop_nan_columns(oid_filtered)
    oid_filtered = oid_filtered.fillna(value=na_values)

    # Keep the oids for the result; they are not part of the distance itself
    list_oid = oid_filtered['ocel:oid'].values
    oid_filtered = oid_filtered.drop(columns=['ocel:oid'])

    weights = [weights_per_attribute.get(column, 1.0) for column in oid_filtered.columns]
    res = distance_matrix(oid_filtered, weights)
    return {
        'distances': res,
        'index': list_oid
    }
......@@ -27,10 +27,12 @@ p_ocel_file = './data/ocel.jsonocel'
p_mode = 'existence'
# object-type-name
p_object_type = 'products'
p_object_type = 'customers'
# example data for attribute weighting in distance measurements
p_attr_weights = {}
p_attr_weights = {
'bankaccount': 1
}
# clustering-mode
p_clustering_mode = 'kmeans'
......@@ -73,11 +75,10 @@ algorithms = {
}
assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + p_clustering_mode + '" is not availabel. Use on of: ' + ', '.join(algorithms.keys()) + '.'
# reading ocel data
ocel = pm4py.read_ocel(p_ocel_file)
# calculating distances of objects based on control-flow and all non-NaN attributes of objects
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
# res = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment