Commit 53d4e8a7 authored by Lennart Holzenkamp
Browse files

Fix Stanislav's and Lennart's implementations so that they always give the same result.

parent 0d4c8467
......@@ -195,8 +195,15 @@ def ocel_get_object_distances_lh(ocel, object_type, weights_per_attribute) -> di
# NORMALIZING / WEIGHTING / AVERAGING
df_distance_matrix = df_normalize_columns(df_distance_matrix, list(attr_func_map.keys()))
df_distance_matrix = df_weight_columns(df_distance_matrix, weights_per_attribute)
df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1)
# df_distance_matrix = df_normalize_columns(df_distance_matrix, ['distance_avg']) -> leads to wrong results compared to stanislav
divisor = 0.0
for attribute in list(attr_func_map.keys()):
divisor += weights_per_attribute.get(attribute, 1.0)
if divisor == 0.0: divisor = 1.0
df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].sum(axis=1) / divisor
# df_distance_matrix['distance_avg'] = df_distance_matrix[list(attr_func_map.keys())].mean(axis=1) -> mean over 3 values is not the same as weighting and dividing by the weights sum
# df_distance_matrix = df_normalize_columns(df_distance_matrix, ['distance_avg']) -> leads to wrong results compared to stanislav -> but maybe useful normalization
df_distance_matrix = df_distance_matrix.drop(list(attr_func_map.keys()), axis=1) # single distances not relevant anymore
# Creating the matrix-index->ocel:oid map
......@@ -335,47 +342,32 @@ def distance_matrix(df, weights):
single_column_df = df[columns[i]]
matrix = matrix + calc_distance(single_column_df, i, n, weights)
print('weight-sum:' + str(sum(weights)))
return matrix / sum(weights)
# Helper function for the distance matrix calculation
def calc_distance(df, weight_index, n, weights):
    """Return the weighted, max-normalized pairwise distance matrix of one column.

    The distance used between two entries depends on the type of the first
    value in ``df.values``:

    * ``str`` (unless the column is named ``"cflow"``): 0 if equal, 1 otherwise
    * ``float``: absolute difference
    * anything else: ``lev`` (defined elsewhere in this file; presumably a
      Levenshtein distance -- confirm against the import)

    Args:
        df: A single column (something exposing ``.values`` and ``.name``,
            e.g. a pandas Series) holding one attribute value per object.
        weight_index: Position of this column's weight inside ``weights``.
        n: Number of entries of the column to compare.
        weights: Sequence of per-column weights.

    Returns:
        An ``n`` x ``n`` symmetric float array with a zero diagonal. Distances
        are divided by the largest distance of the column (when that is not
        zero) and then multiplied by ``weights[weight_index]``.
    """
    matrix = np.zeros([n, n], dtype=float)

    # An empty column has no first element to inspect and no maximum to
    # normalize by, so the (empty) zero matrix is already the answer.
    if n > 0:
        values = df.values
        first_value = values[0]

        if isinstance(first_value, str) and df.name != "cflow":
            distance_func = lambda x, y: x != y
        elif isinstance(first_value, float):
            distance_func = lambda x, y: abs(x - y)
        else:
            distance_func = lev

        # Fill the strict upper triangle only; the diagonal stays 0.
        for j in range(n):
            current = values[j]
            for k in range(j + 1, n):
                matrix[j, k] = distance_func(current, values[k])

        # Mirror the upper triangle to obtain the full symmetric matrix.
        matrix = matrix + matrix.T

        # Skip normalization when all entries are identical: dividing by a
        # zero maximum would turn the whole matrix into NaN.
        largest = matrix.max()
        if largest != 0:
            matrix = matrix / largest

    print('current weight: ' + str(weights[weight_index]))
    return matrix * weights[weight_index]
......@@ -416,6 +408,7 @@ def ocel_get_object_distances_sy(ocel, object_type, weights_per_attribute) -> di
# Drop oid because it is not needed for the matrix calculation
oid_filtered.drop(['ocel:oid'], axis = 1, inplace = True)
# weight-dictionary to array
columns = list(oid_filtered.columns)
weights = []
for column in columns:
......
......@@ -14,8 +14,9 @@ from sklearn import cluster as skcluster
# PARAMETERS
p_ocel_file = './data/ocel.jsonocel'
p_mode = 'existence' # all | existence
p_object_type = 'customers' # object-type-name, options depend on the data
p_object_type = 'products' # object-type-name, options depend on the data
p_attr_weights = {
'producer': 2
} # attributes that are not given in the data are not used
p_clustering_mode = 'kmeans' # optional, default: kmeans
p_cluster_count = np.NaN # cluster-count (optional, default: np.NaN which leads to automatic k determination)
......@@ -52,8 +53,10 @@ assert p_clustering_mode in algorithms.keys(), 'The given clustering mode "' + p
ocel = pm4py.read_ocel(p_ocel_file)
# res_lh = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
res = f.ocel_get_object_distances_sy(ocel, p_object_type, p_attr_weights)
# res_lh = f.ocel_get_object_distances_lh(ocel, p_object_type, p_attr_weights)
# print(res['distances']-res_lh['distances'])
duration = datetime.datetime.now() - start_ts
print('-------------------------------------------------------')
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment