Commit b9deb382 authored by Stanislav Yuliyanov
Browse files

Merge branch 'revert-f2cd2bdf' into 'main'

Revert "added distance matrix implementation"

See merge request !2
parents f2cd2bdf c6fe8ac7
......@@ -278,155 +278,4 @@ def ocel_get_object_distances(ocel, object_type, weights_per_attribute, show_log
res[j, i] = res[i, j]
count += 1
return res
#######################################################################
# Calculating the distance matrix based on stanislav's implementation #
# #
# Save unique activities to a list
def get_unique_activities(activities):
    """Return the distinct activities in first-seen order.

    Args:
        activities: Iterable of activity labels. The labels must be
            hashable (they are later used as dictionary keys by
            ``map_activities_to_letter``).

    Returns:
        A new list holding each label once, ordered by first occurrence.
    """
    # dict keys keep insertion order, so this de-duplicates in a single pass
    # instead of re-scanning the result list for every element.
    return list(dict.fromkeys(activities))
# Create hash-table of activities to represent
# each one as a UNICODE-character
def map_activities_to_letter(unique_activities):
    """Assign one character to every activity, counting up from 'a'.

    The n-th activity (0-based) receives the character whose code point is
    ``ord('a') + n``, so more than 26 activities continue past 'z' into the
    following code points.

    Args:
        unique_activities: Iterable of hashable activity labels.

    Returns:
        A dict mapping each activity to its single-character code.
    """
    first_code = ord('a')
    return {
        label: chr(first_code + position)
        for position, label in enumerate(unique_activities)
    }
# Create data frame with either the desired object type or the relations
def get_df(df_type, o_type, df_input):
    """Build a data frame of the objects or the relations of one object type.

    Args:
        df_type: ``"objects"`` to read ``df_input.objects`` or
            ``"relations"`` to read ``df_input.relations``.
        o_type: Value the ``'ocel:type'`` column must equal for a row to be
            kept.
        df_input: Object exposing ``objects`` and ``relations`` attributes
            that ``pd.DataFrame`` accepts (presumably a pm4py OCEL - confirm
            against the caller).

    Returns:
        For ``"objects"``: the matching rows without the ``'ocel:type'``
        column. For ``"relations"``: the matching rows reduced to
        ``'ocel:oid'``, ``'ocel:activity'`` and ``'ocel:timestamp'``, sorted
        ascending by object id and then timestamp.

    Raises:
        ValueError: If ``df_type`` is neither ``"objects"`` nor
            ``"relations"``.
    """
    if df_type == "objects":
        df = pd.DataFrame(df_input.objects)
        df = df[df['ocel:type'] == o_type]
        # Return a new frame rather than dropping in place: the filtered
        # frame may be a view-like slice of the source data.
        return df.drop(columns=['ocel:type'])
    if df_type == "relations":
        df = pd.DataFrame(df_input.relations)
        df = df[df['ocel:type'] == o_type]
        df = df.filter(items=['ocel:oid', 'ocel:activity', 'ocel:timestamp'])
        return df.sort_values(by=['ocel:oid', 'ocel:timestamp'], ascending=[True, True])
    # Previously an unknown df_type fell through to an unbound local variable.
    raise ValueError(
        f"df_type must be 'objects' or 'relations', got {df_type!r}"
    )
# Store control flow of every object to a list
def find_cflow(relations_df, activity_dict):
    """Encode the control flow of every object as a string.

    Rows are read in their existing order; each run of consecutive rows that
    share the same ``'ocel:oid'`` yields one string made of the mapped
    characters of that run's ``'ocel:activity'`` values. The frame is
    therefore expected to be sorted by object id (and timestamp) already.

    Args:
        relations_df: Data frame with ``'ocel:oid'`` and ``'ocel:activity'``
            columns.
        activity_dict: Mapping from activity label to its character code.

    Returns:
        A list with one control-flow string per run of equal object ids, in
        row order. An empty frame yields an empty list.

    Raises:
        KeyError: If an activity is missing from ``activity_dict``.
    """
    from itertools import groupby

    # Iterating the two columns directly avoids the per-row Series objects
    # that iterrows() builds.
    pairs = zip(relations_df['ocel:oid'], relations_df['ocel:activity'])
    # groupby() splits on every change of object id, so no sentinel value is
    # needed and an object whose id is '' is grouped like any other.
    return [
        ''.join(activity_dict[activity] for _, activity in run)
        for _, run in groupby(pairs, key=lambda pair: pair[0])
    ]
# Drop columns with NaN-only values
def drop_nan_columns(df):
    """Remove, in place, every column that contains only missing values.

    Args:
        df: Data frame to prune; it is modified in place.

    Returns:
        The same data frame object that was passed in.
    """
    all_missing = [name for name in df.columns if df[name].isna().all()]
    df.drop(columns=all_missing, inplace=True)
    return df
# Create list of NaN replacements for each column
def create_dict_nan(df):
    """Choose a fill value for missing entries in each column.

    Columns whose dtype compares equal to ``"object"`` get the empty string;
    every other column gets ``0.0``.

    Args:
        df: Data frame whose column dtypes are inspected.

    Returns:
        A dict mapping each column label to its replacement value, suitable
        for ``DataFrame.fillna(value=...)``.
    """
    return {
        column: "" if dtype == "object" else 0.0
        for column, dtype in df.dtypes.items()
    }
# Calculates the distance matrix
def distance_matrix(df, weights):
    """Average the per-column distance matrices of a data frame.

    Each column is handed to ``calc_distance`` on its own; the resulting
    matrices are summed and divided by the number of columns.

    Args:
        df: Data frame with one row per object and one column per attribute.
        weights: Sequence of weights forwarded to ``calc_distance``.

    Returns:
        The averaged row-by-row distance matrix.
    """
    row_count = int(df.shape[0])
    column_count = df.shape[1]
    total = np.array([[0] * row_count for _ in range(row_count)])
    for label in df.columns:
        # NOTE(review): the weight index is always 0, so every column is
        # scaled by weights[0] - confirm whether per-column weights were
        # intended.
        total = total + calc_distance(df[label], 0, row_count, weights)
    return total / column_count
# Helper function for the distance matrix calculation
def _scale_by_max(matrix):
    """Divide a matrix by its largest entry, leaving an all-zero matrix as is."""
    peak = matrix.max()
    # All values identical -> every distance is 0; dividing would give NaN.
    return matrix / peak if peak > 0 else matrix


def calc_distance(df, weight_index, n, weights):
    """Compute the weighted pairwise distance matrix of a single column.

    The branch is chosen from the type of the first value:

    * ``str`` (unless the column is named ``"cflow"``): 0 when two values
      are equal, 1 when they differ.
    * ``float``: absolute difference, scaled so the largest distance is 1.
    * anything else (including the ``"cflow"`` column): ``lev(a, b)``,
      scaled so the largest distance is 1. ``lev`` is defined outside this
      function (presumably a Levenshtein edit distance - confirm).

    Args:
        df: A pandas Series (one column); its ``name`` and ``values`` are
            read.
        weight_index: Index into ``weights`` of the factor to apply.
        n: Number of leading values to compare; the result is ``n`` x ``n``.
        weights: Sequence of weights.

    Returns:
        A symmetric float ``n`` x ``n`` array with a zero diagonal, multiplied
        by ``weights[weight_index]``.
    """
    # A float matrix is required: an integer matrix silently truncates the
    # fractional part of numeric differences.
    matrix = np.zeros((n, n), dtype=float)
    if n == 0:
        return matrix * weights[weight_index]

    values = df.values
    first = values[0]
    if isinstance(first, str) and df.name != "cflow":
        for j in range(n):
            current = values[j]
            for k in range(j + 1, n):
                # Distance, not similarity: differing values are 1 apart,
                # equal values 0 apart (consistent with the zero diagonal).
                matrix[j, k] = matrix[k, j] = float(current != values[k])
    elif isinstance(first, float):
        numeric = np.asarray(values[:n], dtype=float)
        # Broadcasting builds all pairwise differences at once.
        matrix = _scale_by_max(np.abs(numeric[:, None] - numeric[None, :]))
    else:
        for j in range(n):
            current = values[j]
            # Progress output for the slow edit-distance branch.
            if j % 100 == 0:
                print(j)
            for k in range(j + 1, n):
                matrix[j, k] = matrix[k, j] = lev(current, values[k])
        matrix = _scale_by_max(matrix)
    return matrix * weights[weight_index]
# #
# End of stanislav's implementation #
#################################################################
\ No newline at end of file
return res
\ No newline at end of file
......@@ -57,61 +57,10 @@ f.print_w_ts('Reading ocel data...')
# Load the object-centric event log from the configured file.
ocel = pm4py.read_ocel(p_ocel_file)
f.print_w_ts('ocel data read.')
# res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
##### CALCULATING DISTANCE MATRIX #####
# Object type whose objects are compared.
# NOTE(review): hard-coded here, while the call further down uses
# p_object_type - confirm which one is intended.
obj_type = "items"
# List of all weights
weights = [1]
# Save unique activities to a list (sorted ascending before de-duplication)
unique_activities = f.get_unique_activities(sorted(pm4py.ocel_object_type_activities(ocel)[obj_type], reverse=False))
# Create hash-table of activities to represent each one as a UNICODE-character
activity_dict = f.map_activities_to_letter(unique_activities)
# Create data frame with desired object type
oid_filtered = f.get_df("objects", obj_type, ocel)
# Create data frame with the relations of that object type
relations_df = f.get_df("relations", obj_type, ocel)
# Store control flow of every object to a list
cflow_list = f.find_cflow(relations_df, activity_dict)
# Sort the data frame by object id and append the corresponding control flow.
# NOTE(review): assign() needs cflow_list to have exactly one entry per row,
# i.e. every object must appear in the relations - confirm.
oid_filtered.sort_values(by=['ocel:oid'], ascending=[True], inplace=True)
oid_filtered = oid_filtered.assign(cflow=cflow_list)
# Create the NaN replacement value for each column
na_values = f.create_dict_nan(oid_filtered)
# Drop columns with NaN-only values
oid_filtered = f.drop_nan_columns(oid_filtered)
# Replace the remaining NaN with an empty string or 0
oid_filtered = oid_filtered.fillna(value=na_values)
# Create list of oids
list_oid = oid_filtered.filter(items=['ocel:oid']).values
# Drop oid because it is not needed for the matrix calculation
oid_filtered.drop(['ocel:oid'], axis = 1, inplace = True)
# Calculate distance matrix
res = f.distance_matrix(oid_filtered, weights)
#############################################
# res - contains distance matrix
# list_oid - contains list of oid
# NOTE(review): this assignment replaces the `res` computed above; the lines
# shown here come from a diff view that mixes removed and added lines.
res = f.ocel_get_object_distances(ocel, p_object_type, p_attr_weights)
f.print_w_ts('distance matrix calculated:')
f.print_w_ts(res)
f.print_w_ts('duration: ' + str(datetime.datetime.now() - start_ts))
print('oid list:')
print(list_oid)
quit()
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment