
Upload Sprint 1

Closed · Yashvardhan Rautela requested to merge yashvardhanrautela-main-patch-30769 into main
1 file changed: +583 −0
%% Cell type:markdown id:eb7c3ecc tags:
<h2>Connecting with PyCelonis and Data Integration</h2>
%% Cell type:code id:c14ad92b tags:
``` python
import pycelonis
import pm4py
from pycelonis.pql import PQL, PQLColumn, PQLFilter, OrderByColumn
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
```
%% Cell type:code id:756a1379 tags:
``` python
# connect to the Celonis EMS with a user API key
celonis = pycelonis.get_celonis(
    base_url='academic-celonis-x509kq.eu-2.celonis.cloud',
    api_token='NTg1YjI0YzUtNjI5Yi00NzRmLWE1ZmQtMWQxYzM3OGRkYWQ0OldNMk5ERkdCUUZER1FJeXdYaTY3dFprOCs5dExOd3M0czRtYXk0dmJyTFhJ',
    key_type='USER_KEY')
```
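%% Cell type:markdown id:f1e2d3c4 tags:
Hardcoding the API token in the notebook exposes the credential to anyone with repository access. Below is a minimal sketch of the same connection that reads the token from an environment variable instead; the variable name CELONIS_API_TOKEN is our own choice, not a Celonis convention.
%% Cell type:code id:a9b8c7d6 tags:
``` python
import os

# a minimal sketch, assuming the token was exported beforehand, e.g.
#   export CELONIS_API_TOKEN=...   (hypothetical variable name)
celonis = pycelonis.get_celonis(
    base_url='academic-celonis-x509kq.eu-2.celonis.cloud',
    api_token=os.environ['CELONIS_API_TOKEN'],
    key_type='USER_KEY')
```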
%% Cell type:code id:94d02cb7 tags:
``` python
# load the dataset
original_data = pm4py.read_xes('receipt.xes')
```
%% Output
%% Cell type:code id:8c7482d3 tags:
``` python
# with our academic license we can store at most 100,000 entries
if len(original_data) < 100000:
    data = original_data.copy()
else:
    # more than 100,000 rows: keep only the first 100,000
    data = original_data[:100000].copy()
display(data)
```
%% Output
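%% Cell type:markdown id:b2c3d4e5 tags:
A plain row slice can cut the last case in half, leaving a partial trace in the upload. A minimal sketch of a case-preserving alternative, assuming read_xes returned a pandas dataframe (as the slicing above already assumes):
%% Cell type:code id:c4d5e6f7 tags:
``` python
# keep only whole cases: drop the case that straddles the 100,000-row boundary
# (note: this may also drop that case when it happens to end exactly at the boundary)
if len(original_data) > 100000:
    boundary_case = original_data.iloc[99999]['case:concept:name']
    data = original_data[:100000]
    data = data[data['case:concept:name'] != boundary_case].copy()
else:
    data = original_data.copy()
```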
%% Cell type:code id:764ea3b5 tags:
``` python
# create the data pool, or fetch it if it already exists
try:
    data_pool = celonis.data_integration.get_data_pools().find("data_pool")
    print("The data pool already exists and has been fetched successfully.")
except Exception:
    data_pool = celonis.data_integration.create_data_pool("data_pool")
    print("Data pool has been successfully created.")
```
%% Output
The data pool already exists and has been fetched successfully.
%% Cell type:code id:5f47bbd4 tags:
``` python
# create the data model, or fetch it if it already exists
try:
    data_model = data_pool.get_data_models().find("data_model")
    print("The data model already exists and has been fetched successfully.")
except Exception:
    data_model = data_pool.create_data_model("data_model")
    print("Data model has been successfully created.")
```
%% Output
The data model already exists and has been fetched successfully.
%% Cell type:code id:3ad94e97 tags:
``` python
# create the table in the data pool, or do nothing if it already exists
try:
    data_pool.create_table(df=data, table_name="data_table", drop_if_exists=False)
    print("Successfully created table in the data pool.")
except Exception:
    print("The table already exists in the data pool.")
```
%% Output
The table already exists in the data pool.
%% Cell type:code id:6005fcc7 tags:
``` python
# add the table from the pool to the model, or do nothing if it is already there
try:
    tables = data_model.add_table(name="data_table", alias="data_table")
    print("Successfully added table from pool to model.")
except Exception:
    print("The table already exists in the data model.")
```
%% Output
The table already exists in the data model.
%% Cell type:code id:996b1eb3 tags:
``` python
data_model.reload()
```
%% Output
%% Cell type:code id:5622a2c2 tags:
``` python
# process configuration: tell Celonis which columns identify case, activity and timestamp
tables = data_model.get_tables()
activities = tables.find("data_table")
process_configuration = data_model.create_process_configuration(
    activity_table_id=activities.id,
    case_id_column="case:concept:name",
    activity_column="concept:name",
    timestamp_column="time:timestamp",
)
```
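%% Cell type:markdown id:d6e7f8a9 tags:
After changing the process configuration, the data model generally has to be reloaded again before queries reflect it; a minimal sketch reusing the same reload() call from above:
%% Cell type:code id:e8f9a0b1 tags:
``` python
# reload so the new process configuration is picked up by the PQL queries below
data_model.reload()
```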
%% Cell type:markdown id:8ae95f12 tags:
<h2>Identification of Batches</h2>
%% Cell type:code id:aac0fafd tags:
``` python
# fetch the data pool and data model created in the Data Integration section above
data_integration = celonis.data_integration
data_pool = data_integration.get_data_pools().find("data_pool")
data_model = data_pool.get_data_models().find("data_model")
# create a PQL object and declare the columns to extract
pql = PQL()
pql.columns.append(PQLColumn(name="Case ID", query='"data_table"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Time", query='"data_table"."time:timestamp"'))
# export the columns to a pandas dataframe
dataframe = data_model.export_data_frame(pql)
```
%% Output
%% Cell type:code id:38d9a163 tags:
``` python
display(dataframe)
```
%% Output
%% Cell type:code id:9d321c08 tags:
``` python
variables = {}
# detect batches for two different merge distances (in minutes)
for merge_dist in [15, 240]:
    variable_name = f"discover_batches_{merge_dist}"
    variables[variable_name] = pm4py.discover_batches(
        dataframe, case_id_key='Case ID', resource_key='Resource',
        activity_key='Activity', timestamp_key='Time',
        merge_distance=merge_dist * 60)  # merge distance is given in seconds
```
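%% Cell type:markdown id:fa0b1c2d tags:
pm4py.discover_batches returns one entry per (activity, resource) pair: the pair itself, the number of batches, and a dictionary mapping each batch type to its intervals. A minimal sketch that peeks at the first entry, assuming at least one batch was found:
%% Cell type:code id:ac3d4e5f tags:
``` python
# each entry: ((activity, resource), number_of_batches, {batch_type: intervals})
sample = variables["discover_batches_15"]
if sample:
    (activity, resource), n_batches, batch_types = sample[0]
    print(activity, "|", resource, "|", n_batches, "batches:", list(batch_types))
```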
%% Cell type:code id:88c0d7e3 tags:
``` python
for variable_name, value in variables.items():
    print("When merge distance (mins) =", variable_name[17:])
    # collect one record per (activity, resource) pair
    records = []
    for activity_resource in value:
        record = {}
        record["Activity"] = activity_resource[0][0]
        record["Resource"] = activity_resource[0][1]
        record["No. of Batches"] = activity_resource[1]
        for batch_type in activity_resource[2]:
            record[batch_type] = len(activity_resource[2][batch_type])
        records.append(record)
    # DataFrame.append was removed in pandas 2.0, so build the frame in one go
    batches_df = pd.DataFrame(records, columns=["Activity", "Resource", "No. of Batches",
                                                "Concurrent batching", "Simultaneous",
                                                "Batching at start", "Batching at end",
                                                "Sequential batching"]).fillna(0)
    display(batches_df)
```
%% Output
When merge distance (mins) = 15
When merge distance (mins) = 240
%% Cell type:markdown id:3e690f31 tags:
<h2>Activities repeated by different resources</h2>
%% Cell type:code id:2f3394f8 tags:
``` python
# build the PQL query and export the result to a dataframe
pql = PQL()
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Case ID", query='"data_table"."case:concept:name"'))
dataframe = data_model.export_data_frame(pql)
display(dataframe)
```
%% Output
%% Cell type:code id:1427a728 tags:
``` python
# label each row with its case id and activity
dataframe['Different Activities for all cases'] = 'Case ID: ' + dataframe['Case ID'] + ' (activity: ' + dataframe['Activity'] + ')'
# collect the resources that share the same case id and activity
res_df = dataframe.groupby(['Different Activities for all cases'])['Resource'].apply(list).to_frame()
res_df
```
%% Output
Resource
Different Activities for all cases
Case ID: case-10011 (activity: Confirmation of ... [Resource21]
Case ID: case-10011 (activity: T02 Check confir... [Resource10, Resource21]
Case ID: case-10011 (activity: T03 Adjust confi... [Resource21]
Case ID: case-10017 (activity: Confirmation of ... [Resource30]
Case ID: case-10017 (activity: T02 Check confir... [Resource30, Resource30, Resource30]
... ...
Case ID: case-9997 (activity: T02 Check confirm... [Resource06]
Case ID: case-9997 (activity: T04 Determine con... [Resource06]
Case ID: case-9997 (activity: T05 Print and sen... [Resource06]
Case ID: case-9997 (activity: T06 Determine nec... [Resource06]
Case ID: case-9997 (activity: T10 Determine nec... [Resource06]
[8332 rows x 1 columns]
%% Cell type:code id:5ca70c7e tags:
``` python
num = []
# filter the resources: drop missing values
def filter_resource():
    res = []
    for l1 in res_df['Resource']:
        l2 = list(filter(None, l1))
        res.append(l2)
        num.append(len(l2))
    return res

res_df['filtered resources'] = filter_resource()
# this column gives the number of resources per case-activity pair;
# the higher the number, the more rework across resources there is to investigate
res_df['Number of resources'] = num
res_df.reset_index()
```
%% Output
Different Activities for all cases \
0 Case ID: case-10011 (activity: Confirmation of...
1 Case ID: case-10011 (activity: T02 Check confi...
2 Case ID: case-10011 (activity: T03 Adjust conf...
3 Case ID: case-10017 (activity: Confirmation of...
4 Case ID: case-10017 (activity: T02 Check confi...
... ...
8327 Case ID: case-9997 (activity: T02 Check confir...
8328 Case ID: case-9997 (activity: T04 Determine co...
8329 Case ID: case-9997 (activity: T05 Print and se...
8330 Case ID: case-9997 (activity: T06 Determine ne...
8331 Case ID: case-9997 (activity: T10 Determine ne...
Resource \
0 [Resource21]
1 [Resource10, Resource21]
2 [Resource21]
3 [Resource30]
4 [Resource30, Resource30, Resource30]
... ...
8327 [Resource06]
8328 [Resource06]
8329 [Resource06]
8330 [Resource06]
8331 [Resource06]
filtered resources Number of resources
0 [Resource21] 1
1 [Resource10, Resource21] 2
2 [Resource21] 1
3 [Resource30] 1
4 [Resource30, Resource30, Resource30] 3
... ... ...
8327 [Resource06] 1
8328 [Resource06] 1
8329 [Resource06] 1
8330 [Resource06] 1
8331 [Resource06] 1
[8332 rows x 4 columns]
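%% Cell type:markdown id:be5f6a7b tags:
Since this section is about activities repeated by different resources, it may help to keep only the case-activity pairs handled by more than one distinct resource. A minimal sketch; the "Distinct resources" column is our own addition and, unlike "Number of resources", ignores repeats by the same resource:
%% Cell type:code id:ca7b8c9d tags:
``` python
# count distinct resources per case-activity pair and keep only true hand-overs
res_df['Distinct resources'] = res_df['filtered resources'].apply(lambda r: len(set(r)))
display(res_df[res_df['Distinct resources'] > 1])
```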
%% Cell type:markdown id:d6a0d9c6 tags:
<h2>Resource-activity performance for one timestamp</h2>
%% Cell type:code id:062f8346 tags:
``` python
pql = PQL()
pql.columns.append(PQLColumn(name="Case_id", query='"data_table_CASES"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Previous_activity", query='SOURCE("data_table"."CONCEPT:NAME")'))
pql.columns.append(PQLColumn(name="Current_activitiy", query='TARGET("data_table"."CONCEPT:NAME")'))
pql.columns.append(PQLColumn(name="Status", query='TARGET("data_table"."lifecycle:transition")'))
pql.columns.append(PQLColumn(name="Resource", query='TARGET("data_table"."org:resource")'))
pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN(SOURCE("data_table"."TIME:TIMESTAMP"), TARGET("data_table"."TIME:TIMESTAMP"))'))
dataframe = data_model.export_data_frame(pql)
display(dataframe)
```
%% Output
%% Cell type:code id:264016a4 tags:
``` python
# the average throughput time of the same activity for different resources
mean_times = dataframe.groupby(['Current_activity', 'Resource'])['Throughput_times'].mean().reset_index()
mean_times
```
%% Output
Current_activity Resource Throughput_times
0 T02 Check confirmation of receipt Resource01 75895.181632
1 T02 Check confirmation of receipt Resource02 60169.247632
2 T02 Check confirmation of receipt Resource03 45.059616
3 T02 Check confirmation of receipt Resource04 36173.443571
4 T02 Check confirmation of receipt Resource05 8926.559600
.. ... ... ...
351 T20 Print report Y to stop indication Resource08 19.479000
352 T20 Print report Y to stop indication Resource09 17.639000
353 T20 Print report Y to stop indication Resource11 25.486667
354 T20 Print report Y to stop indication Resource12 31.518000
355 T20 Print report Y to stop indication Resource13 26.884000
[356 rows x 3 columns]
%% Cell type:code id:8c24676c tags:
``` python
# get the most and least efficient resources for each activity
activities_name = mean_times['Current_activity'].unique()
min_dataframes = []
max_dataframes = []
for activity in activities_name:
    subset = mean_times[mean_times['Current_activity'] == activity]
    # rows that attain the minimum / maximum mean throughput time (ties are kept)
    min_times = subset[subset['Throughput_times'] == subset['Throughput_times'].min()]
    max_times = subset[subset['Throughput_times'] == subset['Throughput_times'].max()]
    min_dataframes.append(min_times)
    max_dataframes.append(max_times)
```
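%% Cell type:markdown id:dc9d0e1f tags:
The per-activity loop above can be expressed more compactly with groupby and idxmin/idxmax; a minimal sketch (note this keeps exactly one row per activity, silently dropping ties that the filtering above would keep):
%% Cell type:code id:eb1f2a3b tags:
``` python
# one row per activity: the index of the smallest / largest mean throughput time
fastest = mean_times.loc[mean_times.groupby('Current_activity')['Throughput_times'].idxmin()]
slowest = mean_times.loc[mean_times.groupby('Current_activity')['Throughput_times'].idxmax()]
```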
%% Cell type:code id:a54c9f1c tags:
``` python
# the smaller the average throughput time, the higher the efficiency
most_efficient_resources_per_activity = pd.concat(min_dataframes)
most_efficient_resources_per_activity
```
%% Output
Current_activity Resource \
33 T02 Check confirmation of receipt Resource35
55 T03 Adjust confirmation of receipt Resource30
90 T04 Determine confirmation of receipt Resource38
111 T05 Print and send confirmation of receipt Resource18
141 T06 Determine necessity of stop advice Resource16
163 T07-1 Draft intern advice aspect 1 Resource06
185 T07-2 Draft intern advice aspect 2 admin2
189 T07-3 Draft intern advice hold for aspect 3 Resource15
195 T07-4 Draft internal advice to hold for type 4 Resource26
203 T07-5 Draft intern advice aspect 5 Resource26
211 T08 Draft and send request for advice Resource22
218 T09-1 Process or receive external advice from ... Resource15
221 T09-2 Process or receive external advice from ... Resource26
223 T09-3 Process or receive external advice from ... Resource28
226 T09-4 Process or receive external advice from ... Resource09
254 T10 Determine necessity to stop indication Resource30
276 T11 Create document X request unlicensed admin1
291 T12 Check document X request unlicensed admin1
292 T13 Adjust document X request unlicensed Resource17
302 T14 Determine document X request unlicensed Resource15
309 T15 Print document X request unlicensed Resource03
319 T16 Report reasons to hold request Resource03
327 T17 Check report Y to stop indication Resource01
337 T18 Adjust report Y to stop indicition Resource11
341 T19 Determine report Y to stop indication Resource07
348 T20 Print report Y to stop indication Resource03
Throughput_times
33 37.659000
55 15.425500
90 21.507000
111 29.541056
141 28.894500
163 46.908000
185 0.691000
189 20.963500
195 17.331000
203 18.893000
211 56.378000
218 16.279000
221 591.956000
223 75.314250
226 15.952000
254 18.541500
276 11.243000
291 10.472000
292 10.175000
302 12.270000
309 14.797143
319 52.393500
327 18.765000
337 15.176000
341 14.121000
348 14.704000
%% Cell type:code id:8e17e3e5 tags:
``` python
# the larger the average throughput time, the lower the efficiency
least_efficient_resources_per_activity = pd.concat(max_dataframes)
least_efficient_resources_per_activity
```
%% Output
Current_activity Resource \
36 T02 Check confirmation of receipt Resource39
52 T03 Adjust confirmation of receipt Resource21
83 T04 Determine confirmation of receipt Resource28
125 T05 Print and send confirmation of receipt admin1
159 T06 Determine necessity of stop advice admin2
169 T07-1 Draft intern advice aspect 1 Resource24
183 T07-2 Draft intern advice aspect 2 Resource40
191 T07-3 Draft intern advice hold for aspect 3 Resource35
193 T07-4 Draft internal advice to hold for type 4 Resource03
206 T07-5 Draft intern advice aspect 5 admin2
210 T08 Draft and send request for advice Resource15
219 T09-1 Process or receive external advice from ... Resource22
221 T09-2 Process or receive external advice from ... Resource26
222 T09-3 Process or receive external advice from ... Resource25
228 T09-4 Process or receive external advice from ... Resource26
262 T10 Determine necessity to stop indication admin2
275 T11 Create document X request unlicensed Resource21
282 T12 Check document X request unlicensed Resource05
293 T13 Adjust document X request unlicensed admin2
304 T14 Determine document X request unlicensed Resource19
317 T15 Print document X request unlicensed admin1
325 T16 Report reasons to hold request Resource12
333 T17 Check report Y to stop indication Resource11
336 T18 Adjust report Y to stop indicition Resource05
345 T19 Determine report Y to stop indication Resource12
349 T20 Print report Y to stop indication Resource05
Throughput_times
36 3.074175e+06
52 3.744626e+06
83 1.205394e+06
125 1.916927e+05
159 2.174823e+06
169 1.894784e+06
183 1.821035e+06
191 6.175655e+05
193 2.994191e+05
206 1.740934e+06
210 7.199140e+04
219 4.669490e+02
221 5.919560e+02
222 1.037999e+06
228 2.260937e+03
262 2.407355e+06
275 7.764095e+06
282 1.999766e+05
293 1.478579e+06
304 1.965321e+05
317 5.776177e+05
325 7.326962e+05
333 1.521877e+02
336 2.032450e+01
345 9.250450e+01
349 5.209450e+01
%% Cell type:markdown id:d2ac327d tags:
<h2>Resource-activity performance for two timestamps</h2>
%% Cell type:code id:c138d822 tags:
``` python
# note: this query assumes a "start_timestamp" column, which the receipt log does not provide
pql = PQL()
pql.columns.append(PQLColumn(name="Case_id", query='"data_table_CASES"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Start_time", query='"data_table"."start_timestamp"'))
pql.columns.append(PQLColumn(name="End_time", query='"data_table"."time:timestamp"'))
pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN ( "data_table"."start_timestamp" , "data_table"."time:timestamp" )'))
dataframe = data_model.export_data_frame(pql)
```
%% Output
---------------------------------------------------------------------------
PyCelonisDataExportFailedError Traceback (most recent call last)
Input In [23], in <cell line: 9>()
6 pql.columns.append(PQLColumn(name="End_time", query='"data_table"."time:timestamp"'))
7 pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN ( "data_table"."start_timestamp" , "data_table"."time:timestamp" )'))
----> 9 dataframe = data_model.export_data_frame(pql)
File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_model.py:444, in DataModel.export_data_frame(self, query, query_environment)
440 data_export = self.create_data_export(query, ExportType.PARQUET, query_environment=query_environment)
442 data_export.wait_for_execution()
--> 444 data_frames = [read_parquet(chunk) for chunk in data_export.get_chunks()]
445 return pd.concat(data_frames).reset_index(drop=True)
File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_model.py:444, in <listcomp>(.0)
440 data_export = self.create_data_export(query, ExportType.PARQUET, query_environment=query_environment)
442 data_export.wait_for_execution()
--> 444 data_frames = [read_parquet(chunk) for chunk in data_export.get_chunks()]
445 return pd.concat(data_frames).reset_index(drop=True)
File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_export.py:79, in DataExport.get_chunks(self)
76 raise PyCelonisDataExportRunningError()
78 if data_export_status.export_status != ExportStatus.DONE:
---> 79 raise PyCelonisDataExportFailedError(data_export_status.export_status, data_export_status.message)
81 if data_export_status.export_chunks:
82 for i in range(data_export_status.export_chunks):
PyCelonisDataExportFailedError: FAILED Column "data_table"."start_timestamp" is missing.
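%% Cell type:markdown id:fd3b4c5d tags:
The export fails because the receipt log records only a single timestamp per event, so no start_timestamp column ever reached the data pool. A minimal sketch of a guard, checking the locally loaded dataframe (assuming data still holds the event log loaded above) before building a two-timestamp query:
%% Cell type:code id:af5d6e7f tags:
``` python
# the receipt log carries only time:timestamp, so this takes the second branch
if 'start_timestamp' in data.columns:
    print("start_timestamp present: the two-timestamp query can run")
else:
    print("start_timestamp missing: only single-timestamp analyses are possible")
```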
%% Cell type:code id:283d2d90 tags:
``` python
```