
Upload Sprint 1

Closed · Yashvardhan Rautela requested to merge yashvardhanrautela-main-patch-30769 into main
1 file changed: +583 −0
%% Cell type:markdown id:eb7c3ecc tags:
<h2>Connecting with PyCelonis and Data Integration</h2>
%% Cell type:code id:c14ad92b tags:
``` python
import pycelonis
import pm4py
from pycelonis.pql import PQL, PQLColumn, PQLFilter, OrderByColumn
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
```
%% Cell type:code id:756a1379 tags:
``` python
# connect to the Celonis EMS with a user API key
celonis = pycelonis.get_celonis(
    base_url='academic-celonis-x509kq.eu-2.celonis.cloud',
    api_token='NTg1YjI0YzUtNjI5Yi00NzRmLWE1ZmQtMWQxYzM3OGRkYWQ0OldNMk5ERkdCUUZER1FJeXdYaTY3dFprOCs5dExOd3M0czRtYXk0dmJyTFhJ',
    key_type='USER_KEY')
```
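%% Cell type:markdown id:f1e2d3c4 tags:
Hardcoding the API token in the notebook exposes the credential to anyone with repository access. Below is a minimal sketch of the same connection that reads the token from an environment variable instead; the variable name CELONIS_API_TOKEN is our own choice, not a Celonis convention.
%% Cell type:code id:a9b8c7d6 tags:
``` python
import os

# a minimal sketch, assuming the token was exported beforehand, e.g.
#   export CELONIS_API_TOKEN=...   (hypothetical variable name)
celonis = pycelonis.get_celonis(
    base_url='academic-celonis-x509kq.eu-2.celonis.cloud',
    api_token=os.environ['CELONIS_API_TOKEN'],
    key_type='USER_KEY')
```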
%% Cell type:code id:94d02cb7 tags:
``` python
# load the dataset
original_data = pm4py.read_xes('receipt.xes')
```
%% Output
%% Cell type:code id:8c7482d3 tags:
``` python
# with our academic license we can store at most 100,000 entries
if len(original_data) < 100000:
    data = original_data.copy()
else:
    # more than 100,000 rows: keep only the first 100,000
    data = original_data[:100000].copy()
display(data)
```
%% Output
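%% Cell type:markdown id:b2c3d4e5 tags:
A plain row slice can cut the last case in half, leaving a partial trace in the upload. A minimal sketch of a case-preserving alternative, assuming read_xes returned a pandas dataframe (as the slicing above already assumes):
%% Cell type:code id:c4d5e6f7 tags:
``` python
# keep only whole cases: drop the case that straddles the 100,000-row boundary
# (note: this may also drop that case when it happens to end exactly at the boundary)
if len(original_data) > 100000:
    boundary_case = original_data.iloc[99999]['case:concept:name']
    data = original_data[:100000]
    data = data[data['case:concept:name'] != boundary_case].copy()
else:
    data = original_data.copy()
```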
%% Cell type:code id:764ea3b5 tags:
``` python
# create the data pool, or fetch it if it already exists
try:
    data_pool = celonis.data_integration.get_data_pools().find("data_pool")
    print("The data pool already exists and has been fetched successfully.")
except Exception:
    data_pool = celonis.data_integration.create_data_pool("data_pool")
    print("Data pool has been successfully created.")
```
%% Output
The data pool already exists and has been fetched successfully.
%% Cell type:code id:5f47bbd4 tags:
``` python
# create the data model, or fetch it if it already exists
try:
    data_model = data_pool.get_data_models().find("data_model")
    print("The data model already exists and has been fetched successfully.")
except Exception:
    data_model = data_pool.create_data_model("data_model")
    print("Data model has been successfully created.")
```
%% Output
The data model already exists and has been fetched successfully.
%% Cell type:code id:3ad94e97 tags:
``` python
# create the table in the data pool, or do nothing if it already exists
try:
    data_pool.create_table(df=data, table_name="data_table", drop_if_exists=False)
    print("Successfully created table in the data pool.")
except Exception:
    print("The table already exists in the data pool.")
```
%% Output
The table already exists in the data pool.
%% Cell type:code id:6005fcc7 tags:
``` python
# add the table from the pool to the model, or do nothing if it is already there
try:
    tables = data_model.add_table(name="data_table", alias="data_table")
    print("Successfully added table from pool to model.")
except Exception:
    print("The table already exists in the data model.")
```
%% Output
The table already exists in the data model.
%% Cell type:code id:996b1eb3 tags:
``` python
data_model.reload()
```
%% Output
%% Cell type:code id:5622a2c2 tags:
``` python
# process configuration: tell Celonis which columns identify case, activity and timestamp
tables = data_model.get_tables()
activities = tables.find("data_table")
process_configuration = data_model.create_process_configuration(
    activity_table_id=activities.id,
    case_id_column="case:concept:name",
    activity_column="concept:name",
    timestamp_column="time:timestamp",
)
```
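%% Cell type:markdown id:d6e7f8a9 tags:
After changing the process configuration, the data model generally has to be reloaded again before queries reflect it; a minimal sketch reusing the same reload() call from above:
%% Cell type:code id:e8f9a0b1 tags:
``` python
# reload so the new process configuration is picked up by the PQL queries below
data_model.reload()
```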
%% Cell type:markdown id:8ae95f12 tags:
<h2>Identification of Batches</h2>
%% Cell type:code id:aac0fafd tags:
``` python
# fetch the data pool and data model created in the Data Integration section above
data_integration = celonis.data_integration
data_pool = data_integration.get_data_pools().find("data_pool")
data_model = data_pool.get_data_models().find("data_model")
# create a PQL object and declare the columns to extract
pql = PQL()
pql.columns.append(PQLColumn(name="Case ID", query='"data_table"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Time", query='"data_table"."time:timestamp"'))
# export the columns to a pandas dataframe
dataframe = data_model.export_data_frame(pql)
```
%% Output
%% Cell type:code id:38d9a163 tags:
``` python
display(dataframe)
```
%% Output
%% Cell type:code id:9d321c08 tags:
``` python
variables = {}
# detect batches for two different merge distances (in minutes)
for merge_dist in [15, 240]:
    variable_name = f"discover_batches_{merge_dist}"
    variables[variable_name] = pm4py.discover_batches(
        dataframe, case_id_key='Case ID', resource_key='Resource',
        activity_key='Activity', timestamp_key='Time',
        merge_distance=merge_dist * 60)  # merge distance is given in seconds
```
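%% Cell type:markdown id:fa0b1c2d tags:
pm4py.discover_batches returns one entry per (activity, resource) pair: the pair itself, the number of batches, and a dictionary mapping each batch type to its intervals. A minimal sketch that peeks at the first entry, assuming at least one batch was found:
%% Cell type:code id:ac3d4e5f tags:
``` python
# each entry: ((activity, resource), number_of_batches, {batch_type: intervals})
sample = variables["discover_batches_15"]
if sample:
    (activity, resource), n_batches, batch_types = sample[0]
    print(activity, "|", resource, "|", n_batches, "batches:", list(batch_types))
```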
%% Cell type:code id:88c0d7e3 tags:
``` python
for variable_name, value in variables.items():
    print("When merge distance (mins) =", variable_name[17:])
    # collect one record per (activity, resource) pair
    records = []
    for activity_resource in value:
        record = {}
        record["Activity"] = activity_resource[0][0]
        record["Resource"] = activity_resource[0][1]
        record["No. of Batches"] = activity_resource[1]
        for batch_type in activity_resource[2]:
            record[batch_type] = len(activity_resource[2][batch_type])
        records.append(record)
    # DataFrame.append was removed in pandas 2.0, so build the frame in one go
    batches_df = pd.DataFrame(records, columns=["Activity", "Resource", "No. of Batches",
                                                "Concurrent batching", "Simultaneous",
                                                "Batching at start", "Batching at end",
                                                "Sequential batching"]).fillna(0)
    display(batches_df)
```
%% Output
When merge distance (mins) = 15
When merge distance (mins) = 240
%% Cell type:markdown id:3e690f31 tags:
<h2>Activities repeated by different resources</h2>
%% Cell type:code id:2f3394f8 tags:
``` python
# build the PQL query and export the result to a dataframe
pql = PQL()
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Case ID", query='"data_table"."case:concept:name"'))
dataframe = data_model.export_data_frame(pql)
display(dataframe)
```
%% Output
%% Cell type:code id:1427a728 tags:
``` python
# label each row with its case id and activity
dataframe['Different Activities for all cases'] = 'Case ID: ' + dataframe['Case ID'] + ' (activity: ' + dataframe['Activity'] + ')'
# collect the resources that share the same case id and activity
res_df = dataframe.groupby(['Different Activities for all cases'])['Resource'].apply(list).to_frame()
res_df
```
%% Output
Resource
Different Activities for all cases
Case ID: case-10011 (activity: Confirmation of ... [Resource21]
Case ID: case-10011 (activity: T02 Check confir... [Resource10, Resource21]
Case ID: case-10011 (activity: T03 Adjust confi... [Resource21]
Case ID: case-10017 (activity: Confirmation of ... [Resource30]
Case ID: case-10017 (activity: T02 Check confir... [Resource30, Resource30, Resource30]
... ...
Case ID: case-9997 (activity: T02 Check confirm... [Resource06]
Case ID: case-9997 (activity: T04 Determine con... [Resource06]
Case ID: case-9997 (activity: T05 Print and sen... [Resource06]
Case ID: case-9997 (activity: T06 Determine nec... [Resource06]
Case ID: case-9997 (activity: T10 Determine nec... [Resource06]
[8332 rows x 1 columns]
%% Cell type:code id:5ca70c7e tags:
``` python
num = []
# filter the resources: drop missing values
def filter_resource():
    res = []
    for l1 in res_df['Resource']:
        l2 = list(filter(None, l1))
        res.append(l2)
        num.append(len(l2))
    return res

res_df['filtered resources'] = filter_resource()
# this column gives the number of resources per case-activity pair;
# the higher the number, the more rework across resources there is to investigate
res_df['Number of resources'] = num
res_df.reset_index()
```
%% Output
Different Activities for all cases \
0 Case ID: case-10011 (activity: Confirmation of...
1 Case ID: case-10011 (activity: T02 Check confi...
2 Case ID: case-10011 (activity: T03 Adjust conf...
3 Case ID: case-10017 (activity: Confirmation of...
4 Case ID: case-10017 (activity: T02 Check confi...
... ...
8327 Case ID: case-9997 (activity: T02 Check confir...
8328 Case ID: case-9997 (activity: T04 Determine co...
8329 Case ID: case-9997 (activity: T05 Print and se...
8330 Case ID: case-9997 (activity: T06 Determine ne...
8331 Case ID: case-9997 (activity: T10 Determine ne...
Resource \
0 [Resource21]
1 [Resource10, Resource21]
2 [Resource21]
3 [Resource30]
4 [Resource30, Resource30, Resource30]
... ...
8327 [Resource06]
8328 [Resource06]
8329 [Resource06]
8330 [Resource06]
8331 [Resource06]
filtered resources Number of resources
0 [Resource21] 1
1 [Resource10, Resource21] 2
2 [Resource21] 1
3 [Resource30] 1
4 [Resource30, Resource30, Resource30] 3
... ... ...
8327 [Resource06] 1
8328 [Resource06] 1
8329 [Resource06] 1
8330 [Resource06] 1
8331 [Resource06] 1
[8332 rows x 4 columns]
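%% Cell type:markdown id:be5f6a7b tags:
Since this section is about activities repeated by different resources, it may help to keep only the case-activity pairs handled by more than one distinct resource. A minimal sketch; the "Distinct resources" column is our own addition and, unlike "Number of resources", ignores repeats by the same resource:
%% Cell type:code id:ca7b8c9d tags:
``` python
# count distinct resources per case-activity pair and keep only true hand-overs
res_df['Distinct resources'] = res_df['filtered resources'].apply(lambda r: len(set(r)))
display(res_df[res_df['Distinct resources'] > 1])
```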
%% Cell type:markdown id:d6a0d9c6 tags:
<h2>Resource-activity performance for one timestamp</h2>
%% Cell type:code id:062f8346 tags:
``` python
pql = PQL()
pql.columns.append(PQLColumn(name="Case_id", query='"data_table_CASES"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Previous_activity", query='SOURCE("data_table"."CONCEPT:NAME")'))
pql.columns.append(PQLColumn(name="Current_activitiy", query='TARGET("data_table"."CONCEPT:NAME")'))
pql.columns.append(PQLColumn(name="Status", query='TARGET("data_table"."lifecycle:transition")'))
pql.columns.append(PQLColumn(name="Resource", query='TARGET("data_table"."org:resource")'))
pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN(SOURCE("data_table"."TIME:TIMESTAMP"), TARGET("data_table"."TIME:TIMESTAMP"))'))
dataframe = data_model.export_data_frame(pql)
display(dataframe)
```
%% Output
%% Cell type:code id:264016a4 tags:
``` python
# the average throughput time of the same activity for different resources
mean_times = dataframe.groupby(['Current_activity', 'Resource'])['Throughput_times'].mean().reset_index()
mean_times
```
%% Output
Current_activity Resource Throughput_times
0 T02 Check confirmation of receipt Resource01 75895.181632
1 T02 Check confirmation of receipt Resource02 60169.247632
2 T02 Check confirmation of receipt Resource03 45.059616
3 T02 Check confirmation of receipt Resource04 36173.443571
4 T02 Check confirmation of receipt Resource05 8926.559600
.. ... ... ...
351 T20 Print report Y to stop indication Resource08 19.479000
352 T20 Print report Y to stop indication Resource09 17.639000
353 T20 Print report Y to stop indication Resource11 25.486667
354 T20 Print report Y to stop indication Resource12 31.518000
355 T20 Print report Y to stop indication Resource13 26.884000
[356 rows x 3 columns]
%% Cell type:code id:8c24676c tags:
``` python
# get the most and least efficient resources for each activity
activities_name = mean_times['Current_activity'].unique()
min_dataframes = []
max_dataframes = []
for activity in activities_name:
    subset = mean_times[mean_times['Current_activity'] == activity]
    # rows that attain the minimum / maximum mean throughput time (ties are kept)
    min_times = subset[subset['Throughput_times'] == subset['Throughput_times'].min()]
    max_times = subset[subset['Throughput_times'] == subset['Throughput_times'].max()]
    min_dataframes.append(min_times)
    max_dataframes.append(max_times)
```
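%% Cell type:markdown id:dc9d0e1f tags:
The per-activity loop above can be expressed more compactly with groupby and idxmin/idxmax; a minimal sketch (note this keeps exactly one row per activity, silently dropping ties that the filtering above would keep):
%% Cell type:code id:eb1f2a3b tags:
``` python
# one row per activity: the index of the smallest / largest mean throughput time
fastest = mean_times.loc[mean_times.groupby('Current_activity')['Throughput_times'].idxmin()]
slowest = mean_times.loc[mean_times.groupby('Current_activity')['Throughput_times'].idxmax()]
```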
%% Cell type:code id:a54c9f1c tags:
``` python
# the smaller the average throughput time, the higher the efficiency
most_efficient_resources_per_activity = pd.concat(min_dataframes)
most_efficient_resources_per_activity
```
%% Output
Current_activity Resource \
33 T02 Check confirmation of receipt Resource35
55 T03 Adjust confirmation of receipt Resource30
90 T04 Determine confirmation of receipt Resource38
111 T05 Print and send confirmation of receipt Resource18
141 T06 Determine necessity of stop advice Resource16
163 T07-1 Draft intern advice aspect 1 Resource06
185 T07-2 Draft intern advice aspect 2 admin2
189 T07-3 Draft intern advice hold for aspect 3 Resource15
195 T07-4 Draft internal advice to hold for type 4 Resource26
203 T07-5 Draft intern advice aspect 5 Resource26
211 T08 Draft and send request for advice Resource22
218 T09-1 Process or receive external advice from ... Resource15
221 T09-2 Process or receive external advice from ... Resource26
223 T09-3 Process or receive external advice from ... Resource28
226 T09-4 Process or receive external advice from ... Resource09
254 T10 Determine necessity to stop indication Resource30
276 T11 Create document X request unlicensed admin1
291 T12 Check document X request unlicensed admin1
292 T13 Adjust document X request unlicensed Resource17
302 T14 Determine document X request unlicensed Resource15
309 T15 Print document X request unlicensed Resource03
319 T16 Report reasons to hold request Resource03
327 T17 Check report Y to stop indication Resource01
337 T18 Adjust report Y to stop indicition Resource11
341 T19 Determine report Y to stop indication Resource07
348 T20 Print report Y to stop indication Resource03
Throughput_times
33 37.659000
55 15.425500
90 21.507000
111 29.541056
141 28.894500
163 46.908000
185 0.691000
189 20.963500
195 17.331000
203 18.893000
211 56.378000
218 16.279000
221 591.956000
223 75.314250
226 15.952000
254 18.541500
276 11.243000
291 10.472000
292 10.175000
302 12.270000
309 14.797143
319 52.393500
327 18.765000
337 15.176000
341 14.121000
348 14.704000
%% Cell type:code id:8e17e3e5 tags:
``` python
# the larger the average throughput time, the lower the efficiency
least_efficient_resources_per_activity = pd.concat(max_dataframes)
least_efficient_resources_per_activity
```
%% Output
Current_activity Resource \
36 T02 Check confirmation of receipt Resource39
52 T03 Adjust confirmation of receipt Resource21
83 T04 Determine confirmation of receipt Resource28
125 T05 Print and send confirmation of receipt admin1
159 T06 Determine necessity of stop advice admin2
169 T07-1 Draft intern advice aspect 1 Resource24
183 T07-2 Draft intern advice aspect 2 Resource40
191 T07-3 Draft intern advice hold for aspect 3 Resource35
193 T07-4 Draft internal advice to hold for type 4 Resource03
206 T07-5 Draft intern advice aspect 5 admin2
210 T08 Draft and send request for advice Resource15
219 T09-1 Process or receive external advice from ... Resource22
221 T09-2 Process or receive external advice from ... Resource26
222 T09-3 Process or receive external advice from ... Resource25
228 T09-4 Process or receive external advice from ... Resource26
262 T10 Determine necessity to stop indication admin2
275 T11 Create document X request unlicensed Resource21
282 T12 Check document X request unlicensed Resource05
293 T13 Adjust document X request unlicensed admin2
304 T14 Determine document X request unlicensed Resource19
317 T15 Print document X request unlicensed admin1
325 T16 Report reasons to hold request Resource12
333 T17 Check report Y to stop indication Resource11
336 T18 Adjust report Y to stop indicition Resource05
345 T19 Determine report Y to stop indication Resource12
349 T20 Print report Y to stop indication Resource05
Throughput_times
36 3.074175e+06
52 3.744626e+06
83 1.205394e+06
125 1.916927e+05
159 2.174823e+06
169 1.894784e+06
183 1.821035e+06
191 6.175655e+05
193 2.994191e+05
206 1.740934e+06
210 7.199140e+04
219 4.669490e+02
221 5.919560e+02
222 1.037999e+06
228 2.260937e+03
262 2.407355e+06
275 7.764095e+06
282 1.999766e+05
293 1.478579e+06
304 1.965321e+05
317 5.776177e+05
325 7.326962e+05
333 1.521877e+02
336 2.032450e+01
345 9.250450e+01
349 5.209450e+01
%% Cell type:markdown id:d2ac327d tags:
<h2>Resource-activity performance for two timestamps</h2>
%% Cell type:code id:c138d822 tags:
``` python
# note: this query assumes a "start_timestamp" column, which the receipt log does not provide
pql = PQL()
pql.columns.append(PQLColumn(name="Case_id", query='"data_table_CASES"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Start_time", query='"data_table"."start_timestamp"'))
pql.columns.append(PQLColumn(name="End_time", query='"data_table"."time:timestamp"'))
pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN ( "data_table"."start_timestamp" , "data_table"."time:timestamp" )'))
dataframe = data_model.export_data_frame(pql)
```
%% Output
---------------------------------------------------------------------------
PyCelonisDataExportFailedError Traceback (most recent call last)
Input In [23], in <cell line: 9>()
6 pql.columns.append(PQLColumn(name="End_time", query='"data_table"."time:timestamp"'))
7 pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN ( "data_table"."start_timestamp" , "data_table"."time:timestamp" )'))
----> 9 dataframe = data_model.export_data_frame(pql)
File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_model.py:444, in DataModel.export_data_frame(self, query, query_environment)
440 data_export = self.create_data_export(query, ExportType.PARQUET, query_environment=query_environment)
442 data_export.wait_for_execution()
--> 444 data_frames = [read_parquet(chunk) for chunk in data_export.get_chunks()]
445 return pd.concat(data_frames).reset_index(drop=True)
File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_model.py:444, in <listcomp>(.0)
440 data_export = self.create_data_export(query, ExportType.PARQUET, query_environment=query_environment)
442 data_export.wait_for_execution()
--> 444 data_frames = [read_parquet(chunk) for chunk in data_export.get_chunks()]
445 return pd.concat(data_frames).reset_index(drop=True)
File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_export.py:79, in DataExport.get_chunks(self)
76 raise PyCelonisDataExportRunningError()
78 if data_export_status.export_status != ExportStatus.DONE:
---> 79 raise PyCelonisDataExportFailedError(data_export_status.export_status, data_export_status.message)
81 if data_export_status.export_chunks:
82 for i in range(data_export_status.export_chunks):
PyCelonisDataExportFailedError: FAILED Column "data_table"."start_timestamp" is missing.
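%% Cell type:markdown id:fd3b4c5d tags:
The export fails because the receipt log records only a single timestamp per event, so no start_timestamp column ever reached the data pool. A minimal sketch of a guard, checking the locally loaded dataframe (assuming data still holds the event log loaded above) before building a two-timestamp query:
%% Cell type:code id:af5d6e7f tags:
``` python
# the receipt log carries only time:timestamp, so this takes the second branch
if 'start_timestamp' in data.columns:
    print("start_timestamp present: the two-timestamp query can run")
else:
    print("start_timestamp missing: only single-timestamp analyses are possible")
```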
%% Cell type:code id:283d2d90 tags:
``` python
```