Yingqin Fu / Resource-Based Conformance Checking in Celonis · Merge requests · !1

Upload Sprint 1

Closed · Yashvardhan Rautela requested to merge yashvardhanrautela-main-patch-30769 into main 2 years ago

Commits: 1 · Pipelines: 0 · Changes: 1

Compare main (base) with latest version (0698dfb9, 1 commit, 2 years ago): 1 file, +583 −0

Main_Sprint_1.ipynb (new file, 0 → 100644): +583 −0
%% Cell type:markdown id:eb7c3ecc tags:
<h2>
Connecting with PyCelonis and Data Integration
</h2>
%% Cell type:code id:c14ad92b tags:
``` python
import pycelonis
import pm4py
from pycelonis.pql import PQL, PQLColumn, PQLFilter, OrderByColumn
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
```
%% Cell type:code id:756a1379 tags:
``` python
# connect to the Celonis instance
celonis = pycelonis.get_celonis(
    base_url='academic-celonis-x509kq.eu-2.celonis.cloud',
    api_token='NTg1YjI0YzUtNjI5Yi00NzRmLWE1ZmQtMWQxYzM3OGRkYWQ0OldNMk5ERkdCUUZER1FJeXdYaTY3dFprOCs5dExOd3M0czRtYXk0dmJyTFhJ',
    key_type='USER_KEY'
)
```
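Committing an API token to the repository, as above, exposes the credential. A safer variant of the same call, as a sketch; `CELONIS_API_TOKEN` is an assumed environment variable name set outside the notebook:

``` python
import os

# Sketch: read the API token from the environment instead of hard-coding it.
# CELONIS_API_TOKEN is a hypothetical variable name, set outside the notebook.
celonis = pycelonis.get_celonis(
    base_url='academic-celonis-x509kq.eu-2.celonis.cloud',
    api_token=os.environ['CELONIS_API_TOKEN'],
    key_type='USER_KEY',
)
```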
%% Cell type:code id:94d02cb7 tags:
``` python
# load the dataset
original_data = pm4py.read_xes('receipt.xes')
```
%% Output
%% Cell type:code id:8c7482d3 tags:
``` python
# the academic license allows at most 100,000 entries
if len(original_data) < 100000:
    data = original_data.copy()
else:
    # if there are more than 100,000 rows, keep only the first 100,000
    data = original_data[:100000].copy()
display(data)
```
%% Output
%% Cell type:code id:764ea3b5 tags:
``` python
# get the data pool if it already exists, otherwise create it
try:
    data_pool = celonis.data_integration.get_data_pools().find("data_pool")
    print("The data pool already exists and has been fetched successfully.")
except Exception:
    data_pool = celonis.data_integration.create_data_pool("data_pool")
    print("Data pool has been successfully created.")
```
%% Output
The data pool already exists and has been fetched successfully.
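The broad `except` above also swallows authentication and network errors. A stricter get-or-create sketch, assuming (as the `find` call suggests) that `get_data_pools()` returns an iterable collection of pool objects exposing a `name` attribute:

``` python
def get_or_create_pool(celonis, name):
    # Assumption: the returned collection is iterable and pools expose `.name`.
    pools = celonis.data_integration.get_data_pools()
    matches = [p for p in pools if p.name == name]
    return matches[0] if matches else celonis.data_integration.create_data_pool(name)

data_pool = get_or_create_pool(celonis, "data_pool")
```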
%% Cell type:code id:5f47bbd4 tags:
``` python
# get the data model if it already exists, otherwise create it
try:
    data_model = data_pool.get_data_models().find("data_model")
    print("The data model already exists and has been fetched successfully.")
except Exception:
    data_model = data_pool.create_data_model("data_model")
    print("Data model has been successfully created.")
```
%% Output
The data model already exists and has been fetched successfully.
%% Cell type:code id:3ad94e97 tags:
``` python
# create the table in the data pool, or do nothing if it already exists
try:
    data_pool.create_table(df=data, table_name="data_table", drop_if_exists=False)
    print("Successfully created table in the data pool.")
except Exception:
    data_pool.get_tables()
    print("The table already exists in the data pool.")
```
%% Output
The table already exists in the data pool.
%% Cell type:code id:6005fcc7 tags:
``` python
# add the table from the pool to the model, or do nothing if it already exists
try:
    tables = data_model.add_table(name="data_table", alias="data_table")
    print("Successfully added table from pool to model.")
except Exception:
    print("The table already exists in the data model.")
```
%% Output
The table already exists in the data model.
%% Cell type:code id:996b1eb3 tags:
``` python
# reload the data model so the newly added table becomes available
data_model.reload()
```
%% Output
%% Cell type:code id:5622a2c2 tags:
``` python
# process configuration: map the case ID, activity and timestamp columns
tables = data_model.get_tables()
activities = tables.find("data_table")
process_configuration = data_model.create_process_configuration(
    activity_table_id=activities.id,
    case_id_column="case:concept:name",
    activity_column="concept:name",
    timestamp_column="time:timestamp",
)
```
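For comparison, pm4py expects the same three mappings on the pandas side; a minimal sketch using pm4py's `format_dataframe` helper on the raw log loaded earlier:

``` python
# Sketch: the equivalent case/activity/timestamp mapping in pm4py.
df = pm4py.format_dataframe(
    data,
    case_id='case:concept:name',
    activity_key='concept:name',
    timestamp_key='time:timestamp',
)
```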
%% Cell type:markdown id:8ae95f12 tags:
<h2>
Identification of Batches
</h2>
%% Cell type:code id:aac0fafd tags:
``` python
# fetch the data pool and data model from Celonis (created during data integration)
data_integration = celonis.data_integration
data_pool = data_integration.get_data_pools().find("data_pool")
data_model = data_pool.get_data_models().find("data_model")

# create a PQL query and select the relevant columns
pql = PQL()
pql.columns.append(PQLColumn(name="Case ID", query='"data_table"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Time", query='"data_table"."time:timestamp"'))

# export the selected columns to a dataframe
dataframe = data_model.export_data_frame(pql)
```
%% Output
%% Cell type:code id:38d9a163 tags:
``` python
display(dataframe)
```
%% Output
%% Cell type:code id:9d321c08 tags:
``` python
variables = {}
# detect batches for different merge distances (in minutes)
for merge_dist in [15, 240]:
    variable_name = f"discover_batches_{merge_dist}"
    variables[variable_name] = pm4py.discover_batches(
        dataframe,
        case_id_key='Case ID',
        resource_key='Resource',
        activity_key='Activity',
        timestamp_key='Time',
        merge_distance=merge_dist * 60,  # pm4py expects seconds
    )
```
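Each element returned by `pm4py.discover_batches` is a tuple holding the (activity, resource) pair, the number of batches found for it, and a dictionary mapping batch subtypes to the detected intervals; the next cell indexes it exactly this way. An illustrative entry with made-up values:

``` python
# Hypothetical shape of one returned entry (values invented for illustration):
example_entry = (
    ("T02 Check confirmation of receipt", "Resource10"),    # (activity, resource)
    2,                                                      # number of batches
    {"Batching at start": [(1318672000.0, 1318672900.0)]},  # subtype -> intervals
)
```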
%% Cell type:code id:88c0d7e3 tags:
``` python
for variable_name, value in variables.items():
    print("When merge distance(mins) =", variable_name[len("discover_batches_"):])
    # collect one record per (activity, resource) pair
    records = []
    for activity_resource in value:
        record = {}
        record["Activity"] = activity_resource[0][0]
        record["Resource"] = activity_resource[0][1]
        record["No. of Batches"] = activity_resource[1]
        for batch_type in activity_resource[2]:
            record[batch_type] = len(activity_resource[2][batch_type])
        records.append(record)
    # build a dataframe to display the results
    batches_df = pd.DataFrame(
        records,
        columns=["Activity", "Resource", "No. of Batches", "Concurrent batching",
                 "Simultaneous", "Batching at start", "Batching at end",
                 "Sequential batching"],
    )
    batches_df = batches_df.fillna(0)
    display(batches_df)
```
%% Output
When merge distance(mins) = 15
When merge distance(mins) = 240
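To make the merge distance concrete: two candidate batch events 100 minutes apart land in one batch at a 240-minute merge distance but in separate batches at 15 minutes. A toy check with an invented gap:

``` python
gap_seconds = 100 * 60           # invented gap between two candidate events
print(gap_seconds <= 15 * 60)    # False: split into separate batches
print(gap_seconds <= 240 * 60)   # True: merged into one batch
```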
%% Cell type:markdown id:3e690f31 tags:
<h2>
Activities repeated by different resources
</h2>
%% Cell type:code id:2f3394f8 tags:
``` python
# get the activity, resource and case ID columns using PQL
pql = PQL()
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Case ID", query='"data_table"."case:concept:name"'))
dataframe = data_model.export_data_frame(pql)
display(dataframe)
```
%% Output
%% Cell type:code id:1427a728 tags:
``` python
# merge each case ID and its activities into one label
dataframe['Different Activities for all cases'] = (
    'Case ID: ' + dataframe['Case ID']
    + ' (activity: ' + dataframe['Activity'] + ')'
)
# collect the resources that share the same case ID and activity
res_df = dataframe.groupby(['Different Activities for all cases'])['Resource'].apply(list).to_frame()
res_df
```
%% Output
Resource
Different Activities for all cases
Case ID: case-10011 (activity: Confirmation of ... [Resource21]
Case ID: case-10011 (activity: T02 Check confir... [Resource10, Resource21]
Case ID: case-10011 (activity: T03 Adjust confi... [Resource21]
Case ID: case-10017 (activity: Confirmation of ... [Resource30]
Case ID: case-10017 (activity: T02 Check confir... [Resource30, Resource30, Resource30]
... ...
Case ID: case-9997 (activity: T02 Check confirm... [Resource06]
Case ID: case-9997 (activity: T04 Determine con... [Resource06]
Case ID: case-9997 (activity: T05 Print and sen... [Resource06]
Case ID: case-9997 (activity: T06 Determine nec... [Resource06]
Case ID: case-9997 (activity: T10 Determine nec... [Resource06]
[8332 rows x 1 columns]
%% Cell type:code id:5ca70c7e tags:
``` python
num = []
# filter the resources: remove missing values
def filter_resource():
    res = []
    for l1 in res_df['Resource']:
        # l2 = list(set(l1))  # alternative: deduplicate the resources
        l2 = list(filter(None, l1))
        res.append(l2)
        num.append(len(l2))
    return res

res_df['filtered resources'] = filter_resource()
# This column shows the number of resources related to an activity;
# the higher the number, the more rework is needed to identify the responsible resource.
res_df['Number of resources'] = num
res_df.reset_index()
```
%% Output
Different Activities for all cases \
0 Case ID: case-10011 (activity: Confirmation of...
1 Case ID: case-10011 (activity: T02 Check confi...
2 Case ID: case-10011 (activity: T03 Adjust conf...
3 Case ID: case-10017 (activity: Confirmation of...
4 Case ID: case-10017 (activity: T02 Check confi...
... ...
8327 Case ID: case-9997 (activity: T02 Check confir...
8328 Case ID: case-9997 (activity: T04 Determine co...
8329 Case ID: case-9997 (activity: T05 Print and se...
8330 Case ID: case-9997 (activity: T06 Determine ne...
8331 Case ID: case-9997 (activity: T10 Determine ne...
Resource \
0 [Resource21]
1 [Resource10, Resource21]
2 [Resource21]
3 [Resource30]
4 [Resource30, Resource30, Resource30]
... ...
8327 [Resource06]
8328 [Resource06]
8329 [Resource06]
8330 [Resource06]
8331 [Resource06]
filtered resources Number of resources
0 [Resource21] 1
1 [Resource10, Resource21] 2
2 [Resource21] 1
3 [Resource30] 1
4 [Resource30, Resource30, Resource30] 3
... ... ...
8327 [Resource06] 1
8328 [Resource06] 1
8329 [Resource06] 1
8330 [Resource06] 1
8331 [Resource06] 1
[8332 rows x 4 columns]
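Rows with more than one recorded resource entry are the candidates for rework analysis; a short follow-up on the columns built above isolates them:

``` python
# Keep only the case-activity combinations with more than one recorded
# resource entry (repetitions by the same resource also count).
rework_candidates = res_df[res_df['Number of resources'] > 1]
display(rework_candidates)
```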
%% Cell type:markdown id:d6a0d9c6 tags:
<h2>
Resource-activity performance for one timestamp
</h2>
%% Cell type:code id:062f8346 tags:
``` python
# pair consecutive events with SOURCE/TARGET and compute the transition times
pql = PQL()
pql.columns.append(PQLColumn(name="Case_id", query='"data_table_CASES"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Previous_activity", query='SOURCE("data_table"."concept:name")'))
pql.columns.append(PQLColumn(name="Current_activity", query='TARGET("data_table"."concept:name")'))
pql.columns.append(PQLColumn(name="Status", query='TARGET("data_table"."lifecycle:transition")'))
pql.columns.append(PQLColumn(name="Resource", query='TARGET("data_table"."org:resource")'))
pql.columns.append(PQLColumn(
    name="Throughput_times",
    query='SECONDS_BETWEEN(SOURCE("data_table"."time:timestamp"), TARGET("data_table"."time:timestamp"))',
))
dataframe = data_model.export_data_frame(pql)
display(dataframe)
```
%% Output
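PQL's SOURCE and TARGET operators pair each event with its direct successor in the same case, so `Throughput_times` above is the number of seconds between consecutive events. For intuition, a rough pandas equivalent built with `groupby` and `shift`; a sketch, assuming a hypothetical `event_df` holding the event-level export from the batching section ('Case ID', 'Activity', 'Time' columns, with 'Time' parsed as datetimes):

``` python
# Sketch: per-transition throughput via groupby + shift (event_df is assumed).
event_df = event_df.sort_values(['Case ID', 'Time'])
event_df['Previous_activity'] = event_df.groupby('Case ID')['Activity'].shift(1)
event_df['Throughput_times'] = (
    event_df['Time'] - event_df.groupby('Case ID')['Time'].shift(1)
).dt.total_seconds()
```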
%% Cell type:code id:264016a4 tags:
``` python
# the average throughput time of the same activity for different resources
mean_times = dataframe.groupby(['Current_activity', 'Resource'])['Throughput_times'].mean().reset_index()
mean_times
```
%% Output
Current_activity Resource Throughput_times
0 T02 Check confirmation of receipt Resource01 75895.181632
1 T02 Check confirmation of receipt Resource02 60169.247632
2 T02 Check confirmation of receipt Resource03 45.059616
3 T02 Check confirmation of receipt Resource04 36173.443571
4 T02 Check confirmation of receipt Resource05 8926.559600
.. ... ... ...
351 T20 Print report Y to stop indication Resource08 19.479000
352 T20 Print report Y to stop indication Resource09 17.639000
353 T20 Print report Y to stop indication Resource11 25.486667
354 T20 Print report Y to stop indication Resource12 31.518000
355 T20 Print report Y to stop indication Resource13 26.884000
[356 rows x 3 columns]
%% Cell type:code id:8c24676c tags:
``` python
# get the most and least efficient resources for each activity
activities_name = mean_times['Current_activity'].unique()
min_dataframes = []
max_dataframes = []
for activity in activities_name:
    subset = mean_times[mean_times['Current_activity'] == activity]
    min_times = subset[subset['Throughput_times'] == subset['Throughput_times'].min()]
    max_times = subset[subset['Throughput_times'] == subset['Throughput_times'].max()]
    min_times_each_activity = pd.DataFrame(min_times)
    max_times_each_activity = pd.DataFrame(max_times)
    min_dataframes.append(min_times_each_activity)
    max_dataframes.append(max_times_each_activity)
```
%% Cell type:code id:a54c9f1c tags:
``` python
# the smaller the average throughput time, the higher the efficiency
most_efficient_resources_per_activity = pd.concat(min_dataframes)
most_efficient_resources_per_activity
```
%% Output
Current_activity Resource \
33 T02 Check confirmation of receipt Resource35
55 T03 Adjust confirmation of receipt Resource30
90 T04 Determine confirmation of receipt Resource38
111 T05 Print and send confirmation of receipt Resource18
141 T06 Determine necessity of stop advice Resource16
163 T07-1 Draft intern advice aspect 1 Resource06
185 T07-2 Draft intern advice aspect 2 admin2
189 T07-3 Draft intern advice hold for aspect 3 Resource15
195 T07-4 Draft internal advice to hold for type 4 Resource26
203 T07-5 Draft intern advice aspect 5 Resource26
211 T08 Draft and send request for advice Resource22
218 T09-1 Process or receive external advice from ... Resource15
221 T09-2 Process or receive external advice from ... Resource26
223 T09-3 Process or receive external advice from ... Resource28
226 T09-4 Process or receive external advice from ... Resource09
254 T10 Determine necessity to stop indication Resource30
276 T11 Create document X request unlicensed admin1
291 T12 Check document X request unlicensed admin1
292 T13 Adjust document X request unlicensed Resource17
302 T14 Determine document X request unlicensed Resource15
309 T15 Print document X request unlicensed Resource03
319 T16 Report reasons to hold request Resource03
327 T17 Check report Y to stop indication Resource01
337 T18 Adjust report Y to stop indicition Resource11
341 T19 Determine report Y to stop indication Resource07
348 T20 Print report Y to stop indication Resource03
Throughput_times
33 37.659000
55 15.425500
90 21.507000
111 29.541056
141 28.894500
163 46.908000
185 0.691000
189 20.963500
195 17.331000
203 18.893000
211 56.378000
218 16.279000
221 591.956000
223 75.314250
226 15.952000
254 18.541500
276 11.243000
291 10.472000
292 10.175000
302 12.270000
309 14.797143
319 52.393500
327 18.765000
337 15.176000
341 14.121000
348 14.704000
%% Cell type:code id:8e17e3e5 tags:
``` python
# the larger the average throughput time, the lower the efficiency
least_efficient_resources_per_activity = pd.concat(max_dataframes)
least_efficient_resources_per_activity
%% Output
Current_activity Resource \
36 T02 Check confirmation of receipt Resource39
52 T03 Adjust confirmation of receipt Resource21
83 T04 Determine confirmation of receipt Resource28
125 T05 Print and send confirmation of receipt admin1
159 T06 Determine necessity of stop advice admin2
169 T07-1 Draft intern advice aspect 1 Resource24
183 T07-2 Draft intern advice aspect 2 Resource40
191 T07-3 Draft intern advice hold for aspect 3 Resource35
193 T07-4 Draft internal advice to hold for type 4 Resource03
206 T07-5 Draft intern advice aspect 5 admin2
210 T08 Draft and send request for advice Resource15
219 T09-1 Process or receive external advice from ... Resource22
221 T09-2 Process or receive external advice from ... Resource26
222 T09-3 Process or receive external advice from ... Resource25
228 T09-4 Process or receive external advice from ... Resource26
262 T10 Determine necessity to stop indication admin2
275 T11 Create document X request unlicensed Resource21
282 T12 Check document X request unlicensed Resource05
293 T13 Adjust document X request unlicensed admin2
304 T14 Determine document X request unlicensed Resource19
317 T15 Print document X request unlicensed admin1
325 T16 Report reasons to hold request Resource12
333 T17 Check report Y to stop indication Resource11
336 T18 Adjust report Y to stop indicition Resource05
345 T19 Determine report Y to stop indication Resource12
349 T20 Print report Y to stop indication Resource05
Throughput_times
36 3.074175e+06
52 3.744626e+06
83 1.205394e+06
125 1.916927e+05
159 2.174823e+06
169 1.894784e+06
183 1.821035e+06
191 6.175655e+05
193 2.994191e+05
206 1.740934e+06
210 7.199140e+04
219 4.669490e+02
221 5.919560e+02
222 1.037999e+06
228 2.260937e+03
262 2.407355e+06
275 7.764095e+06
282 1.999766e+05
293 1.478579e+06
304 1.965321e+05
317 5.776177e+05
325 7.326962e+05
333 1.521877e+02
336 2.032450e+01
345 9.250450e+01
349 5.209450e+01
%% Cell type:markdown id:d2ac327d tags:
<h2>
Resource-activity performance for two timestamps
</h2>
%% Cell type:code id:c138d822 tags:
``` python
pql = PQL()
pql.columns.append(PQLColumn(name="Case_id", query='"data_table_CASES"."case:concept:name"'))
pql.columns.append(PQLColumn(name="Activity", query='"data_table"."concept:name"'))
pql.columns.append(PQLColumn(name="Resource", query='"data_table"."org:resource"'))
pql.columns.append(PQLColumn(name="Start_time", query='"data_table"."start_timestamp"'))
pql.columns.append(PQLColumn(name="End_time", query='"data_table"."time:timestamp"'))
pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN ( "data_table"."start_timestamp" , "data_table"."time:timestamp" )'))

dataframe = data_model.export_data_frame(pql)
```
%% Output
---------------------------------------------------------------------------
PyCelonisDataExportFailedError            Traceback (most recent call last)
Input In [23], in <cell line: 9>()
      6 pql.columns.append(PQLColumn(name="End_time", query='"data_table"."time:timestamp"'))
      7 pql.columns.append(PQLColumn(name="Throughput_times", query='SECONDS_BETWEEN ( "data_table"."start_timestamp" , "data_table"."time:timestamp" )'))
----> 9 dataframe = data_model.export_data_frame(pql)

File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_model.py:444, in DataModel.export_data_frame(self, query, query_environment)
    440 data_export = self.create_data_export(query, ExportType.PARQUET, query_environment=query_environment)
    442 data_export.wait_for_execution()
--> 444 data_frames = [read_parquet(chunk) for chunk in data_export.get_chunks()]
    445 return pd.concat(data_frames).reset_index(drop=True)

File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_model.py:444, in <listcomp>(.0)
    440 data_export = self.create_data_export(query, ExportType.PARQUET, query_environment=query_environment)
    442 data_export.wait_for_execution()
--> 444 data_frames = [read_parquet(chunk) for chunk in data_export.get_chunks()]
    445 return pd.concat(data_frames).reset_index(drop=True)

File ~\anaconda3\lib\site-packages\pycelonis\ems\data_integration\data_export.py:79, in DataExport.get_chunks(self)
     76     raise PyCelonisDataExportRunningError()
     78 if data_export_status.export_status != ExportStatus.DONE:
---> 79     raise PyCelonisDataExportFailedError(data_export_status.export_status, data_export_status.message)
     81 if data_export_status.export_chunks:
     82     for i in range(data_export_status.export_chunks):

PyCelonisDataExportFailedError: FAILED Column "data_table"."start_timestamp" is missing.
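The export fails because the receipt log records only a single `time:timestamp` per event, so no `start_timestamp` column was ever uploaded. A quick sanity check in plain pandas, assuming `pm4py.read_xes` returned a dataframe (as recent pm4py versions do by default), confirms which columns the source frame actually has before the query references them:

``` python
# List the columns present in the uploaded event log; the PQL query should
# only reference columns that appear here.
print(original_data.columns.tolist())
```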
%% Cell type:code id:283d2d90 tags:
```
python
```