#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 13:20:59 2025
@author: aedrich
"""
import numpy as np
import pandas as pd
import netCDF4 as nc
import os
import logging
import re

import settings
from utilities.ncfile_generation import generate_basic_ncfile
from utilities.strings_for_ncfile import char_to_string, features_to_char


class comparison_training_prediction_dataset:
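    """
    Compares and harmonises the features of the training and prediction
    datasets so that a model trained on one can be applied to the other.
    On instantiation both datasets are imported, their features compared,
    instances that cannot be predicted reliably are flagged, and both
    datasets are saved back to disk.
    """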
def __init__(self, logger):
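        """Run the import, comparison, cleaning and saving workflow."""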
self.logger = logger
self.error = False
self.import_prediction_dataset()
self.import_training_dataset()
self.compare_features()
if not self.error:
self.additional_instances_to_drop()
self.save_prediction_dataset()
self.save_training_dataset()
def import_prediction_dataset(self):
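        """
        Import the prediction dataset from the netCDF file referenced in
        settings.path_pred: the feature table, the feature names and the
        indices of previously dropped instances.
        """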
        ds = nc.Dataset(settings.path_pred)
        pred = ds['Result'][:, :].data
        pred_features = ds['features'][:].data
        self.idx = [int(x) for x in ds['Dropped'][:].data]
        ds.close()

        self.feature_list = char_to_string(pred_features)
        if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list:
            self.pred = pd.DataFrame(pred, columns=self.feature_list)
        else:
            self.pred = pd.DataFrame(
                pred, columns=['xcoord', 'ycoord'] + self.feature_list)

        # Keep the coordinates separately so they can be re-attached on save
        self.xy = pd.DataFrame()
        self.xy['ycoord'] = self.pred['ycoord']
        self.xy['xcoord'] = self.pred['xcoord']

        # Remove features that shall not be used for prediction
        if len(settings.not_included_pred_data) > 0:
            for dataset in settings.not_included_pred_data:
                if dataset in self.pred.columns.tolist():
                    self.pred = self.pred.drop(dataset, axis=1)

        self.logger.info('Prediction dataset imported')
        self.logger.info(
            'The following ' + str(len(self.pred.columns.tolist()))
            + ' features are included in the prediction dataset: '
            + str(self.pred.columns.tolist()))
def import_training_dataset(self):
# Import training dataset as csv file
self.train = pd.read_csv(settings.path_train)
# Extract and remove labels from training dataset
        self.labels = np.array(self.train['label']).reshape(-1, 1)
self.xy_train = pd.DataFrame()
self.xy_train['ID'] = self.train['ID']
self.xy_train['label'] = self.train['label']
self.xy_train['ycoord'] = self.train['ycoord']
self.xy_train['xcoord'] = self.train['xcoord']
self.train = self.train.drop(['xcoord', 'ycoord', 'ID', 'label'], axis=1)
if len(settings.not_included_train_data) > 0:
for dataset in settings.not_included_train_data:
if dataset in self.train.columns.tolist():
self.train = self.train.drop(dataset, axis=1)
self.logger.info('Training dataset imported')
self.logger.info('The following ' + str(len(self.train.columns.tolist()))
+ ' features are included in the training dataset: '
+ str(self.train.columns.tolist()))
def compare_features(self):
"""
It is assessed if all features in the training dataset also appear
in the prediction dataset. If that is not the case, the training
process will be relaunched with an adapted training dataset where the
feature(s) that is/are not contrained in the training dataset are
removed. The second trained model will be stored in a seperate
folder which is named <old_folder_name>_retrain.
If more features appear in the prediction dataset, the additional
features are removed.
"""
self.logger.info('Features are compared between training and prediction dataset')
if set(self.train.columns) == set(self.pred.columns):
self.logger.info('Features are identical in both training and prediction dataset')
self.pred = self.pred[self.train.columns]
self.logger.info('Potentially varying order of features has been fixed')
self.error = False
else:
self.logger.warning('Features are not identical in the training and prediction dataset')
extra_in_pred = set(self.pred.columns) - set(self.train.columns)
extra_in_train = set(self.train.columns) - set(self.pred.columns)
if len(extra_in_pred) > 0 and len(extra_in_train) == 0:
self.logger.warning('More features in prediction dataset, additional features are removed')
self.pred = self.pred[self.train.columns]
self.error = False
            elif len(extra_in_train) > 0 and len(extra_in_pred) == 0:
                self.logger.warning('More features in training dataset, additional features are removed')
                self.train = self.train[self.pred.columns]
                self.error = False
            elif len(extra_in_train) > 0 and len(extra_in_pred) > 0:
                self.logger.warning('There are mismatching features in both datasets')
                self.common_columns = self.train.columns.intersection(self.pred.columns)
                if len(self.common_columns) == 0:
                    self.logger.error('Error: No common columns in training and prediction dataset')
                    self.error = True
                else:
                    if len(self.common_columns) < 6:
                        self.logger.warning('Warning: only '
                                            + str(len(self.common_columns))
                                            + ' common columns in training and prediction dataset')
                    else:
                        self.logger.info(str(len(self.common_columns))
                                         + ' common columns in training and prediction dataset')
                    self.error = False
                    self.train = self.train[self.common_columns]
                    self.pred = self.pred[self.common_columns]
else:
self.logger.error('Error: Unknown issue detected. Check features manually!')
self.error = True
self.logger.info('Feature comparison completed')
def additional_instances_to_drop(self):
"""
All instances that have a value of zero in all columns of a categorical
feature are identified and appended to the list of instances for which
a reliable prediction is not possible.
Input:
master: related to information display in external window
logger: related to generation of a process log
row: related to information display in external window, int
idx: Previously defined instances for which prediction is not
possible, list
pred: prediction dataset, pandas DataFrame
Output:
idx: Updated list of instances for which prediction is not
possible, list
row: Updated row information related to information display in
external window, int
"""
self.logger.info('Start identification of instances that are not represented by at least one categorical feature')
columns = self.pred.columns
# Regular expression to match "<feature>_<value>_encoded"
pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
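        # e.g. both 'landcover_3_encoded' and 'landcover_encoded' (names used
        # for illustration only) map to the base feature name 'landcover'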
encoded_features = {pattern.match(col).group(1) for col in columns if pattern.match(col)}
self.logger.info('Identified encoded features: ' + str(encoded_features))
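        # A row that is zero in every column of a one-hot encoded feature
        # belongs to none of that feature's categories, so the model cannot
        # represent it and the instance is flagged for dropping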
        # Count only instances newly added to self.idx so that rows flagged
        # by several features are not counted twice
        num_before = len(self.idx)
        for feature in encoded_features:
            feature_cols = [col for col in self.pred.columns
                            if col.startswith(feature)
                            and col.endswith("_encoded")]
            all_zero_rows = (self.pred[feature_cols] == 0).all(axis=1)
            all_zero_rows = self.pred.index[all_zero_rows].tolist()
            self.idx = list(set(self.idx + all_zero_rows))
        self.logger.info(
            str(len(self.idx) - num_before)
            + ' instances have been identified that are not represented by'
            + ' at least one categorical feature')
def save_prediction_dataset(self):
"""
Save prediction dataset and information on dropped rows as nc-file
"""
self.pred = pd.concat([self.xy, self.pred], axis=1)
pred = self.pred.to_numpy()
char_features = features_to_char(self.pred.columns)
outfile = settings.path_pred
self.logger.info('Prediction dataset is saved to ' + outfile)
if os.path.exists(outfile):
os.remove(outfile)
ds = generate_basic_ncfile(outfile, crs=None)
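        # Note: the 'lat' and 'lon' dimensions hold the row and column counts
        # of the flattened feature table, not a geographic grid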
ds.createDimension('lat', (np.shape(pred)[0]))
ds.createDimension('lon', (np.shape(pred)[1]))
ds.createDimension('ix', (len(self.idx)))
ds.createDimension('feat', len(char_features))
result = ds.createVariable('Result', 'f4', ('lat', 'lon'))
dropped = ds.createVariable('Dropped', 'u8', 'ix')
Features = ds.createVariable('features', 'S1', 'feat')
result[:, :] = pred
dropped[:] = np.array(self.idx)
Features[:] = char_features
ds.close()
def save_training_dataset(self):
"""
Save dataframe as csv. If necessary folder is created.
"""
self.logger.info('Saving of training data in progress')
outfile = settings.path_train
# If outfile exists already, delete
if os.path.exists(outfile):
os.remove(outfile)
self.train = pd.concat([self.xy_train, self.train], axis=1)
# Save dataframe as csv
self.train.to_csv(outfile, sep=',', index=False)
self.logger.info('Training dataset saved')
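

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: it assumes that
    # settings.path_pred, settings.path_train, settings.not_included_pred_data
    # and settings.not_included_train_data are configured appropriately.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    comparison = comparison_training_prediction_dataset(logger)
    if comparison.error:
        logger.error('Feature comparison failed, datasets were not saved')
    else:
        logger.info('Training and prediction datasets have been harmonised')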