#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import netCDF4 as nc
import pickle as pkl
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score
from joblib import Parallel, delayed
import settings
from utilities.ncfile_generation import generate_ncfile
from utilities.strings_for_ncfile import char_to_string
class prepare_data:
"""
This class prepares the data to be
used in the Random Forest classifier.
"""
    def __init__(self, aim, logger):
        self.aim = aim
        self.logger = logger
        invalid = False
        if aim == 'train_test':
            print('Train the model')
        elif aim == 'prediction':
            print('Prepare the hazard map')
        else:
            print('Not a valid command. Enter train_test or prediction.')
            invalid = True
if not invalid:
self.test_size = settings.size # Size of validation dataset
# Column name of label in training dataset
self.label_name = 'label'
self.xy = pd.DataFrame() # Array to save coordinates for reshaping
if aim == 'train_test':
self.import_features_labels() # Prepare the training
# Generate train/validation dataset
self.split_training_testing()
elif aim == 'prediction':
self.import_features() # Import prediction dataset
def import_features(self):
"""
Imports the features for prediction.
"""
        # Import the prediction dataset, either from a directory
        # (using the default file name) or directly from an nc file
if os.path.isdir(settings.path_pred):
path_pred = settings.path_pred + 'prediction.nc'
else:
path_pred = settings.path_pred
ds = nc.Dataset(path_pred)
pred = ds['Result'][:, :].data
pred_features = ds['features'][:].data
self.feature_list = char_to_string(pred_features)
self.features = pd.DataFrame(pred, columns=self.feature_list)
self.dropped = ds['Dropped'][:].data
self.dropped = [int(x) for x in self.dropped]
        # Store the coordinates of the prediction points for reshaping later
self.xy['ycoord'] = self.features['ycoord']
self.xy['xcoord'] = self.features['xcoord']
self.features = self.features.drop(['xcoord', 'ycoord'], axis=1)
self.feature_list = list(self.features.columns)
self.features = np.array(self.features)
self.logger.info('Features for prediction were imported')
self.logger.info('The following '
+ str(len(self.feature_list))
+ ' features are included in the prediction dataset: '
+ str(self.feature_list))
def import_features_labels(self):
"""
Imports the features for training.
"""
# Import training dataset as csv file
if os.path.isdir(settings.path_train):
self.features = pd.read_csv(settings.path_train + 'training.csv')
else:
self.features = pd.read_csv(settings.path_train)
# Extract and remove labels from training dataset
        self.labels = np.array(
            self.features[self.label_name]).reshape(-1, 1)
self.features = self.features.drop(self.label_name, axis=1)
# Store coordinates from training data
self.xy['ycoord'] = self.features['ycoord']
self.xy['xcoord'] = self.features['xcoord']
self.features = self.features.drop(['xcoord', 'ycoord', 'ID'], axis=1)
self.feature_list = list(self.features.columns)
self.features = np.array(self.features)
self.logger.info('Features for training were imported')
self.logger.info('The following ' + str(len(self.feature_list))
+ ' features are included in the training dataset: '
+ str(self.feature_list))
def split_training_testing(self):
"""
Splits the training data into training and validation data.
"""
self.train_features, self.test_features, self.train_labels, self.test_labels = \
train_test_split(self.features,
self.labels,
test_size=self.test_size,
random_state=settings.random_seed,
stratify=self.labels)
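        # stratify keeps the class ratio identical in both subsets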
print('Data split')
self.logger.info('Training data split in training and test dataset')
# The class statement below was missing in the source; the name
# RandomForest is assumed, but the super().__init__ call shows that it
# extends prepare_data.
class RandomForest(prepare_data):

    """
    This class defines, trains and applies
    the Random Forest classifier.
    """

    def __init__(self, aim, parallel=False, log=None):
super().__init__(aim, log)
self.aim = aim
self.parallel = parallel
self.logger = log
self.num_chunks = 10
# Random Forest settings
self.criterion = settings.criterion
self.n_estimators = settings.num_trees
self.max_depth = settings.depth
self.model_dir = settings.model_database_dir
self.model_to_load = settings.model_to_load
self.model_to_save = settings.model_to_save
self.output_dir = None
if aim == 'train_test':
print('Model is trained')
self.define()
self.train()
self.predict()
self.evaluate()
self.create_output_dir()
self.save_model()
self.save_parameters()
self.feature_importance()
elif aim == 'prediction':
print('Prediction is performed')
self.create_output_dir()
self.load_model()
self.predict()
self.extract_pos_neg_predictions()
self.reshape_prediction()
self.save_prediction()
def define(self):
"""
Define the Random Forest Classifier model.
"""
        # Pass the criterion from the settings as well; it was read in
        # __init__ and is stored in the model metadata
        self.model = RandomForestClassifier(n_estimators=self.n_estimators,
                                            criterion=self.criterion,
                                            max_depth=self.max_depth,
                                            random_state=settings.random_seed)
self.logger.info('Model is defined')
def train(self):
"""
Train the Random Forest Classifier model.
"""
self.model.fit(self.train_features, np.ravel(self.train_labels))
self.logger.info('Model is trained')
def predict(self):
"""
Make the prediction.
"""
print('Predicting...')
self.logger.info('Predicting...')
if self.aim == 'prediction':
pred = self.features
elif self.aim == 'train_test':
pred = self.test_features
        if self.parallel:
            # Split the dataset and predict the chunks in parallel
            self.split_array_into_chunks(pred)
            prediction = Parallel(n_jobs=self.num_chunks)(
                delayed(self.model.predict)(chunk) for chunk in self.chunks)
            self.prediction = np.concatenate(prediction, axis=0)
        else:
            self.prediction = self.model.predict(pred)
def split_array_into_chunks(self, pred):
"""
Split a NumPy array into chunks without changing the number of columns.
"""
self.logger.info('Prediction dataset is split in chunks')
# Calculate the number of rows in each chunk
rows_per_chunk = pred.shape[0] // self.num_chunks
remaining_rows = pred.shape[0] % self.num_chunks
# Create chunks
        self.chunks = []
        start = 0
        length = 0  # Running total of rows assigned to chunks
        for i in range(self.num_chunks):
            # The first `remaining_rows` chunks receive one extra row
            end = start + rows_per_chunk + (1 if i < remaining_rows else 0)
            chunk = pred[start:end, :]
            self.chunks.append(chunk)
            start = end
            length = length + len(chunk)
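        # Worked example: 105 rows split into num_chunks == 10 gives
        # rows_per_chunk == 10 and remaining_rows == 5, so the first five
        # chunks hold 11 rows each and the last five hold 10 (105 in total).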
    def evaluate(self):
        """
        Evaluate the model on the validation dataset.
        """
        self.logger.info('Model is evaluated')
        y_pred_prob = self.model.predict_proba(self.test_features)[:, 1]
        self.fpr, self.tpr, self.thresholds = roc_curve(self.test_labels,
                                                        y_pred_prob)
        # Calculate AUC (Area Under the Curve)
        self.roc_auc = auc(self.fpr, self.tpr)
        # Absolute difference between prediction and label, 0 if correct
        diff = np.abs(self.prediction - np.ravel(self.test_labels))
        # Ratio of wrongly predicted samples, stored for the metadata
        self.acc = str(np.count_nonzero(diff)) + '/' + str(len(diff))
        self.mae = round(np.mean(diff), 2)
        print('Mean absolute error: ' + str(self.mae))
        print('Wrongly predicted: '
              + str(np.count_nonzero(diff))
              + '/' + str(len(diff)))
        self.mse = mean_squared_error(self.test_labels, self.prediction)
        self.f1 = f1_score(self.test_labels, self.prediction)
        # fbeta with beta=2 weights recall higher than precision
        self.fbeta = fbeta_score(self.test_labels, self.prediction, beta=2)
        print('Mean squared error: ' + str(self.mse))
def create_output_dir(self):
"""
Define and create the output directory.
"""
self.output_dir = self.model_dir + self.model_to_save
        os.makedirs(self.output_dir, exist_ok=True)
def save_model(self):
"""
Save the Random Forest Classifier model.
"""
with open(self.output_dir + '/saved_model.pkl', 'wb') as file:
pkl.dump(self.model, file)
self.logger.info('Model is saved')
def save_parameters(self):
"""
Save the metadata associated with the prediction.
"""
params = {'Area': settings.bounding_box,
'criterion': [self.criterion],
'n_estimators': [self.n_estimators],
'max_depth': [self.max_depth],
'features': self.feature_list,
'mse': self.mse,
'mae': self.mae,
'f1': self.f1,
'roc_threshold': self.thresholds,
'roc_fpr': self.fpr,
'roc_tpr': self.tpr,
'roc_auc': self.roc_auc,
'accuracy': self.acc,
'fbeta': self.fbeta
}
with open(settings.model_database_dir
+ self.model_to_save
+ '/model_params.pkl', 'wb') as file:
pkl.dump(params, file)
self.logger.info('Parameters are saved')
    def load_model(self):
        """
        Load the Random Forest Classifier model and the metadata.
        Make sure to compare features of training and prediction dataset
        as well as their order.
        """
        print('Loading model ' + self.model_dir
              + self.model_to_load + '/saved_model.pkl')
        self.logger.info('Loading model ' + self.model_dir
                         + self.model_to_load + '/saved_model.pkl')
        with open(self.model_dir
                  + self.model_to_load + '/saved_model.pkl', 'rb') as file:
            self.model = pkl.load(file)
        # Load the metadata of the model to load (not the model to save)
        with open(settings.model_database_dir
                  + self.model_to_load
                  + '/model_params.pkl', 'rb') as f:
            params = pkl.load(f)
        # Features the model was trained on; compare these against the
        # prediction dataset to catch missing or reordered columns
        features = params['features']
        self.logger.info('Model loaded from '
                         + self.model_dir
                         + self.model_to_load)
        print('Model loaded from '
              + self.model_dir
              + self.model_to_load)
def save_prediction(self):
"""
Save the prediction.
"""
if self.aim == 'prediction':
output_dir = self.model_dir + self.model_to_load
self.xy.to_csv(output_dir + '/prediction_results.csv',
columns=['xcoord', 'ycoord', 'pred'],
index=True)
self.df_pos.to_csv(output_dir + '/pos_prediction_results.csv',
columns=['xcoord', 'ycoord', 'pred'],
index=True)
self.df_neg.to_csv(output_dir + '/neg_prediction_results.csv',
columns=['xcoord', 'ycoord', 'pred'],
index=True)
print('Predictions saved in ' + output_dir)
self.logger.info('Prediction saved in ' + output_dir)
def reshape_prediction(self):
"""
Reshape the individual predictions into a map.
"""
arr_xy = np.array(self.xy)
        # Mask the dropped (no-data) pixels with the no-data value
        arr_xy[self.dropped, :] = settings.no_value
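        # The reshape assumes the points form a complete regular grid, so
        # the counts of unique y- and x-coordinates give the raster shape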
result = np.reshape(list(arr_xy[:, 2]),
(len(list(set(self.xy['ycoord']))),
len(list(set(self.xy['xcoord'])))))
self.logger.info('Prediction is reshaped into the final map')
self.save_prediction_as_nc(result)
def extract_pos_neg_predictions(self):
"""
Distinguish between the classes of the Classifier.
"""
print('Extract pos and neg predictions...')
self.logger.info('Extract positive and negative predictions...')
self.xy['pred'] = self.prediction
self.df_pos = self.xy[self.xy.pred == 1]
self.df_neg = self.xy[self.xy.pred == 0]
def save_prediction_as_nc(self, prediction):
"""
Save the hazard map to a netCDF4 file.
"""
print('Saving as nc-File')
outfile_name = self.model_dir + self.model_to_load + '/prediction.nc'
if os.path.exists(outfile_name):
os.remove(outfile_name)
generate_ncfile(outfile_name,
np.array(list(set(self.xy['xcoord']))),
np.array(list(set(self.xy['ycoord']))),
prediction,
crs=settings.crs,
missing_value=settings.no_value)
self.logger.info('Map is saved as nc-file')
def feature_importance(self):
"""
Access feature importance information from the Random Forest.
"""
feature_imp = pd.Series(self.model.feature_importances_,
index=self.feature_list).sort_values(
ascending=False)
        # feature_importance runs in the train_test branch, so store the
        # result with the freshly trained model in output_dir
        feature_imp.to_csv(self.output_dir
                           + '/feature_importance.csv')
self.logger.info('Feature importance is saved')
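
# A minimal usage sketch, not part of the original module: it assumes a
# settings module configured as above, uses the standard logging library,
# and relies on the assumed class name RandomForest (see the note at the
# class definition). It shows the two supported aims in sequence.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger('RandomForest')

    # Train, evaluate and store a model on the training dataset, ...
    RandomForest('train_test', parallel=False, log=log)
    # ... then load the stored model and generate the hazard map
    RandomForest('prediction', parallel=True, log=log)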