diff --git a/src/plain_scripts/RandomForest.py b/src/plain_scripts/RandomForest.py index 02d800230a9fba9b742f0aa04b7d3119fdcf2c0b..edfed30702c5f19691b48cace7e39a841b8bdb09 100644 --- a/src/plain_scripts/RandomForest.py +++ b/src/plain_scripts/RandomForest.py @@ -25,12 +25,12 @@ class prepare_data: used in the Random Forest classifier. """ - def __init__(self, aim, logger, retrain): + def __init__(self, aim, logger): invalid = False self.aim = aim self.logger = logger - self.retrain = retrain + if aim == 'train_test': print('Train the model') invalid = False @@ -53,7 +53,7 @@ class prepare_data: self.split_training_testing() elif aim == 'prediction': self.import_features() # Import prediction dataset - + def import_features(self): """ @@ -67,61 +67,29 @@ class prepare_data: else: path_pred = settings.path_pred - if path_pred.split('.')[-1] == 'csv': - self.features = pd.read_csv(path_pred) - - elif path_pred.split('.')[-1] == 'nc': - ds = nc.Dataset(path_pred) - pred = ds['Result'][:, :].data - pred_features = ds['features'][:].data - self.feature_list = char_to_string(pred_features) - - if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list: - self.features = pd.DataFrame(pred, columns=self.feature_list) - else: - self.features = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list) - - self.dropped = ds['Dropped'][:].data - self.dropped = [int(x) for x in self.dropped] + ds = nc.Dataset(path_pred) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.feature_list = char_to_string(pred_features) + self.features = pd.DataFrame(pred, columns=self.feature_list) + self.dropped = ds['Dropped'][:].data + self.dropped = [int(x) for x in self.dropped] + # Save the prediction coordinates in the prediction dataset self.xy['ycoord'] = self.features['ycoord'] self.xy['xcoord'] = self.features['xcoord'] - - # Remove all features that shall not be included in - # prediction from DataFrame (see settings!) 
- if len(settings.not_included_pred_data) > 0: - for dataset in settings.not_included_pred_data: - self.features = self.features.drop(dataset, axis=1) - - # Determine which classes are contained in the categorical features - # It is distinguished between one-hot and ordinal encoded features - self.categorical_classes = {} - cat_subset = [feat for feat in self.features.columns.tolist() if '_encoded' in feat] - df_sub = self.features[cat_subset] - cat_feat = ['_'.join(col.split('_')[:len(col.split('_'))-1]) for col in df_sub.columns.tolist()] - self.distibuish_encoding = {} - for feat in list(set(cat_feat)): - classes = [] - if cat_feat.count(feat)>1: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - self.distibuish_encoding[feat] = 'ohe' - else: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - self.distibuish_encoding[feat] = 'ordinal' - self.categorical_classes[feat] = {} - self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] - self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) - + + self.features = self.features.drop(['xcoord', 'ycoord'], axis=1) self.feature_list = list(self.features.columns) - self.features_org = self.features.copy() + self.features = np.array(self.features) self.logger.info('Features for prediction were imported') self.logger.info('The following ' + str(len(self.feature_list)) + ' features are included in the prediction dataset: ' + str(self.feature_list)) - + def import_features_labels(self): """ @@ -133,6 +101,7 @@ class prepare_data: self.features = pd.read_csv(settings.path_train + 'training.csv') else: self.features = pd.read_csv(settings.path_train) + # Extract and remove labels from training dataset self.labels = np.array(self.features[self.label_name]).reshape( [np.shape(self.features[self.label_name])[0], 1]) @@ -142,50 +111,17 @@ class prepare_data: self.xy['ycoord'] = self.features['ycoord'] self.xy['xcoord'] = self.features['xcoord'] - # Drop ID from training data - self.features = self.features.drop('ID', axis=1) - self.features = self.features.drop(['xcoord', 'ycoord'], axis=1) - - # Remove all features that shall not be included in - # training from DataFrame (see settings!) 
- - if self.retrain: - features_to_remove = pd.read_csv(settings.path_ml + settings.model_to_save + '/feature_mismatch_training.csv')['to_drop'].to_list() - not_included_train_data = settings.not_included_train_data + features_to_remove - else: - not_included_train_data = settings.not_included_train_data - - - if len(not_included_train_data) > 0: - for dataset in not_included_train_data: - self.features = self.features.drop(dataset, axis=1) - - # Determine which classes are contained in the categorical features - # It is distinguished between one-hot and ordinal encoded features - self.categorical_classes = {} - cat_subset = [feat for feat in self.features.columns.tolist() if '_encoded' in feat] - df_sub = self.features[cat_subset] - cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()] - for feat in list(set(cat_feat)): - classes = [] - if cat_feat.count(feat)>1: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - else: - classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) - self.categorical_classes[feat] = {} - self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] - self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) - - + self.features = self.features.drop(['xcoord', 'ycoord', 'ID'], axis=1) self.feature_list = list(self.features.columns) + self.features = np.array(self.features) + self.logger.info('Features for training were imported') self.logger.info('The following ' + str(len(self.feature_list)) + ' features are included in the training dataset: ' + str(self.feature_list)) - self.features = np.array(self.features) def split_training_testing(self): - + """ Splits the training data into training and validation data. 
""" @@ -196,33 +132,31 @@ class prepare_data: test_size=self.test_size, random_state=settings.random_seed, stratify=self.labels) + print('Data split') self.logger.info('Training data split in training and test dataset') - - + + class RandomForest(prepare_data): - def __init__(self, aim, parallel=False, log=None, retrain=None): + def __init__(self, aim, parallel=False, log=None): - super().__init__(aim, log, retrain) + super().__init__(aim, log) self.aim = aim self.parallel = parallel - self.retrain = retrain + self.logger = log self.num_chunks = 10 + # Random Forest settings self.criterion = settings.criterion self.n_estimators = settings.num_trees self.max_depth = settings.depth self.model_dir = settings.model_database_dir - if self.retrain: - self.model_to_load = settings.model_to_load + '_retrain' - self.model_to_save = settings.model_to_save + '_retrain' - else: - self.model_to_load = settings.model_to_load - self.model_to_save = settings.model_to_save + self.model_to_load = settings.model_to_load + self.model_to_save = settings.model_to_save self.output_dir = None if aim == 'train_test': @@ -240,11 +174,10 @@ class RandomForest(prepare_data): print('Prediction is performed') self.create_output_dir() self.load_model() - if not self.error: - self.predict() - self.extract_pos_neg_predictions() - self.reshape_prediction() - self.save_prediction() + self.predict() + self.extract_pos_neg_predictions() + self.reshape_prediction() + self.save_prediction() def define(self): @@ -375,8 +308,7 @@ class RandomForest(prepare_data): 'roc_tpr': self.tpr, 'roc_auc': self.roc_auc, 'accuracy': self.acc, - 'fbeta': self.fbeta, - 'categories': self.categorical_classes + 'fbeta': self.fbeta } with open(settings.model_database_dir @@ -386,77 +318,6 @@ class RandomForest(prepare_data): self.logger.info('Parameters are saved') - def adapt_categorical_features(self, train_classes, training_features): - - """ - The encoded features in the training and prediction dataset are - compared regarding the contained classes. Depending on the user - input, instances in the prediction dataset with classes that are - not included in the training dataset are either set to no_value or - nevertheless considered in the prediction. 
The surplus additional - features are removed either way to achieve the same set of features - as in the training dataset - """ - - self.instances_to_drop = [] - self.features_not_in_training = [] - - for feat in [val for val in training_features if '_encode' in val]: - if feat not in self.feature_list: - print('Error: cannot proceed with mapping') - print('Error: Categorical feature ' + feat + ' not in prediction dataset') - self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset') - - self.error = True - self.retrain = True - self.features_not_in_training.append(feat) - - if len(self.features_not_in_training) > 0: - pd.DataFrame(self.features_not_in_training, columns=['to_drop']).to_csv(self.model_dir + self.model_to_load + 'feature_mismatch_training.csv', index=False) - - if not self.retrain: - if list(set([val for val in training_features if '_encode' in val])) != list(set(self.feature_list)): - for feat in list(set(['_'.join(val.split('_')[:-2]) for val in self.feature_list if '_encode' in val])): - if feat in list(self.distibuish_encoding.keys()): - if self.distibuish_encoding[feat] == 'ohe': - if (train_classes[feat]['num_cols'] < self.categorical_classes[feat]['num_cols']) or (set(train_classes[feat]['classes']) != set(self.categorical_classes[feat]['classes'])): - print(feat + ': Prediction dataset contains more or other classes than training dataset') - - self.logger.warning(feat + ': Prediction dataset contains more classes than training dataset') - self.logger.info('Apply user defined handling approach') - - common_elements = set(train_classes[feat]['classes']).intersection(set(self.categorical_classes[feat]['classes'])) - - if self.properties_map['keep']: - if len(common_elements) == 0: - print('Error: no common classes for ' + feat + ' in training and prediction dataset') - self.logger.error('Error: no common classes for ' + feat + ' in training and prediction dataset') - self.error = True - else: - to_drop = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] - self.features = self.features.drop(to_drop, axis=1) - self.feature_list = self.features.columns.tolist() - elif self.properties_map['remove_instances']: - to_drop_col = [feat + '_' + str(f) + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] - to_drop_row = [] - for col in to_drop_col: - to_drop_row = to_drop_row + self.features.index[self.features[col] == 1].tolist() - self.features = self.features.drop(to_drop_col, axis=1) - - print('Not matching features have been removed') - self.logger.info('Not matching features have been removed') - - self.feature_list = self.features.columns.tolist() - self.instances_to_drop = self.instances_to_drop + to_drop_row - - print('Instances to consider during mapping have been adapted') - self.logger.info('Instances to consider during mapping have been adapted') - - print('Categorical features have been handled and hamonised') - self.logger.info('Categorical features have been handled and hamonised') - self.logger.info('Remaining features: ' + str(self.feature_list)) - - def load_model(self): """ @@ -478,63 +339,7 @@ class RandomForest(prepare_data): + '/model_params.pkl', 'rb') as f: params = pkl.load(f) features = params['features'] - self.error = False - self.adapt_categorical_features(params['categories'], features) - - if not self.error: - if len(self.feature_list) == len(features): - if set(self.feature_list) != set(features): - - print('Error: Not 
all features of the model are contained in the prediction dataset') - self.logger.error('Error: Not all features of the model are contained in the prediction dataset') - - self.error = True - elif self.feature_list != features: - - print('The order or features differs. Prediction features are reordered') - self.logger.info('The order or features differs. Prediction features are reordered') - - self.features = self.features[features] - if self.features.columns.tolist() != features: - print('There is still something wrong with the order of the features!') - - elif self.feature_list == features: - - print('Prediction and training dataset have the same order') - self.logger.info('Prediction and training dataset have the same order') - elif len(self.feature_list) < len(features): - - print('Error: Not all features of the model are contained in the prediction dataset') - self.logger.error('Error: Not all features of the model are contained in the prediction dataset') - - self.error = True - elif len(self.feature_list) > len(features): - if set(features).issubset(self.feature_list): - to_drop = list(set(self.feature_list)-set(features)) - self.features = self.features.drop(to_drop, axis=1) - self.feature = self.features[features] - if self.features.columns.tolist() != features: - print('There is still something wrong with the order of the features!') - self.error = True - else: - print('Features in the prediction dataset which were not used for training were removed') - print('Features in the prediction dataset were sorted to match the training features') - - self.logger.warning('Features in the prediction dataset which were not used for training were removed') - self.logger.info('Features left: ' + str(self.feature_list)) - self.logger.info('Features in the prediction dataset were sorted to match the training features') - else: - Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( - row=self.row, column=1) - self.row = self.row + 1 - self.master.update() - - self.logger.error('Error: Not all features of the model are contained in the prediction dataset') - - self.error = True - if not self.error: - self.feature_list = self.features.columns.tolist() - self.features = self.features.to_numpy() + self.logger.info('Model loaded from ' + self.model_dir @@ -572,9 +377,8 @@ class RandomForest(prepare_data): Reshape the individual predictions into a map. 
""" - dropped = list(set(self.dropped + self.instances_to_drop)) arr_xy = np.array(self.xy) - arr_xy[dropped, :] = settings.no_value#*np.shape(arr_xy)[1] + arr_xy[self.dropped, :] = settings.no_value#*np.shape(arr_xy)[1] result = np.reshape(list(arr_xy[:, 2]), (len(list(set(self.xy['ycoord']))), diff --git a/src/plain_scripts/check_user_input.py b/src/plain_scripts/check_user_input.py index 768d71cb640e50abb37180767bd2925342d63fe3..dd336ba013567b5e15cf7391ccea90cd88e0b487 100644 --- a/src/plain_scripts/check_user_input.py +++ b/src/plain_scripts/check_user_input.py @@ -14,16 +14,16 @@ class check_general_settings(): if training_dataset or map_generation: if os.path.isdir(path_train): - save_path = path_train + 'check_user_input.log' + save_path = path_train + '/check_user_input.log' else: - save_path = os.path.dirname(path_train) + 'check_user_input.log' + save_path = os.path.dirname(path_train) + '/check_user_input.log' elif prediction_dataset: if os.path.isdir(path_pred): - save_path = path_pred + 'check_user_input.log' + save_path = path_pred + '/check_user_input.log' else: - save_path = os.path.dirname(path_pred) + 'check_user_input.log' + save_path = os.path.dirname(path_pred) + '/check_user_input.log' else: - save_path = 'check_user_input.log' + save_path = '/check_user_input.log' if os.path.exists(save_path): os.remove(save_path) diff --git a/src/plain_scripts/compatibility_of_input_datasets.py b/src/plain_scripts/compatibility_of_input_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..255869e669b384d04b44030b4381f7f7b65795ef --- /dev/null +++ b/src/plain_scripts/compatibility_of_input_datasets.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jan 29 13:20:59 2025 + +@author: aedrich +""" + +import numpy as np +import pandas as pd +import netCDF4 as nc +import pickle as pkl +import os +import logging +import settings +import re + +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score +from joblib import delayed, Parallel +from tkinter import Label + +from utilities.ncfile_generation import generate_basic_ncfile +from utilities.strings_for_ncfile import char_to_string, features_to_char + + +class comparison_training_prediction_dataset: + + def __init__(self, logger): + + self.logger = logger + self.error = False + + self.import_prediction_dataset() + self.import_training_dataset() + self.compare_features() + if not self.error: + self.additional_instances_to_drop() + self.save_prediction_dataset() + self.save_training_dataset() + + def import_prediction_dataset(self): + + ds = nc.Dataset(settings.path_pred) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.feature_list = char_to_string(pred_features) + + if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list: + self.pred = pd.DataFrame(pred, columns=self.feature_list) + else: + self.pred = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list) + + self.xy = pd.DataFrame() + self.xy['ycoord'] = self.pred['ycoord'] + self.xy['xcoord'] = self.pred['xcoord'] + + self.idx = ds['Dropped'][:].data + self.idx = [int(x) for x in self.idx] + + if len(settings.not_included_pred_data) > 0: + for dataset in settings.not_included_pred_data: + if dataset in self.pred.columns.tolist(): + self.pred = self.pred.drop(dataset, axis=1) + + self.logger.info('Prediction dataset imported') + 
diff --git a/src/plain_scripts/compatibility_of_input_datasets.py b/src/plain_scripts/compatibility_of_input_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..255869e669b384d04b44030b4381f7f7b65795ef
--- /dev/null
+++ b/src/plain_scripts/compatibility_of_input_datasets.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jan 29 13:20:59 2025
+
+@author: aedrich
+"""
+
+import numpy as np
+import pandas as pd
+import netCDF4 as nc
+import pickle as pkl
+import os
+import logging
+import settings
+import re
+
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score
+from joblib import delayed, Parallel
+from tkinter import Label
+
+from utilities.ncfile_generation import generate_basic_ncfile
+from utilities.strings_for_ncfile import char_to_string, features_to_char
+
+
+class comparison_training_prediction_dataset:
+
+    def __init__(self, logger):
+
+        self.logger = logger
+        self.error = False
+
+        self.import_prediction_dataset()
+        self.import_training_dataset()
+        self.compare_features()
+        if not self.error:
+            self.additional_instances_to_drop()
+            self.save_prediction_dataset()
+            self.save_training_dataset()
+
+    def import_prediction_dataset(self):
+
+        ds = nc.Dataset(settings.path_pred)
+        pred = ds['Result'][:, :].data
+        pred_features = ds['features'][:].data
+        self.feature_list = char_to_string(pred_features)
+
+        if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list:
+            self.pred = pd.DataFrame(pred, columns=self.feature_list)
+        else:
+            self.pred = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list)
+
+        self.xy = pd.DataFrame()
+        self.xy['ycoord'] = self.pred['ycoord']
+        self.xy['xcoord'] = self.pred['xcoord']
+
+        self.idx = ds['Dropped'][:].data
+        self.idx = [int(x) for x in self.idx]
+
+        if len(settings.not_included_pred_data) > 0:
+            for dataset in settings.not_included_pred_data:
+                if dataset in self.pred.columns.tolist():
+                    self.pred = self.pred.drop(dataset, axis=1)
+
+        self.logger.info('Prediction dataset imported')
+        self.logger.info('The following ' + str(len(self.pred.columns.tolist()))
+                         + ' features are included in the prediction dataset: '
+                         + str(self.pred.columns.tolist()))
+
+    def import_training_dataset(self):
+
+        # Import training dataset as csv file
+        self.train = pd.read_csv(settings.path_train)
+        # Extract and remove labels from training dataset
+        self.labels = np.array(
+            self.train['label']).reshape(
+                [np.shape(self.train['label'])[0], 1])
+
+        self.xy_train = pd.DataFrame()
+        self.xy_train['ID'] = self.train['ID']
+        self.xy_train['label'] = self.train['label']
+        self.xy_train['ycoord'] = self.train['ycoord']
+        self.xy_train['xcoord'] = self.train['xcoord']
+
+        self.train = self.train.drop(['xcoord', 'ycoord', 'ID', 'label'], axis=1)
+
+        if len(settings.not_included_train_data) > 0:
+            for dataset in settings.not_included_train_data:
+                if dataset in self.train.columns.tolist():
+                    self.train = self.train.drop(dataset, axis=1)
+
+        self.logger.info('Training dataset imported')
+        self.logger.info('The following ' + str(len(self.train.columns.tolist()))
+                         + ' features are included in the training dataset: '
+                         + str(self.train.columns.tolist()))
+
+    def compare_features(self):
+
+        """
+        It is assessed if all features in the training dataset also appear
+        in the prediction dataset. If that is not the case, the features
+        that are not contained in the prediction dataset are removed from
+        the training dataset and training is launched on this adapted
+        training dataset, so that both datasets share an identical set of
+        features.
+
+        If more features appear in the prediction dataset, the additional
+        features are removed.
+
+        """
+
+        self.logger.info('Features are compared between training and prediction dataset')
+
+        if set(self.train.columns) == set(self.pred.columns):
+            self.logger.info('Features are identical in both training and prediction dataset')
+            self.pred = self.pred[self.train.columns]
+
+            self.logger.info('Potentially varying order of features has been fixed')
+            self.error = False
+
+        else:
+            self.logger.warning('Features are not identical in the training and prediction dataset')
+
+            extra_in_pred = set(self.pred.columns) - set(self.train.columns)
+            extra_in_train = set(self.train.columns) - set(self.pred.columns)
+
+            if len(extra_in_pred) > 0 and len(extra_in_train) == 0:
+                self.logger.warning('More features in prediction dataset, additional features are removed')
+
+                self.pred = self.pred[self.train.columns]
+                self.error = False
+
+            elif len(extra_in_train) > 0 and len(extra_in_pred) == 0:
+                self.logger.warning('More features in training dataset, additional features are removed')
+
+                self.train = self.train[self.pred.columns]
+                self.error = False
+
+            elif len(extra_in_train) > 0 and len(extra_in_pred) > 0:
+                self.logger.warning('There are mismatching features in both datasets')
+
+                self.common_columns = self.train.columns.intersection(self.pred.columns)
+
+                if len(self.common_columns.tolist()) == 0:
+                    self.logger.error('Error: No common columns in training and prediction dataset')
+                    self.error = True
+
+                elif len(self.common_columns.tolist()) < 6:
+                    self.logger.warning('Warning: only ' + str(len(self.common_columns.tolist())) + ' common columns in training and prediction dataset')
+                    self.error = False
+
+                    self.train = self.train[self.common_columns]
+                    self.pred = self.pred[self.common_columns]
+
+                else:
+                    self.logger.info(str(len(self.common_columns.tolist())) + ' common columns in training and prediction dataset')
+                    self.error = False
+
+                    self.train = self.train[self.common_columns]
+                    self.pred = self.pred[self.common_columns]
+            else:
+                self.logger.error('Error: Unknown issue detected. Check features manually!')
+                self.error = True
+
+        self.logger.info('Feature comparison completed')
+
+    def additional_instances_to_drop(self):
+
+        """
+        All instances that have a value of zero in all columns of a categorical
+        feature are identified and appended to the list of instances for which
+        a reliable prediction is not possible.
+
+        Input:
+            idx: Previously identified instances for which prediction is not
+                 possible, list
+            pred: prediction dataset, pandas DataFrame
+
+        Output:
+            idx: Updated list of instances for which prediction is not
+                 possible, list
+
+        """
+
+        self.logger.info('Start identification of instances that are not represented by at least one categorical feature')
+
+        columns = self.pred.columns
+        # Regular expression to match "<feature>_<value>_encoded"
+        pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")
+        encoded_features = {pattern.match(col).group(1) for col in columns if pattern.match(col)}
+
+        self.logger.info('Identified encoded features: ' + str(encoded_features))
+        count = 0
+        for feature in encoded_features:
+
+            feature_cols = [col for col in self.pred.columns if col.startswith(feature) and col.endswith("_encoded")]
+            all_zero_rows = (self.pred[feature_cols] == 0).all(axis=1)
+            all_zero_rows = self.pred.index[all_zero_rows].tolist()
+            self.idx = list(set(self.idx + all_zero_rows))
+            count = count + len(all_zero_rows)
+
+        self.logger.info(str(count) + ' instances have been identified that are not represented by at least one categorical feature')
+
+    def save_prediction_dataset(self):
+
+        """
+        Save prediction dataset and information on dropped rows as nc-file
+        """
+
+        self.pred = pd.concat([self.xy, self.pred], axis=1)
+        pred = self.pred.to_numpy()
+        char_features = features_to_char(self.pred.columns)
+
+        outfile = settings.path_pred
+        self.logger.info('Prediction dataset is saved to ' + outfile)
+
+        if os.path.exists(outfile):
+            os.remove(outfile)
+
+        ds = generate_basic_ncfile(outfile, crs=None)
+        ds.createDimension('lat', (np.shape(pred)[0]))
+        ds.createDimension('lon', (np.shape(pred)[1]))
+        ds.createDimension('ix', (len(self.idx)))
+        ds.createDimension('feat', len(char_features))
+        result = ds.createVariable('Result', 'f4', ('lat', 'lon'))
+        dropped = ds.createVariable('Dropped', 'u8', 'ix')
+        Features = ds.createVariable('features', 'S1', 'feat')
+        result[:, :] = pred
+        dropped[:] = np.array(self.idx)
+        Features[:] = char_features
+        ds.close()
+
+    def save_training_dataset(self):
+
+        """
+        Save dataframe as csv.
+        """
+
+        self.logger.info('Saving of training data in progress')
+
+        outfile = settings.path_train
+
+        # If outfile exists already, delete
+        if os.path.exists(outfile):
+            os.remove(outfile)
+
+        self.train = pd.concat([self.xy_train, self.train], axis=1)
+
+        # Save dataframe as csv
+        self.train.to_csv(outfile, sep=',', index=False)
+        self.logger.info('Training dataset saved')
+
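The instance check in additional_instances_to_drop hinges on the regular expression ^(.*?)(_?\d+)?_encoded$, which strips an optional class suffix from encoded column names, plus an all-zero test across each feature's columns. A self-contained sketch with toy column names, assumed to follow the <feature>_<class>_encoded convention used in the patch:

import re
import pandas as pd

pattern = re.compile(r"^(.*?)(_?\d+)?_encoded$")

columns = ['geology_1_encoded', 'geology_2_encoded', 'landuse_encoded', 'slope']
encoded_features = {pattern.match(c).group(1) for c in columns if pattern.match(c)}
assert encoded_features == {'geology', 'landuse'}

# Rows in which every one-hot column of a feature is zero belong to no class
# seen in training; they are flagged as undroppable-to-predict instances.
df = pd.DataFrame({'geology_1_encoded': [1, 0], 'geology_2_encoded': [0, 0]})
all_zero = (df == 0).all(axis=1)
assert df.index[all_zero].tolist() == [1]
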
diff --git a/src/plain_scripts/settings copy.py b/src/plain_scripts/settings copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..30ee799a15006c6db9168f649d2eb7b75b79ca15
--- /dev/null
+++ b/src/plain_scripts/settings copy.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+    This is a template file for settings.py
+    Either duplicate and rename or fill out and rename.
+    More information on the individual meaning and what to consider can be
+    found in the user manual
+"""
+
+import logging
+import json
+import types
+
+def export_variables(logger):
+
+    variables = globals()
+    # Filter out non-serializable objects
+    defined_vars = {}
+    for k, v in variables.items():
+        if not k.startswith('__') and not callable(v) and not isinstance(v, types.ModuleType):
+            try:
+                # Test if the value is JSON serializable
+                json.dumps(v)
+                defined_vars[k] = v
+            except (TypeError, OverflowError):
+                # Skip non-serializable values
+                pass
+    # Convert the dictionary to a JSON string
+    vars_json = json.dumps(defined_vars, indent=4)
+    logger.info("Exported variables: %s", vars_json)
+
+# Mandatory parameters
+days = 2
+approach = 'statistical'
+
+# Steps
+training_dataset = False # Boolean, if training dataset shall be created
+preprocessing = 'no_interpolation' # Defines preprocessing approach: 'cluster', 'interpolation', 'no_interpolation'
+train_from_scratch = True
+train_delete = None
+
+prediction_dataset = False # Boolean, if prediction dataset shall be created
+pred_from_scratch = True
+pred_delete = None
+
+map_generation = True # Boolean, if mapping shall be performed
+
+# General
+
+crs = 'wgs84' # Coordinate reference system, string
+no_value = -999 # No data value, integer, suggestion -999
+random_seed = 42 # Random seed, integer
+resolution = 25 # Resolution in m of the final map, integer, all datasets will be interpolated to this resolution
+path_ml = '/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/' # Path to where shire framework related parameters/files will be stored
+data_summary_path = None # Path to the data summary file, string, relevant only for training/prediction dataset generation
+key_to_include_path = None # Path to keys_to_include file, string, relevant only for training/prediction dataset generation
+
+# Training dataset generation
+
+size = None # Size of the validation dataset, float number between 0 and 1
+path_train = f'/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/training_datasets/{days}/training_statistical_{days}d.csv' # Path to where the training dataset is/shall be stored
+ohe = None # One-hot encoding, bool
+
+path_landslide_database = None # Path to where the landslide database is stored, string
+ID = 'ID' # Name of the column containing landslide ID, string
+landslide_database_x = 'xcoord' # Name of the column containing longitude values, string
+landslide_database_y = 'ycoord' # Name of the column containing latitude values, string
+
+path_nonls_locations = None # Path to where the non-landslide database is stored, string
+num_nonls = None # Number of non-landslide locations to include in the training dataset, integer
+nonls_database_x = None # Name of the column containing longitude values, string
+nonls_database_y = None # Name of the column containing latitude values, string
+
+#cluster = False # Use clustering for training dataset generation, bool
+#interpolation = False # Use interpolation for training dataset generation, bool
+
+# Prediction dataset generation
+
+bounding_box = None # Coordinates of the edges of the bounding box of the area of interest, list, [<ymax>, <ymin>, <xmin>, <xmax>]
+path_pred = None # Path to directory where the prediction dataset is/shall be stored
+
+# Map generation
+
+RF_training = True # Train the RF, bool
+RF_prediction = True # Make a prediction using the RF, bool
+
+not_included_pred_data = ['xcoord', 'ycoord'] # List of features in the prediction dataset not to be considered in prediction
+not_included_train_data = [] # List of features in the training dataset not to be considered in model training
+
+num_trees = 100 # Number of trees in the Random Forest, integer
+criterion = 'gini' # Criterion for the Random Forest, string
+depth = 20 # Maximum depth of the trees in the Random Forest, integer
+
+model_to_save = f'/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}' # Folder name for storage of the RF results, string
+model_to_load = f'/Volumes/LaCie/2nd_Paper/entire_swiss_for_paper/maps/{approach}/RF_{days}' # Folder where RF model is stored, string, identical to model_to_save if training and prediction are done at the same time
+model_database_dir = path_ml # Directory where models should be stored
+parallel = True # Boolean, true if prediction data shall be split to predict in parallel
+
+keep_cat_features = False # bool, true if categorical features shall be kept even if some instances in the prediction dataset have classes not covered by the training dataset
+remove_instances = True # bool, true if instances in the prediction dataset shall be removed if they have different classes than the instances in the training dataset
\ No newline at end of file
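The path entries in this template rely on f-string interpolation, so days and approach must be assigned before any path that embeds them; a plain string containing {days} would be written out literally. A toy check of the pattern, with a placeholder path rather than the template's own:

days = 2
approach = 'statistical'

# f-strings evaluate at definition time, so the values above are baked in here.
model_to_save = f'/some/path/maps/{approach}/RF_{days}'
assert model_to_save == '/some/path/maps/statistical/RF_2'
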
diff --git a/src/plain_scripts/shire.py b/src/plain_scripts/shire.py
index a9c19b0e1fc9cea998a9aee2f6d325e7df3fea53..c4a97250872ec7f32ed58b442d24f1925906cf35 100644
--- a/src/plain_scripts/shire.py
+++ b/src/plain_scripts/shire.py
@@ -9,6 +9,8 @@ from create_training_data import create_training_data
 from create_prediction_data import create_prediction_data
 from RandomForest import RandomForest
 from check_user_input import check_general_settings
+from compatibility_of_input_datasets import comparison_training_prediction_dataset
+
 from utilities.initialise_log import save_log

 """
@@ -78,20 +80,26 @@ else:
     print('Map will be generated')
     logger.info('Map generation started')

-    if settings.parallel:
-        print('Prediction will run in parallel')
-        logger.info('Prediction will run in parallel')
-    if settings.RF_training:
-        logger.info('Random Forest training is launched')
-        s = RandomForest('train_test', parallel=settings.parallel, log=logger)
-        logger = s.logger
-    if settings.RF_prediction:
-        logger.info('Random Forest prediction in launched')
-        s = RandomForest('prediction', parallel=settings.parallel, log=logger)
-        logger = s.logger
-
-    print('Map successfully created')
-    logger.info('Map successfully created')
+    print('Training and prediction dataset will be assessed for compatibility')
+    logger.info('Training and prediction dataset will be assessed for compatibility')
+
+    s = comparison_training_prediction_dataset(logger)
+
+    if not s.error:
+        if settings.parallel:
+            print('Prediction will run in parallel')
+            logger.info('Prediction will run in parallel')
+        if settings.RF_training:
+            logger.info('Random Forest training is launched')
+            s = RandomForest('train_test', parallel=settings.parallel, log=logger)
+            logger = s.logger
+        if settings.RF_prediction:
+            logger.info('Random Forest prediction is launched')
+            s = RandomForest('prediction', parallel=settings.parallel, log=logger)
+            logger = s.logger
+
+        print('Map successfully created')
+        logger.info('Map successfully created')

 for handler in logger.handlers:
     handler.close()
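Taken together, the revised shire.py runs the compatibility check first and gates both Random Forest stages on its error flag. A minimal sketch of that control flow, assuming settings and a configured logger as in the patch; the explicit log message in the else branch is illustrative and not part of the patch:

import logging

import settings
from RandomForest import RandomForest
from compatibility_of_input_datasets import comparison_training_prediction_dataset

logger = logging.getLogger('shire')

# Harmonise training and prediction datasets before any model is touched.
s = comparison_training_prediction_dataset(logger)

if not s.error:
    if settings.RF_training:
        # Train on the harmonised training dataset written by the check
        s = RandomForest('train_test', parallel=settings.parallel, log=logger)
        logger = s.logger
    if settings.RF_prediction:
        # Predict on the harmonised prediction dataset
        s = RandomForest('prediction', parallel=settings.parallel, log=logger)
        logger = s.logger
    logger.info('Map successfully created')
else:
    logger.error('Datasets are incompatible, map generation skipped')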