diff --git a/data/data_summary_template.csv b/data/data_summary_template.csv index 66bc3e99dfa2fc662ff61937efbff01c53d851d4..dec8ef78aa03acb7bf2d200dddd797bbe2114a31 100755 --- a/data/data_summary_template.csv +++ b/data/data_summary_template.csv @@ -1 +1 @@ -path,keys,no_value,continuous \ No newline at end of file +path,keys,no_value,categorical \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..04eaba7ac641511286ed4e9fde2cac7214a6f241 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,32 @@ +affine==2.3.1 +attrs==22.2.0 +certifi==2021.5.30 +cftime==1.6.0 +click==8.0.4 +click-plugins==1.1.1 +cligj==0.7.2 +cycler==0.11.0 +importlib-metadata==4.8.3 +importlib-resources==5.4.0 +joblib==1.1.1 +kiwisolver==1.3.1 +LatLon23==1.0.7 +matplotlib==3.3.4 +netCDF4==1.6.2 +numpy==1.19.5 +pandas==1.1.5 +Pillow==8.4.0 +pyparsing==3.1.1 +pyproj==3.0.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +rasterio==1.2.10 +scikit-learn==0.24.2 +scipy==1.5.4 +six==1.16.0 +sklearn==0.0 +snuggs==1.4.7 +threadpoolctl==3.1.0 +tqdm==4.64.1 +typing_extensions==4.1.1 +zipp==3.6.0 diff --git a/src/gui_version/RandomForest_gui.py b/src/gui_version/RandomForest_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..fa1fa5b6133996db1df3a0fc0923b40b8432c9e1 --- /dev/null +++ b/src/gui_version/RandomForest_gui.py @@ -0,0 +1,831 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import netCDF4 as nc +import pickle as pkl +import os +import logging + +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score +from joblib import delayed, Parallel +from tkinter import Label + +from utilities.ncfile_generation import generate_ncfile +from utilities.strings_for_ncfile import char_to_string + +class prepare_data: + + """ + This class prepares the data to be + used in the Random Forest classifier. + """ + + def __init__(self, master, aim, log=None): + + self.master = master + self.logger = log + self.row = 0 + self.import_parameters() + #self.save_log() + self.logger.info("Susceptibility/hazard map generation started") + + self.master.geometry() + self.master.winfo_toplevel().title("Map generation") + + Label(self.master, text="Log:").grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="Map generation started").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + aim = aim + invalid = False + if aim == 'train_test': + invalid = False + self.logger.info("Train the model") + elif aim == 'prediction': + invalid = False + self.logger.info("Prepare the susceptibility/hazard map") + else: + self.logger.info( + "Not a valid command. 
Enter train_test or prediction") + invalid = True + + if not invalid: + self.test_size = self.properties_map['size_val'] + self.label_name = self.properties_map['name_label'] + self.xy = pd.DataFrame() + + if aim == 'train_test': + self.import_features_labels() + self.split_training_testing() + elif aim == 'prediction': + self.import_features() + + def import_parameters(self): + + with open('tmp_map.pkl', 'rb') as handle: + self.properties_map = pkl.load(handle) + + with open('tmp_settings.pkl', 'rb') as handle: + self.properties_settings = pkl.load(handle) + + if self.properties_map['drop_pred'] == '': + self.not_included_pred_data = [] + else: + self.not_included_pred_data = self.properties_map[ + 'drop_pred'].split(',') + + if self.properties_map['drop_train'] == '': + self.not_included_train_data = [] + else: + self.not_included_train_data = self.properties_map[ + 'drop_train'].split(',') + + def import_features(self): + + """ + Imports the features for prediction. + """ + + # Import prediction dataset either as csv file or nc file + if self.properties_map['pred_path'].split('.')[-1] == 'csv': + self.features = pd.read_csv(self.properties_map['pred_path']) + + elif self.properties_map['pred_path'].split('.')[-1] == 'nc': + ds = nc.Dataset(self.properties_map['pred_path']) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.feature_list = char_to_string(pred_features) + if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list: + self.features = pd.DataFrame(pred, columns=self.feature_list) + else: + self.features = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list) + + self.dropped = ds['Dropped'][:].data + #print(type(self.dropped[0])) + self.dropped = [int(x) for x in self.dropped] + + # Save the prediction coordinates in the prediction dataset + self.xy['ycoord'] = self.features['ycoord'] + self.xy['xcoord'] = self.features['xcoord'] + + # Remove all features that shall not be included + # in prediction from DataFrame (see settings!) + if len(self.not_included_pred_data) > 0: + for dataset in self.not_included_pred_data: + self.features = self.features.drop(dataset, axis=1) + + # Determine which classes are contained in the categorical features + # It is distinguished between one-hot and ordinal encoded features + self.categorical_classes = {} + cat_subset = [feat for feat in self.features.columns.tolist() if '_encode' in feat] + df_sub = self.features[cat_subset] + cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()] + for feat in list(set(cat_feat)): + classes = [] + if cat_feat.count(feat)>1: + classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) + else: + classes = list(set(df_sub[feat + '_encode'].tolist())) + self.categorical_classes[feat] = {} + self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] + self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) + + self.feature_list = list(self.features.columns) + #self.features = np.array(self.features) + self.features_org = self.features.copy() + self.logger.info('Features imported') + self.logger.info('The following ' + str(len(self.feature_list)) + + ' features are included in the prediction dataset: ' + + str(self.feature_list)) + + Label(self.master, text="Features successfully imported").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def import_features_labels(self): + + """ + Imports the features for training. 
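+        The dataset is read from the csv file provided by the user, the
+        label column is split off, coordinates and ID are removed and all
+        features listed under 'drop_train' are excluded before the
+        categorical classes are determined.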
+ """ + + # Import training dataset as csv file + self.features = pd.read_csv(self.properties_map['train_path']) + # Extract and remove labels from training dataset + self.labels = np.array( + self.features[self.label_name]).reshape( + [np.shape(self.features[self.label_name])[0], 1]) + self.features = self.features.drop(self.label_name, axis=1) + + # Store coordinates from training data + self.xy['ycoord'] = self.features['ycoord'] + self.xy['xcoord'] = self.features['xcoord'] + + self.features = self.features.drop(['xcoord', 'ycoord'], axis=1) + + # Drop ID from training data + self.features = self.features.drop('ID', axis=1) + + # Remove all features that shall not be included in + # training from DataFrame (see settings!) + if len(self.not_included_train_data) > 0: + for dataset in self.not_included_train_data: + self.features = self.features.drop(dataset, axis=1) + + # Determine which classes are contained in the categorical features + # It is distinguished between one-hot and ordinal encoded features + self.categorical_classes = {} + cat_subset = [feat for feat in self.features.columns.tolist() if '_encode' in feat] + df_sub = self.features[cat_subset] + cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()] + for feat in list(set(cat_feat)): + classes = [] + if cat_feat.count(feat)>1: + classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) + else: + classes = list(set(df_sub[feat + '_encode'].tolist())) + self.categorical_classes[feat] = {} + self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] + self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) + + self.feature_list = list(self.features.columns) + self.features = np.array(self.features) + + self.logger.info('Features imported') + self.logger.info('The following ' + str(len(self.feature_list)) + + ' features are included in the training dataset: ' + + str(self.feature_list)) + + Label(self.master, + text="Features and label successfully imported").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def split_training_testing(self): + + """ + Splits the training data into training and validation data. 
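+        The split is stratified on the labels and controlled by the
+        validation ratio and the random seed defined in the settings.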
+ """ + + self.train_features, self.test_features, self.train_labels, self.test_labels = \ + train_test_split( + self.features, + self.labels, + test_size=self.test_size, + random_state=int(self.properties_settings['random_seed']), + stratify=self.labels) + + self.logger.info('Validation dataset split from training dataset') + + Label(self.master, text="Training dataset splitted").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + +class RandomForest(prepare_data): + + def __init__(self, master, aim, parallel=False, log=None): + + super().__init__(master, aim, log=log) + self.aim = aim + self.logger = log + self.parallel = parallel + self.num_chunks = 10 + # Random Forest settings + self.criterion = self.properties_map['criterion'] + self.n_estimators = self.properties_map['num_trees'] + self.max_depth = self.properties_map['depth_trees'] + + self.logger.info('Aim: ' + str(aim)) + + if aim == 'prediction': + self.model_dir = self.properties_map['model_path'] + '/' + self.model_to_load = self.properties_map['model_to_load'] + '/' + else: + self.model_dir = self.properties_map['model_path'] + '/' + self.model_to_save = self.properties_map['model_to_load'] + '/' + self.output_dir = None + + if aim == 'train_test': + + Label(self.master, text="Model training started").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.define() + self.train() + self.predict() + self.evaluate() + self.create_output_dir() + self.save_model() + self.save_parameters() + self.feature_importance() + + elif aim == 'prediction': + + Label(self.master, text="Mapping started").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.create_output_dir() + self.load_model() + if not self.error: + self.predict() + self.extract_pos_neg_predictions() + self.reshape_prediction() + self.save_prediction() + + def define(self): + + """ + Define the Random Forest Classifier model. + """ + + self.model = RandomForestClassifier( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + random_state=int(self.properties_settings['random_seed'])) + + self.logger.info('Parameters: ' + + str(self.n_estimators) + ' (Num. estimators) ' + '|' + + str(self.max_depth) + ' (Depth) ' + '|' + + ' (Random seed) ' + '|' + + str(self.criterion) + ' (Criterion) ' + '|' + + str(self.test_size) + ' (Splitting ratio) ' + '|') + + Label(self.master, text="Model is defined").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def train(self): + + """ + Train the Random Forest Classifier model. + """ + + self.model.fit(self.train_features, np.ravel(self.train_labels)) + self.logger.info('Model trained') + + Label(self.master, text="Model successfully trained").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def predict(self): + + """ + Make the prediction. 
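+        For 'prediction' the imported prediction features are used, for
+        'train_test' the validation features. If parallel mode is active,
+        the dataset is split into chunks that are predicted in parallel
+        with joblib.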
+ """ + + if self.aim == 'prediction': + pred = self.features + elif self.aim == 'train_test': + pred = self.test_features + + if self.parallel: + self.split_array_into_chunks() + prediction = Parallel(n_jobs=10)(delayed( + self.model.predict)(chunk) for chunk in self.chunks) + self.prediction = np.concatenate(prediction, axis=0) + else: + self.prediction = self.model.predict(pred) + + if self.aim == 'prediction': + self.logger.info('Prediction completed') + + Label(self.master, text="Prediction completed").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + elif self.aim == 'train_test': + + Label(self.master, text="Validation data predicted").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Validation data predicted') + + def split_array_into_chunks(self): + + """ + Split a NumPy array into chunks without changing the number of columns. + + """ + + # Calculate the number of rows in each chunk + rows_per_chunk = self.features.shape[0] // self.num_chunks + remaining_rows = self.features.shape[0] % self.num_chunks + + # Create chunks + self.chunks = [] + start = 0 + for i in range(self.num_chunks): + end = start + rows_per_chunk + (1 if i < remaining_rows else 0) + chunk = self.features[start:end, :] + self.chunks.append(chunk) + start = end + + self.logger.info('Prediction dataset split into chunks') + + Label(self.master, text="Prediction dataset split into chunks").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def evaluate(self): + + """ + Evaluate the validation dataset. + """ + + y_pred_prob = self.model.predict_proba(self.test_features)[:, 1] + self.fpr, self.tpr, self.thresholds = roc_curve(self.test_labels, y_pred_prob) + + self.roc_auc = auc(self.fpr, self.tpr) + + diff = [abs(pred-test_labels) + for pred, test_labels + in zip(list(self.prediction), list(self.test_labels))] + self.acc = str(diff.count(1)) + '/' + str(len(diff)) + self.mae = round(np.mean(diff), 2) + self.mse = mean_squared_error(self.test_labels, self.prediction) + self.f1 = f1_score(self.test_labels, self.prediction) + self.fbeta = fbeta_score(self.test_labels, self.prediction, beta=2) + + self.logger.info('Evaluation metrics computed') + + Label(self.master, text="Evaluation metrics computed").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def create_output_dir(self): + + """ + Define and create the output directory. + """ + + if self.aim == 'train_test': + self.output_dir = self.model_dir + self.model_to_save + else: + self.output_dir = self.model_dir + self.model_to_load + + if not os.path.isdir(self.output_dir): + os.mkdir(self.output_dir) + + def save_model(self): + + """ + Save the Random Forest Classifier model. + """ + + with open(self.output_dir + '/saved_model.pkl', 'wb') as file: + pkl.dump(self.model, file) + + self.logger.info('Model saved to ' + + self.output_dir + + '/saved_model.pkl') + + Label(self.master, text="Model saved").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def save_parameters(self): + + """ + Save the metadata associated with the prediction. 
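+        The covered area, the Random Forest hyperparameters, the feature
+        list, the evaluation metrics and the categorical class overview
+        are stored in model_params.pkl next to the saved model.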
+ """ + + tmp_max = self.xy.max(axis=0) + tmp_min = self.xy.min(axis=0) + + params = {'Area': [tmp_min[0], tmp_max[0], tmp_min[1], tmp_max[1]], + 'criterion': [self.criterion], + 'n_estimators': [self.n_estimators], + 'max_depth': [self.max_depth], + 'features': self.feature_list, + 'mse': self.mse, + 'mae': self.mae, + 'f1': self.f1, + 'roc_threshold': self.thresholds, + 'roc_fpr': self.fpr, + 'roc_tpr': self.tpr, + 'roc_auc': self.roc_auc, + 'accuracy': self.acc, + 'fbeta': self.fbeta, + 'categories': self.categorical_classes + } + + with open(self.model_dir + + self.model_to_save + + 'model_params.pkl', 'wb') as file: + pkl.dump(params, file) + + self.logger.info('Parameters saved to ' + + self.model_dir + + self.model_to_save + + 'model_params.pkl') + + Label(self.master, text="Parameters saved").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def adapt_categorical_features(self, train_classes): + + """ + Assure that identical categorical features are used in training + and prediction dataset + + The encoded features in the training and prediction dataset are + compared regarding the contained classes. Depending on the user + input, instances in the prediction dataset with classes that are + not included in the training dataset are either set to no_value or + nevertheless considered in the prediction. The surplus additional + features are removed either way to achieve the same set of features + as in the training dataset + + Input: + train_classes: dictionary containing for each categorical feature + all classes and the number of total classes + contained in the training dataset + + Output: + None + """ + + Label(self.master, text="Categorical features are compared between training and prediction dataset").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.instances_to_drop = [] + for feat in list(train_classes.keys()): + if feat not in list(self.categorical_classes.keys()): + + Label(self.master, text='Categorical feature ' + feat + ' not in prediction dataset').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + Label(self.master, text='Error: cannot proceed with mapping').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset') + self.error = True + else: + if (train_classes[feat]['num_cols'] < self.categorical_classes[feat]['num_cols']) or (set(train_classes[feat]['classes']) != set(self.categorical_classes[feat]['classes'])): + + Label(self.master, text=feat + ': Prediction dataset contains more or other classes than training dataset').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + Label(self.master, text='Apply user defined handling approach').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.warning(feat + ': Prediction dataset contains more classes than training dataset') + self.logger.info('Apply user defined handling approach') + + common_elements = set(train_classes[feat]['classes']).intersection(set(self.categorical_classes[feat]['classes'])) + if self.properties_map['keep']: + if len(common_elements) == 0: + + Label(self.master, text='Error: no common classes for ' + feat + ' in training and prediction dataset').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.error('Error: no common classes for ' + feat + ' in training 
and prediction dataset') + self.error = True + else: + to_drop = [feat + '_' + f + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] + self.features = self.features.drop(to_drop, axis=1) + self.feature_list = self.features.columns.tolist() + elif self.properties_map['remove_instances']: + to_drop_col = [feat + '_' + f + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] + to_drop_row = [] + for col in to_drop_col: + to_drop_row = to_drop_row + self.features.index[self.features[col] == 1].tolist() + self.features = self.features.drop(to_drop_col, axis=1) + + Label(self.master, text='Not matching features have been removed').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Not matching features have been removed') + + self.feature_list = self.features.columns.tolist() + self.instances_to_drop = self.instances_to_drop + to_drop_row + + Label(self.master, text='Instances to consider during mapping have been adapted').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Instances to consider during mapping have been adapted') + + Label(self.master, text='Categorical features have been handled and hamonised').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Categorical features have been handled and hamonised') + self.logger.info('Remaining features: ' + str(self.feature_list)) + + def load_model(self): + + """ + Load the Random Forest Classifier model and the metadata. + Make sure to compare features of training and prediction dataset + as well as their order. + """ + + with open( + self.model_dir + + self.model_to_load + + 'saved_model.pkl', 'rb') as file: + self.model = pkl.load(file) + + with open( + self.model_dir + + self.properties_map['model_to_load'] + + '/model_params.pkl', 'rb') as f: + params = pkl.load(f) + features = params['features'] + self.error = False + self.adapt_categorical_features(params['categories']) + + if not self.error: + if len(self.feature_list) == len(features): + if set(self.feature_list) != set(features): + + Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.error('Error: Not all features of the model are contained in the prediction dataset') + + self.error = True + elif self.feature_list != features: + + Label(self.master, text='The order or features differs. Prediction features are reordered').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('The order or features differs. 
Prediction features are reordered') + + self.features = self.features[features] + if self.features.columns.tolist() != features: + + Label(self.master, text='There is still something wrong with the order of the features!').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + self.error = True + elif self.feature_list == features: + + Label(self.master, text='Prediction and training dataset have the same order').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Prediction and training dataset have the same order') + elif len(self.feature_list) < len(features): + + Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.error('Error: Not all features of the model are contained in the prediction dataset') + + self.error = True + elif len(self.feature_list) > len(features): + if set(features).issubset(self.feature_list): + to_drop = list(set(self.feature_list)-set(features)) + self.features_org = self.features_org.drop(to_drop, axis=1) + self.features_org = self.features_org[features] + if self.features_org.columns.tolist() != features: + Label(self.master, text='There is still something wrong with the order of the features!').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + self.error = True + else: + self.features = self.features_org.to_numpy() + self.feature_list = self.features_org.columns.tolist() + + Label(self.master, text='Features in the prediction dataset which were not used for training were removed').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + Label(self.master, text='Features in the prediction dataset were sorted to match the training features').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.warning('Features in the prediction dataset which were not used for training were removed') + self.logger.info('Features left: ' + str(self.feature_list)) + self.logger.info('Features in the prediction dataset were sorted to match the training features') + else: + Label(self.master, text='Error: Not all features of the model are contained in the prediction dataset').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.error('Error: Not all features of the model are contained in the prediction dataset') + + self.error = True + if not self.error: + self.feature_list = self.features.columns.tolist() + self.features = self.features.to_numpy() + + + self.logger.info('Model loaded from ' + + self.model_dir + + self.model_to_load) + + Label(self.master, text=('Model loaded from ' + + self.model_dir + + self.model_to_load)).grid(row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + Label(self.master, text="Model successfully loaded").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def save_prediction(self): + + """ + Save the prediction. 
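+        The combined predictions as well as the positive and negative
+        subsets are written to separate csv files in the model directory.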
+ """ + + if self.aim == 'prediction': + output_dir = self.model_dir + self.model_to_load + + self.xy.to_csv( + output_dir + 'prediction_results.csv', + columns=['xcoord', 'ycoord', 'pred'], + index=True) + self.df_pos.to_csv( + output_dir + 'pos_prediction_results.csv', + columns=['xcoord', 'ycoord', 'pred'], + index=True) + self.df_neg.to_csv( + output_dir + 'neg_prediction_results.csv', + columns=['xcoord', 'ycoord', 'pred'], + index=True) + + self.logger.info('Prediction saved in ' + output_dir) + + Label(self.master, text="Prediction saved as csv-file").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def reshape_prediction(self): + + """ + Reshape the individual predictions into a map. + """ + dropped = list(set(self.dropped + self.instances_to_drop)) + arr_xy = np.array(self.xy) + arr_xy[dropped, :] = [self.properties_settings['no_value']] + + result = np.reshape(list(arr_xy[:, 2]), + (len(list(set(self.xy['ycoord']))), + len(list(set(self.xy['xcoord']))))) + + self.logger.info('Map generated') + + Label(self.master, text="Prediction reshaped into map").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.save_prediction_as_nc(result) + + def extract_pos_neg_predictions(self): + + """ + Distinguish between the classes of the Classifier. + """ + + self.xy['pred'] = self.prediction + self.df_pos = self.xy[self.xy.pred == 1] + self.df_neg = self.xy[self.xy.pred == 0] + + self.logger.info('Positive and negative predictions extracted') + + def save_prediction_as_nc(self, prediction): + + """ + Save the hazard map to a netCDF4 file. + """ + + outfile_name = self.model_dir + self.model_to_load + 'prediction.nc' + + if os.path.exists(outfile_name): + os.remove(outfile_name) + + generate_ncfile(outfile_name, + np.array(sorted(set(list(self.xy['xcoord'])))), + np.array(sorted(set(list(self.xy['ycoord'])))), + prediction, + crs=self.properties_settings['crs'], + missing_value=self.properties_settings['no_value']) + + self.logger.info('Map saved in ' + outfile_name) + + Label(self.master, text="Map saved as nc-file").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def feature_importance(self): + + """ + Access feature importance information from the Random Forest. + """ + + feature_imp = pd.Series(self.model.feature_importances_, + index=self.feature_list).sort_values( + ascending=False) + + feature_imp.to_csv(self.model_dir + + self.model_to_save + + 'feature_importance.csv') + + self.logger.info('Feature importance determined') + + Label(self.master, text="Feature importance computed").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() diff --git a/src/gui_version/check_user_input.py b/src/gui_version/check_user_input.py new file mode 100644 index 0000000000000000000000000000000000000000..03c418b573b3e17aefe34d11301372a69f8ac719 --- /dev/null +++ b/src/gui_version/check_user_input.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pickle +import pandas as pd +import os +import logging + +from utilities.initialise_log import save_log + + +class check_general_settings(): + + """ + This class imports the temporary pickle files saved with the + information provided by the user through the GUI and performs quality + control on the provided information. + + The class instance error then decides whether SHIRE aborts the run + or if the next step is initialised. 
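+
+    A minimal usage sketch (assuming the tmp_*.pkl files written by the
+    GUI are present in the working directory):
+
+        check = check_general_settings()
+        if check.error:
+            # at least one check failed, abort the run
+            ...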
+ + The information on the results of the quality check is saved in a log + file which stored either at the location of the training dataset or + the prediction dataset. + """ + + def __init__(self): + + path = ['tmp_settings.pkl', 'tmp_map.pkl', 'tmp_train.pkl', 'tmp_pred.pkl'] + self.error = False + + save_path = 'check_user_input.log' + + if os.path.exists(save_path): + os.remove(save_path) + + self.logger = save_log(save_path) + self.logger.info("Start checking user input") + self.open_reference() + + for self.path in path: + self.logger.info('Check: ' + self.path) + if os.path.exists(self.path): + self.logger.info(self.path + ': available') + self.open_file() + self.check_parameters() + + if self.path == path[3]: + self.check_coordinates() + if self.path == path[3] or self.path == path[2]: + self.check_path_extension_geosummary() + else: + self.logger.info(self.path + ': not available') + + self.logger.info('Check completed') + for handler in self.logger.handlers: + handler.close() + self.logger.removeHandler(handler) + + def open_file(self, path=None): + + if path == None: + path = self.path + + with open(path, 'rb') as handle: + self.params = pickle.load(handle) + + if path == None: + self.logger.info('Parameters loaded') + + def open_reference(self): + + self.ref = pd.read_csv('./utilities/properties_user_input.csv') + self.logger.info('Reference imported') + + def compare_type(self, key, types): + + type_map = { + 'int': int, + 'float': float, + 'str': str, + 'list': list, + 'dict': dict, + 'bool': bool + } + + types = types.split(',') + types = [type_map.get(i) for i in types] + + if type(self.params[key]) not in types: + self.logger.error(key + ': Wrong parameter type provided! Provide: ' + str(types)) + self.error = True + + def compare_extension(self, key, ext): + + if len(self.params[key].split('.')) != 2: + self.logger.error(key + ': Path names should not contain full stops!') + self.error = True + + ext = ext.split(',') + + if self.params[key].split('.')[1] not in ext: + self.logger.error(key + ': Wrong file format provided! Provide: ' + str(ext)) + self.error = True + + def compare_range(self, key, r): + + r = r.split(',') + + if r[1] == 'inf': + if self.params[key] < float(r[0]): + self.logger.error(key + ': Value not within range!') + self.error = True + else: + if self.params[key] < float(r[0]) or self.params[key] > float(r[1]): + self.logger.error(key + ': Value not within range!') + self.error = True + + def check_coordinates(self): + + if self.params['south'] >= self.params['north']: + if self.params['south'] == self.params['north']: + self.logger.error('Careful! South coordinate indentical to north coordinate!') + else: + self.logger.error('Careful! South coordinate north of north coordinate!') + self.error = True + + if self.params['west'] >= self.params['east']: + if self.params['west'] == self.params['east']: + self.logger.error('Careful! West coordinate identical to east coordinate!') + self.error = True + else: + if ((self.params['west'] < 0 and self.params['west'] > -10) and (self.params['east'] > 0 and self.params['east'] < 10)) or ((self.params['west'] > 0 and self.params['west'] > 170) and (self.params['east'] < 0 and self.params['east'] < -170)): + self.logger.warning('Careful! Please check east and west coordinates!') + else: + self.logger.error('Careful! 
West coordinate east of east coordinate!') + self.error = True + + def check_file_exist(self, key, path): + + if not os.path.exists(path) and not os.path.isdir(os.path.dirname(path)): + self.logger.error(key + ': Path or file does not exist!') + self.error = True + + def check_path_extension_geosummary(self): + + self.logger.info('Check paths in geospatial data summary') + summary = pd.read_csv(self.params['geo_path']) + keys_to_include = pd.read_csv(self.params['feat_path']) + for key in list(keys_to_include['keys_to_include']): + idx = list(summary['keys']).index(key) + + if summary.at[idx, 'path'].split('.')[1] not in ['nc', 'tif', 'tiff']: + self.logger.error(key + ': Wrong file format!') + self.error = True + + if not os.path.exists(summary.at[idx, 'path']): + self.logger.error(key + ': File cannot be found!') + self.error = True + + def check_parameters(self): + + ref_keys = self.ref['key'].tolist() + for key in list(self.params.keys()): + idx = ref_keys.index(key) + self.logger.info('Check ' + key + ' | is path: ' + str(self.ref.at[idx, 'path']) + ' | Range: ' + str(self.ref.at[idx, 'range']) + ' | Extension: ' + str(self.ref.at[idx, 'extension']) + ' | Type: ' + str(self.ref.at[idx, 'type'])) + if self.ref.at[idx, 'path'] == 1: + #print(self.ref.at[idx, 'path']) + self.check_file_exist(key, self.params[key]) + if self.ref.at[idx, 'range'] != 'None': + self.compare_range(key, self.ref.at[idx, 'range']) + if self.ref.at[idx, 'extension'] != 'None' and self.ref.at[idx, 'path'] == 1: + self.compare_extension(key, self.ref.at[idx, 'extension']) + if self.ref.at[idx, 'type'] != 'None': + self.compare_type(key, self.ref.at[idx, 'type']) + + \ No newline at end of file diff --git a/src/gui_version/create_prediction_data_gui.py b/src/gui_version/create_prediction_data_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..499fa91f3071b0b3a15b7ca51c347da9e44177df --- /dev/null +++ b/src/gui_version/create_prediction_data_gui.py @@ -0,0 +1,581 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import netCDF4 as nc +import os +import pickle +import logging + +from tqdm import tqdm +from tkinter import Label + +from data_preprocessing_gui import generate_data_matrix +from utilities.ncfile_generation import generate_basic_ncfile +from utilities.strings_for_ncfile import features_to_char, char_to_string +from utilities.handle_categorical_values import handle_categorical_values + +class create_prediction_data: + + """ + This class creates the prediction data + for the Random Forest classifier. + + Input: + from_scratch: boolean, + True if prediction dataset should be generated from + scratch, otherwise false + delete: True if dataset/feature should be + deleted from prediction dataset + False if dataset should be added to existing + prediction dataset + (careful: from_scratch needs to be False!) 
+ data_to_handle: list of features that should be added/deleted + datasets need to be listed in list_of_raw_datasets + + Output: + netCDF4 file + """ + + def __init__(self, master, log=None): + + self.logger = log + self.import_parameters() + + self.row = 0 + + self.master = master + self.master.geometry() + self.master.winfo_toplevel().title("Prediction dataset generation") + self.logger.info('Prediction dataset generation started') + + Label(self.master, text="Log:").grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, + text="Prediction dataset generation started").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + if self.from_scratch: + self.logger.info('Prediction dataset is generated from scratch') + Label(self.master, + text='Cube of interpolated datasets is generated').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.s = generate_data_matrix( + from_scratch=True, + delete=False, + dataset='prediction', + bb=self.bb, + data_to_handle=self.data_to_handle, + geo_overview=self.datasets_summary, + settings=self.properties_settings, + settings_train_pred=self.properties_pred) + self.logger.info('Cube of interpolated datasets has been generated') + Label(self.master, + text='Cube of interpolated datasets has been generated').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.import_cube() # Import data cube + + if not self.no_dataset_found: + Label(self.master, + text='Cube of interpolated datasets has been imported').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Add coordinate information to + # prediction dataset for map generation + self.add_coordinates() + Label(self.master, + text='Coordinates have been added').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Flatten data cube for efficient information extraction + self.flatten_cube() + Label(self.master, + text='Cube has been flattened').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.clean_df() # Determine no value instances in DataFrame + Label(self.master, + text='Dataset has been cleaned').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Save prediction dataset + self.handle_categorical_features() + Label(self.master, + text='Categorical features have been handled').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.char_features = features_to_char(self.df_pred.columns) + self.save_as_nc() + Label(self.master, + text='Prediction dataset generation successful').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + else: + Label(self.master, + text='Error: Cube of interpolated datasets has not been found!').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + + elif not self.from_scratch and not self.delete: + self.logger.info( + 'A feature will be added to existing prediction dataset') + self.logger.info('Feature to add: ' + str(self.data_to_handle)) + Label(self.master, + text='Feature(s) will be added to an existing prediction dataset').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Import existing prediction dataset + self.import_prediction_dataset() + if self.pred_exist: + Label(self.master, + text='Prediction dataset has been imported').grid( + row=self.row, column=1) + + self.row = self.row 
+ 1 + self.master.update() + # Import data cube that contains + # cut and interpolate dataset to be added + self.import_cube() + if not self.no_dataset_found: + Label(self.master, + text='Cube of interpolated datasets has been imported').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Check if datasets to be added are contained in the data cube + feature_to_add = [] + for feature in self.data_to_handle: + not_included = False + if feature not in self.features: + self.logger.info( + str(feature) + + ' not included in data cube,\ + it has to be added first') + Label(self.master, + text=str(feature) + ' not included in cube').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + not_included = True + feature_to_add.append(feature) + + #if not_included: + if len(feature_to_add): + self.logger.info(str(feature_to_add) + + ' will be appended to the data cube') + Label(self.master, + text=str(feature_to_add) + ' is appended to the cube').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.s = generate_data_matrix( + from_scratch=False, + delete=False, + dataset='prediction', + bb=self.bb, + data_to_handle=feature_to_add, + geo_overview=self.datasets_summary, + settings=self.properties_settings, + settings_train_pred=self.properties_pred, + keys_already_included=self.features) + self.logger.info('Data cube has been updated') + Label(self.master, + text='Cube has been updated').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.import_cube() + if not self.no_dataset_found: + self.add_feature() # Add feature + Label(self.master, + text='Feature(s) added to prediction dataset').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Save prediction dataset + self.clean_df() + Label(self.master, + text='Prediction dataset has been cleaned').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.handle_categorical_features(var=self.data_to_handle) + Label(self.master, + text='Categorical features have been handled').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.char_features = features_to_char(self.df_pred.columns) + self.save_as_nc() + Label(self.master, + text='Prediction dataset successfully updated').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + else: + Label(self.master, + text='Cube of interpolated datasets has not been found').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + else: + Label(self.master, + text='Error: Cube of interpolated datasets has not been found').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + + elif not self.from_scratch and self.delete: + self.logger.info( + 'A feature will be delted from existing prediction dataset') + self.logger.info('Feature to delete: ' + str(self.data_to_handle)) + Label(self.master, + text='Feature(s) will be deleted from existing prediction dataset').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + Label(self.master, + text='Feature(s) to delete: ' + str(self.data_to_handle)).grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Import existing prediction dataset + self.import_prediction_dataset() + if self.pred_exist: + Label(self.master, + text='Existing prediction dataset imported').grid( + row=self.row, column=1) + 
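+                # advance the log row and refresh the GUI window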
+ self.row = self.row + 1 + self.master.update() + self.delete_features() # Delete features from prediction dataset + Label(self.master, + text='Feature(s) deleted').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + # Save prediction dataset + self.char_features = features_to_char(self.df_pred.columns) + self.save_as_nc() + Label(self.master, + text='Prediction dataset successfully updated').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + + def import_parameters(self): + + if os.path.exists('tmp_pred.pkl') and os.path.exists('tmp_settings.pkl'): + with open('tmp_pred.pkl', 'rb') as handle: + self.properties_pred = pickle.load(handle) + self.datasets_summary = pd.read_csv( + self.properties_pred['geo_path']) + self.data_to_handle = pd.read_csv( + self.properties_pred['feat_path']) + self.keys = pd.read_csv( + self.properties_pred['geo_path']) + self.keys = list(self.keys['keys']) + if self.properties_pred['from_scratch'] == 1: + self.from_scratch = True + self.data_to_handle = list( + self.data_to_handle['keys_to_include']) + else: + self.from_scratch = False + self.data_to_handle = list( + self.data_to_handle['keys_to_include']) + + if self.properties_pred['delete'] == 1: + self.delete = True + if self.properties_pred['add'] == 1: + self.delete = False + + self.bb = [self.properties_pred['north'], + self.properties_pred['south'], + self.properties_pred['west'], + self.properties_pred['east']] + + with open('tmp_settings.pkl', 'rb') as handle: + self.properties_settings = pickle.load(handle) + self.properties_settings['pred_path'] = \ + self.properties_pred['pred_path'] + else: + Label(self.master, + text='Error: user input files not found!').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.logger.error('Error: user input files not found!') + + + def add_feature(self): + + """ + Add feature to the prediction dataset + """ + + self.logger.info('Feature will be added') + + for count, key in enumerate(self.data_to_handle): + # Delete feature if it already exists in training dataset + if key in self.df_pred.columns: + self.logger.info('Feature already exists in dataset.\ + Existing feature is deleted') + self.df_pred = self.df_pred.drop(key, axis=1) + + self.logger.info('Adding ' + str(key)) + Label(self.master, + text='Adding ' + str(key)).grid(row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + if count == 0: + # Create empty DataFrame + self.df_features = pd.DataFrame(index=range(len(self.df_pred)), + columns=self.data_to_handle) + + data_flat = self.cube[:, :, self.features.index(key)].flatten() + self.df_features[key] = data_flat + + # Combine old training dataset with additional features + self.df_pred = pd.concat([self.df_pred, self.df_features], axis=1) + # Adapt column order + self.logger.info( + 'Prediction dataset contains the following features: ' + + str(list(self.df_pred.columns))) + Label(self.master, + text='Prediction dataset contains the following features: ' + + str(list(self.df_pred.columns))).grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + + def import_cube(self): + + """ + Import cut and interpolated data cube + which was created in pre-processing.py + """ + + self.logger.info('Data cube is being imported') + + # Path to the stored data cube (see data_preprocessing.py) + folder = self.properties_pred['pred_path'].rsplit('/', 1)[0] + path = folder + '/data_combined_prediction_' + 
str(self.properties_settings['resolution']) + '.nc' + + # Check if path exists and import the cube + # as well as list of datasets it contains + if not os.path.exists(path): + self.logger.error('Error: Dataset not found!') + self.no_dataset_found = True + else: + self.no_dataset_found = False + ds = nc.Dataset(path) + self.cube = ds['Result'][:, :, :].data + self.x = ds['Longitude'][:].data + self.y = ds['Latitude'][:].data + self.pred_features = ds['features'][:].data + self.features = char_to_string(self.pred_features) + + self.logger.info('Features included in dataset: ' + + str(self.features)) + Label(self.master, + text='Features included in prediction dataset: ' + + str(self.features)).grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + + def flatten_cube(self): + + """ + Flatten the individual datasets of the data cube + """ + + self.logger.info('Data cube is flattened') + # Go through all datasets in the data cube + for i in tqdm(range(np.shape(self.cube)[2])): + data = self.cube[:, :, i] + data_flat = data.flatten() # Flatten the dataset + # Save it to the DataFrame + self.df_pred[self.features[i]] = data_flat + + def add_coordinates(self): + + """ + Add coordinate for which the model shall + make an prediction to the DataFrame. + """ + + self.logger.info('Coordinates are added') + self.df_pred = pd.DataFrame(columns=['xcoord', 'ycoord'] + + self.features) + self.X, self.Y = np.meshgrid(self.x, self.y) + + data_flat = self.X.flatten() + self.df_pred['xcoord'] = data_flat + data_flat = self.Y.flatten() + self.df_pred['ycoord'] = data_flat + + def clean_df(self): + + """ + Clean the DataFrame from rows with no data values + """ + + self.logger.info('Prediction dataset is being cleaned') + + self.idx = [] + for i in tqdm(range(len(self.df_pred.to_numpy()))): + if (self.properties_settings['no_value'] in + self.df_pred.to_numpy()[i, :]): + self.idx.append(i) + + # Save information on invalid locations so that they + # can masked out during hazard map generation + self.logger.info(str(len(self.idx)) + + ' rows will be saved to be considered after\ + RF prediction due to invalid data') + Label(self.master, + text='Rows with missing features identified').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + + def delete_features(self): + + """ + Delete feature from prediction dataset + """ + + self.logger.info('Feature is being deleted') + to_drop = [] + for feat in self.data_to_handle: + for col in self.df_pred.columns.tolist(): + if feat in col: + to_drop.append(col) + + self.df_pred.drop(columns=to_drop, inplace=True) + + self.logger.info('Features now included in prediction dataset: ' + + str(list(self.df_pred.columns))) + + def import_prediction_dataset(self): + + """ + Import existing prediction dataset + """ + + if os.path.exists(self.properties_pred['pred_path']): + self.pred_exist = True + self.logger.info('Import existing prediction dataset') + + ds = nc.Dataset(self.properties_pred['pred_path']) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.features = char_to_string(pred_features) + self.idx = ds['Dropped'][:].data + self.df_pred = pd.DataFrame(pred, columns=self.features) + + self.logger.info( + 'Features included in the existing prediction dataset: ' + + str(self.features)) + else: + Label(self.master, + text='Error: no existing prediction dataset found!').grid( + row=self.row, column=1) + + self.row = self.row + 1 + self.master.update() + self.pred_exist = False + + def 
handle_categorical_features(self, var=None): + + """ + Function is called which performs one-hot or ordinal encoding + """ + + basic = ['xcoord', 'ycoord'] + self.df_pred = handle_categorical_values(self.df_pred, + self.datasets_summary, + self.properties_pred['ohe'], + basic, + var) + + to_drop = [] + for col in self.df_pred.columns.tolist(): + if str(self.properties_settings['no_value']) in col: + to_drop.append(col) + + self.df_pred = self.df_pred.drop(to_drop, axis=1) + + def save_as_nc(self): + + """ + Save prediction dataset and information on dropped rows as nc-file + """ + + df_pred = self.df_pred.to_numpy() + + outfile = self.properties_pred['pred_path'] + self.logger.info('Prediction dataset is saved to ' + outfile) + + isExist = os.path.exists(os.path.dirname(outfile)) + if not isExist: + os.makedirs(os.path.dirname(outfile)) + + ds = generate_basic_ncfile(outfile, crs=None) + ds.createDimension('lat', (np.shape(df_pred)[0])) + ds.createDimension('lon', (np.shape(df_pred)[1])) + ds.createDimension('ix', (len(self.idx))) + ds.createDimension('feat', len(self.char_features)) + result = ds.createVariable('Result', 'f4', ('lat', 'lon')) + dropped = ds.createVariable('Dropped', 'u8', 'ix') + Features = ds.createVariable('features', 'S1', 'feat') + result[:, :] = df_pred + dropped[:] = np.array(self.idx) + Features[:] = self.char_features + ds.close() diff --git a/src/gui_version/create_training_data_gui.py b/src/gui_version/create_training_data_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..f7682c3d3e9073b79c14d86aa263a82f2d76a1f1 --- /dev/null +++ b/src/gui_version/create_training_data_gui.py @@ -0,0 +1,1171 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import netCDF4 as nc +import os +import itertools +import pickle +import logging +import tkinter as tk + +from tqdm import tqdm +from LatLon23 import LatLon, Latitude, Longitude +from sklearn.cluster import KMeans +from tkinter import Label, ttk + +from utilities.import_format import import_nc, import_tif +from utilities.cut_and_interpolate_gui import cut_and_interpolate +from utilities.strings_for_ncfile import char_to_string +from utilities.handle_categorical_values import handle_categorical_values +from data_preprocessing_gui import generate_data_matrix + +class create_training_data: + + """ + This class generates the training dataset for + the Random Forest Classifier. + + Input: + from_scratch: boolean, True if training dataset should be + generated from scratch, otherwise False + delete: True if dataset/feature should be deleted from csv-file + False if dataset should be added to existing csv-file + (careful: from_scratch needs to be False!) 
+ data_to_handle: list of features that should be added/deleted + datasets need to be listed in list_of_raw_datasets + cluster: boolean, True if training locations are to be clustered + interpolation: boolean, True if datasets are supposed to be + interpolated before extracting information + preprocessing: list of length equal to data_to_handle, + preprocessing methods for the individual datasets, + can be either 'no_interpolation', 'interpolation' or 'cluster' + + Output: + csv file + """ + + def __init__(self, master, log=None): + + self.num_clusters = 15 + self.df_train = 'not_set_yet' + self.logger = log + self.import_settings() + + self.datasets_summary = pd.read_csv(self.properties_train['geo_path']) + + self.row = 0 + + self.master = master + self.master.geometry() + self.master.winfo_toplevel().title("Training dataset generation") + + Label(self.master, text="Log:").grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="Training dataset generation started").grid( + row=self.row, column=1) + self.logger.info('Training dataset generation started') + self.row = self.row + 1 + self.master.update() + + if self.from_scratch: + # If all datasets are supposed to be preprocessed + # using the same method + #if self.how_to_preprocess is None: + if self.from_scratch: + self.logger.info( + 'Launch training dataset generation according to desired approach') + else: + self.logger.info( + 'Launch feature addition according to desired approach') + if self.cluster is True and self.interpolation is True: + Label(self.master, text="Features to include:").grid( + row=self.row, column=1) + self.logger.info("Features to include:") + self.row = self.row + 1 + Label(self.master, text=self.data_to_handle).grid( + row=self.row, column=1) + self.logger.info(self.data_to_handle) + self.row = self.row + 1 + self.master.update() + self.main_cluster() + elif self.cluster is False and self.interpolation is True: + self.main() + elif self.interpolation is False: + + Label(self.master, text="Features to include:").grid( + row=self.row, column=1) + self.logger.info("Features to include:") + self.row = self.row + 1 + Label(self.master, text=self.data_to_handle).grid( + row=self.row, column=1) + self.logger.info(self.data_to_handle) + self.row = self.row + 1 + self.master.update() + + self.main_no_interpolation() + + elif not self.from_scratch and self.delete: + + self.logger.info('Feature deletion started') + + self.import_input_training_dataset() + self.delete_feature() + self.save_training_data() + + elif not self.from_scratch and not self.delete: + + self.logger.info('Feature addition started') + + self.import_input_training_dataset() + self.add_feature() + self.save_training_data() + + def import_settings(self): + + """ + User input is imported and the settings of the run are defined + """ + + with open('tmp_train.pkl', 'rb') as handle: + self.properties_train = pickle.load(handle) + self.overview = pd.read_csv(self.properties_train['geo_path']) + + if self.properties_train['from_scratch'] == 1: + self.from_scratch = True + self.delete = False + elif self.properties_train['delete'] == 1: + self.from_scratch = False + self.delete = True + if self.properties_train['add'] == 1: + self.from_scratch = False + self.delete = False + + self.data_to_handle = pd.read_csv( + self.properties_train['feat_path']) + self.data_to_handle = list(self.data_to_handle['keys_to_include']) + + if self.properties_train['interpolation'] == 1: + self.interpolation = True + self.cluster = False + 
self.no_interpolation = False + elif self.properties_train['cluster'] == 1: + self.cluster = True + self.interpolation = False + self.no_interpolation = False + elif self.properties_train['no_interpolation'] == 1: + self.cluster = False + self.interpolation = False + self.no_interpolation = True + + with open('tmp_settings.pkl', 'rb') as handle: + self.properties_settings = pickle.load(handle) + self.properties_settings['train_path'] = self.properties_train[ + 'train_path'] + + self.logger.info('User input has been imported') + + + def delete_feature(self): + + """ + Features in data_to_handle are deleted from the training dataset + """ + + to_drop = [] + for feat in self.data_to_handle: + for col in self.df_train.columns.tolist(): + if feat in col: + to_drop.append(col) + + self.df_train.drop(columns=to_drop, inplace=True) + + self.logger.info('Feature deleted') + self.logger.info( + 'Training dataset now includes the following features:') + self.logger.info(str(self.df_train.columns)) + + def import_cube(self): + + """ + Import data cube created in data_preprocessing.py + """ + + self.ds = nc.Dataset(self.s.outfile) + self.x = self.ds['Longitude'][:].data + self.y = self.ds['Latitude'][:].data + self.cube = self.ds['Result'][:, :, :].data + + features = self.ds['features'][:].data + + self.features = char_to_string(features) + + Label(self.master, text="Cube was imported").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + Label(self.master, text="Cube contains the following features:").grid( + row=self.row, column=1) + self.row = self.row + 1 + Label(self.master, text=self.features).grid(row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Cube with feature datasets imported') + self.logger.info('Cube contains the following features:') + self.logger.info(self.features) + + def import_landslide_database(self): + + """ + Import training dataset which needs to be provided as csv file + """ + + if self.properties_train['ls_path'].split('.')[-1] != 'csv': + self.logger.error( + 'Error: Please provide a csv-file as input dataset') + + self.df_train = pd.read_csv(self.properties_train['ls_path']) + + for column in list(self.df_train.columns): + if column not in [self.properties_train['id'], + self.properties_train['x'], + self.properties_train['y']]: + self.df_train.drop(inplace=True, labels=column, axis=1) + self.length_before_cleaning = len(self.df_train) + self.add_nonls_locations() + self.df_train = pd.concat([self.df_train, self.df_absence], + axis=0, + ignore_index=True) + self.df_train = self.df_train.rename( + columns={self.properties_train['x']: 'xcoord', + self.properties_train['y']: 'ycoord', + self.properties_train['id']: 'ID'}) + self.df_train['label'] = self.label_training_data() + + Label(self.master, + text="Historic landslides were imported").grid( + row=self.row, column=1) + self.row = self.row + 1 + + self.logger.info('Landslide inventory imported') + self.logger.info('Currently the following features are included:') + self.logger.info(str(self.df_train.columns)) + + def add_nonls_locations(self): + + """ + Supplement presence data with absence data. It needs to be + pre-generated (see nonls_events_coordinates.py). 
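+        Absence locations are accepted either as a csv file or as a nc
+        file; in both cases they receive IDs of the form
+        'nonls_event_<index>' before being appended to the presence data.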
+ """ + + if self.properties_train['nonls_path'].split('.')[-1] == 'csv': + self.absence = pd.read_csv(self.properties_train['nonls_path']) + + self.absence = self.absence.rename( + columns={ + self.properties_train['x_nonls']: self.properties_train['x'], + self.properties_train['y_nonls']: self.properties_train['y']}) + + nonls_id = [ + 'nonls_event_' + str(i) for i in range(len(self.absence))] + self.absence.insert(0, self.properties_train['id'], nonls_id) + + self.logger.info('Absence locations added') + + elif self.properties_train['nonls_path'].split('.')[-1] == 'nc': + ds = nc.Dataset(self.properties_train['nonls_path']) + + x = ds[self.properties_train['x_nonls']][:].data + y = ds[self.properties_train['y_nonls']][:].data + + self.df_absence = pd.DataFrame(index=range(len(x)), + columns=list(self.df_train.columns)) + + self.df_absence[self.properties_train['id']] = [ + 'nonls_event_' + str(i) for i in range(len(x))] + self.df_absence[self.properties_train['x']] = list(x) + self.df_absence[self.properties_train['y']] = list(y) + + self.logger.info('Absence locations added') + + def label_training_data(self): + + """ + Provide labels to the training data + """ + + label = [int(1) for i in range(len(self.df_train))] + label[(len(label)-len(self.df_absence)):len(label)] = [ + int(0) for i in range(len(self.df_absence))] + + self.logger.info('Training data labeled') + + return label + + def import_input_training_dataset(self): + + """ + Existing training dataset is imported. + """ + + # Check if training dataset exists and import it + if not os.path.exists(self.properties_train['train_path']): + self.logger.error('Training dataset does not exist yet.\ + Please generate from scratch.') + else: + self.df_train = pd.read_csv(self.properties_train['train_path']) + self.features = list(self.df_train.columns) + self.logger.info('Training dataset imported') + self.logger.info('Features included in the training dataset:') + self.logger.info(self.features) + + def add_feature(self): + + """ + Add feature to the training dataset + """ + + self.logger.info('Feature adding started') + + # Features contained in the training dataset + cols = list(self.df_train.columns) + # List of labels for basic information + basic = ['Ereignis-Nr', 'xcoord', 'ycoord', 'label'] + print(self.data_to_handle) + if not self.cluster and self.interpolation: + # Call data_preprocessing.py + self.s = generate_data_matrix( + from_scratch=False, + delete=False, + dataset='training', + data_to_handle=self.data_to_handle, + geo_overview=self.overview, + settings=self.properties_settings, + settings_train_pred=self.properties_train, + keys_already_included=[x for x in cols if x not in basic]) + + # Import generated cube of cut and interpolated datasets + self.import_cube() + + for count, key in enumerate(self.data_to_handle): + # Delete feature if it already exists in training dataset + if key in self.df_train.columns: + self.logger.warning('Feature already exists in Dataset.\ + Existing feature is deleted') + self.df_train = self.df_train.drop(key, axis=1) + + self.logger.info('Adding ' + key + '...') + # Create empty DataFrame + if count == 0: + self.df_features = pd.DataFrame( + index=range(len(self.df_train)), + columns=self.data_to_handle) + + for index, row in self.df_train.iterrows(): + x_ind = int( + (np.abs( + self.x + - row[self.properties_train['x']])).argmin()) + y_ind = int( + (np.abs( + self.y + - row[self.properties_train['y']])).argmin()) + + self.df_features.at[index, key] = self.cube[ + y_ind, + x_ind, + 
self.features.index(key)]
+            print(self.df_train.columns.tolist())
+            print(self.df_features.columns.tolist())
+            # Combine old training dataset with additional features
+            self.df_train = pd.concat([self.df_train, self.df_features],
+                                       axis=1)
+            # Adapt column order
+            self.logger.info('Feature successfully added')
+
+            self.clean_df()
+            self.handle_categorical_features(var=self.data_to_handle)
+            self.logger.info('One-hot encoding completed')
+
+        elif self.cluster:
+            for key in self.data_to_handle:
+                if key in self.df_train.columns:
+                    self.logger.info(
+                        'Feature already exists in dataset.\
+                            Existing feature is deleted')
+                    self.df_train = self.df_train.drop(key, axis=1)
+
+            self.main_cluster()
+            self.logger.info('Feature successfully added')
+        elif not self.cluster and not self.interpolation:
+            self.main_no_interpolation()
+            self.logger.info('Feature successfully added')
+
+        self.logger.info('Training dataset contains the following features:')
+        self.logger.info(str(self.df_train.columns))
+
+    def extract_gridded_info(self):
+
+        """
+        Extraction of the information of the geospatial datasets at all
+        elements of the training dataset.
+
+        If the training data is located within the prediction area and the
+        area is small enough, no further interpolation is necessary.
+        """
+
+        self.logger.info('Extraction of gridded information started')
+
+        self.df_features = pd.DataFrame(index=range(len(self.df_train)),
+                                        columns=self.features)
+
+        # Iterate over all instances of the training dataset
+        # and extract geospatial information
+        k = 0
+        self.open_secondary_window(kind='int', maximum=len(self.df_train))
+        for index, row in tqdm(self.df_train.iterrows()):
+
+            self.progress_var.set(k)
+            self.label.config(text=str(index)
+                              + '/' + str(len(self.df_train))
+                              + ' instance')
+            self.master.update()
+            k += 1
+
+            # Indices of training data elements are determined
+            x_ind = int((np.abs(self.x - row['xcoord'])).argmin())
+            y_ind = int((np.abs(self.y - row['ycoord'])).argmin())
+
+            tmp = list(self.cube[y_ind, x_ind, :])
+            self.df_features.loc[index] = tmp
+
+        self.progress_var.set(k)
+        self.label.config(text=str(index)
+                          + '/' + str(len(self.df_train))
+                          + ' instance')
+        self.master.update()
+
+        Label(self.master, text="Gridded information was extracted").grid(
+            row=self.row, column=1)
+        self.row = self.row + 1
+        self.master.update()
+
+        self.logger.info('Extraction of gridded information completed')
+
+    def check_ls_inventory(self):
+
+        """
+        Rows with missing or NaN values are removed.
+        """
+
+        # isnull() also flags NaN entries, so a single check covers both
+        rows_to_drop = self.df_train[
+            self.df_train.isnull().any(axis=1)].index
+        self.df_train.drop(index=rows_to_drop, inplace=True)
+
+    def clean_df(self):
+
+        """
+        Rows are removed from the dataset where the no_value given in the
+        settings occurs.
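+
+        A roughly equivalent pandas expression (sketch; no_value is the
+        value configured in the general settings):
+
+            no_value = self.properties_settings['no_value']
+            self.df_train = self.df_train[
+                ~(self.df_train == no_value).any(axis=1)]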
+ """ + + self.logger.info('Cleaning of training dataset started') + + Label(self.master, text='Cleaning in progress').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + count = 0 + ind = [] + for col in self.df_train: + ind.append([i for i, x in enumerate( + list(self.df_train[col])) if x == self.properties_settings[ + 'no_value']]) + count = count + 1 + + ind = list(itertools.chain.from_iterable(ind)) + ind = list(set(ind)) + + self.logger.info(str(len(ind)) + + ' rows will be removed due to invalid data') + + Label(self.master, text=str(len(ind)) + + ' rows will be removed due to invalid data').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.df_train.drop(index=ind, inplace=True) + + self.logger.info('Cleaning of training dataset completed') + + Label(self.master, + text='Training dataset contains following features: ').grid( + row=self.row, column=1) + self.row = self.row + 1 + Label(self.master, text=str(list(self.df_train.columns))).grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('CTraining dataset contains the following features:') + self.logger.info(str(list(self.df_train.columns))) + + if not self.from_scratch and len(ind) > 0: + self.logger.warning( + 'Careful! Ratio of absence to presence data might be obscured') + Label(self.master, text='Careful! Ratio might be obscured!').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + def ensure_same_ratio(self): + + self.logger.info( + 'Process to ensure desired ratio\ + of absence to presence data started') + + len_pres = np.shape(self.df_train[self.df_train['label'] == 1])[0] + len_abs = np.shape(self.df_train[self.df_train['label'] == 0])[0] + + if self.properties_train['num_nonls'] == self.length_before_cleaning: + if len_abs > len_pres: + self.logger.info('Number of absence locations is reduced') + df_abs = self.df_train[self.df_train['label'] == 0] + + df_abs = df_abs.iloc[:len_pres] + + self.df_train = pd.concat( + [self.df_train[self.df_train['label'] == 1], df_abs], + axis=0) + elif len_abs < len_pres: + self.logger.error( + 'Undefined error in the number\ + of absence and presence data') + + Label(self.master, + text='Same ratio of absence to presence is ensured').grid( + row=self.row, column=1) + self.logger.info( + 'Same ratio of absence and presence locations ensured') + self.row = self.row+1 + self.master.update() + else: + if len_abs > self.properties_train['num_nonls']: + df_abs = self.df_train[self.df_train['label'] == 0] + + df_abs = df_abs.iloc[:len_abs + - (len_abs + - self.properties_train['num_nonls'])] + + self.df_train = pd.concat( + [self.df_train[self.df_train['label'] == 1], df_abs], + axis=0) + + Label(self.master, + text='Number of absence locations is fixed').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info( + 'Number of absence locations is set to the desired number') + + + + + def cluster_landslide_locations(self, re=False, num=None): + + """ + Cluster the landslide locations. + If clusters are too large this functions reclusters + these clusters into smaller ones. 
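+
+        The clustering itself is a plain KMeans on the coordinate pairs;
+        a sketch of the call used below:
+
+            from sklearn.cluster import KMeans
+            kmeans = KMeans(init="random", n_clusters=self.num_clusters,
+                            n_init=10, max_iter=300, random_state=42)
+            kmeans.fit(np.column_stack((self.df_train['xcoord'],
+                                        self.df_train['ycoord'])))
+            self.df_train['cluster'] = kmeans.labels_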
+ + Input: + re: boolean, True if reclustering + num: list, numbers of clusters to be reclustered + + Output: + re_cluster_name: list, clusters for every entry + """ + + if re: + self.logger.info('Landslide location reclustering started') + re_cluster_name = [] + count, len_org = 0, len(self.bb) + + # Number of clusters to split too large cluster into + num_cluster = 4 + for i in num: + df_sub = self.df_train[self.df_train.cluster == i].copy() + # Clustering + kmeans = KMeans(init="random", + n_clusters=num_cluster, + n_init=10, + max_iter=300, + random_state=42) + kmeans.fit(np.column_stack((list(df_sub['xcoord']), + list(df_sub['ycoord'])))) + tmp = kmeans.labels_[:] + + # Rename clusters to fit into naming convention + for c, j in enumerate(tmp): + if j == 0: + tmp[c] = i + else: + tmp[c] = len_org + count*(num_cluster-1) + (j-1) + + df_sub['cluster'] = tmp + self.df_train = pd.concat( + [self.df_train[self.df_train.cluster != i], df_sub], + axis=0) + # Store cluster numbers to be returned + re_cluster_name.append(set(tmp)) + count = count + 1 + + return re_cluster_name + + else: + self.logger.info('Landslide location clustering started') + # Clustering + kmeans = KMeans(init="random", + n_clusters=self.num_clusters, + n_init=10, + max_iter=300, + random_state=42) + kmeans.fit(np.column_stack((list(self.df_train['xcoord']), + list(self.df_train['ycoord'])))) + self.df_train['cluster'] = kmeans.labels_[:] + + Label(self.master, text="Landslide locations were clustered").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Landslide locations clustered') + + def determine_bb_for_clustering(self, re=False, re_num=None): + + """ + Determine bounding box for the individual clusters. + + Input: + re: boolean, True if reclustering + num: list, numbers of clusters to be reclustered + """ + + if self.cluster: + # Initial clustering + self.logger.info('Bounding box definition for clusters started') + if not re: + self.bb = [] + if self.from_scratch: + to_enter = range(self.num_clusters) + else: + # When adding a feature + if 'cluster' in self.df_train: + to_enter = list(set(self.df_train.cluster.to_list())) + else: + to_enter = range(self.num_clusters) + for num in to_enter: + df_tmp = self.df_train[ + self.df_train['cluster'] == num].copy() + df_tmp = df_tmp.reset_index(drop=True) + max_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmax()] + min_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmin()] + max_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmax()] + min_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmin()] + + self.bb.append([max_y, min_y, min_x, max_x]) + # Reclustering + else: + bb_new = [0 for i in range( + len(set(self.df_train.cluster.to_list())))] + bb_new[:len(self.bb)] = self.bb + + for num in re_num: + + df_tmp = self.df_train[ + self.df_train['cluster'] == num].copy() + df_tmp = df_tmp.reset_index(drop=True) + max_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmax()] + min_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmin()] + max_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmax()] + min_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmin()] + + # Make sure that the order is + # preserved to match bounding boxes properly + if num >= self.num_clusters: + bb_new[num] = [max_y, min_y, min_x, max_x] + else: + bb_new[num] = [max_y, min_y, min_x, max_x] + + self.bb = bb_new.copy() + + else: + + self.logger.info('Bounding box definition started') + + max_x = self.df_train['xcoord'].loc[ + self.df_train['xcoord'].idxmax()] + min_y = 
self.df_train['ycoord'].loc[
+                self.df_train['ycoord'].idxmin()]
+            max_y = self.df_train['ycoord'].loc[
+                self.df_train['ycoord'].idxmax()]
+            min_x = self.df_train['xcoord'].loc[
+                self.df_train['xcoord'].idxmin()]
+
+            self.bb = [max_y, min_y, min_x, max_x]
+            print(self.bb)
+
+        Label(self.master, text="Bounding boxes were determined").grid(
+            row=self.row, column=1)
+        self.row = self.row + 1
+        self.master.update()
+
+        self.logger.info('Bounding boxes defined')
+
+    def determine_if_reclustering(self):
+
+        """
+        Determine if the extent of one or several
+        clusters is too large for local interpolation.
+
+        Output:
+            num_bb: list, names of clusters that need reclustering
+        """
+
+        self.reclustering = False
+        num_bb = []
+
+        self.logger.info('Determine if reclustering is necessary')
+
+        # Check the extent of the individual clusters
+        for count, bb in enumerate(self.bb):
+            point1_x = LatLon(Latitude(bb[0]), Longitude(bb[2]))
+            point2_x = LatLon(Latitude(bb[0]), Longitude(bb[3]))
+            distance_x = point1_x.distance(point2_x)*1000
+
+            point1_y = LatLon(Latitude(bb[0]), Longitude(bb[2]))
+            point2_y = LatLon(Latitude(bb[1]), Longitude(bb[2]))
+            distance_y = point1_y.distance(point2_y)*1000
+
+            num_px_x = int(np.round(
+                (distance_x/self.properties_settings['resolution'])))
+            num_px_y = int(np.round(
+                (distance_y/self.properties_settings['resolution'])))
+
+            # A cluster is reclustered if it spans more than 10000 pixels
+            # in either direction at the target resolution
+            if num_px_x > 10000 or num_px_y > 10000:
+                num_bb.append(count)
+                self.reclustering = True
+                self.logger.info('Reclustering necessary')
+            else:
+                self.logger.info('Reclustering not necessary')
+
+        return num_bb
+
+    def open_secondary_window(self, kind, maximum):
+
+        # Create secondary (or popup) window.
+        self.secondary_window = tk.Toplevel()
+        self.secondary_window.title("Progress")
+        self.secondary_window.config(width=300, height=200)
+        if kind == 'string':
+            self.progress_var = tk.StringVar()
+        else:
+            self.progress_var = tk.IntVar()
+        self.label = Label(self.secondary_window, text="")
+        self.label.grid(row=1, column=0)
+        progressbar = ttk.Progressbar(self.secondary_window,
+                                      variable=self.progress_var,
+                                      maximum=maximum)
+        progressbar.grid(row=0, column=0)
+
+        # Create a button to close (destroy) this window.
+        button_close = ttk.Button(
+            self.secondary_window,
+            text="Close window",
+            command=self.secondary_window.destroy
+        )
+        button_close.grid(row=2, column=0)
+
+        self.secondary_window.focus()
+        self.secondary_window.grab_set()
+
+    def main_cluster(self):
+
+        """
+        Main function to generate the training
+        dataset if training locations shall be clustered.
+        """
+
+        def extract_data_from_dataset_subsets(num):
+
+            """
+            Extract information from the interpolated geospatial dataset.
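+
+            The value lookup is a nearest-neighbour match on the
+            interpolation vectors (sketch of the loop below):
+
+                x_indx = int(np.abs(xvector - row['xcoord']).argmin())
+                y_indx = int(np.abs(yvector - row['ycoord']).argmin())
+                value = arr[y_indx, x_indx]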
+ + Input: + num: int, number of cluster + + Output: + df_clust: dataframe, subset of the training dataset + supplemented with the information from the training dataset + """ + + df_clust = self.df_train[self.df_train.cluster == num].copy() + + # Access interpolated subset of the dataset + arr = self.ds['Result' + str(num)][:, :].data + + # Access interpolation x and y vector + xvector = self.ds['Longitude' + str(num)][:].data + yvector = self.ds['Latitude' + str(num)][:].data + + # Extract information at training location + feature = [] + for index, row in df_clust.iterrows(): + + x_indx = int((np.abs(np.array(xvector) + - row['xcoord'])).argmin()) + y_indx = int((np.abs(np.array(yvector) + - row['ycoord'])).argmin()) + + feature.append(arr[y_indx, x_indx]) + + df_clust[dataset] = feature + + return df_clust + + self.logger.info('Training dataset is generated\ + using clustering and interpolation') + + Label(self.master, + text='Training dataset is generated using\ + clustering and interpolation').grid(row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + # If previous run failed dataset might already exist causing + # current run to crash again therefore it is deleted if it exists + if os.path.isfile('tmp.nc'): + os.remove('tmp.nc') + + if self.from_scratch: + self.logger.info('Training dataset is generated from scratch') + + if not isinstance(self.df_train, pd.DataFrame): + self.import_landslide_database() # Import landslide database + self.cluster_landslide_locations() + + if not self.from_scratch and 'cluster' not in self.df_train: + # Cluster the landslide locations + self.cluster_landslide_locations() + # Determine the bounding boxes for the individual clusters + self.determine_bb_for_clustering() + + if self.from_scratch: + # Determine if bounding boxes are too large for local interpolation + num_bb = self.determine_if_reclustering() + + if self.reclustering: + Label(self.master, text="Reclustering necessary").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + re_cluster_name = self.cluster_landslide_locations(True, + num_bb) + re_cluster_name = [item for sublist in re_cluster_name + for item in sublist] + # Determine bounding boxes for new clusters + self.determine_bb_for_clustering(True, re_cluster_name) + + dataset_counter = 0 + # Iterate over the dataset to inclue in the training dataset + k = 0 + self.open_secondary_window(kind='string', + maximum=len(self.data_to_handle)) + self.logger.info('Iteration over datasets started') + for dataset in tqdm(self.data_to_handle): + + self.logger.info('Dataset: ' + dataset) + + self.progress_var.set(k) + self.label.config(text=dataset) + self.master.update() + k += 1 + + index = self.datasets_summary['keys'].tolist().index(dataset) + # Call cut_and_interpolate class to cut and + # interpolate the current dataset + self.logger.info('Cutting and interpolation started') + s = cut_and_interpolate(key=dataset, + path=self.datasets_summary[ + 'path'].tolist()[index], + no_data_value=self.datasets_summary[ + 'no_value'].tolist()[index], + categorical=self.datasets_summary[ + 'categorical'].tolist()[index], + several=False, + several_same=False, + first=False, + bb=self.bb, + cluster=self.cluster, + prop_settings=self.properties_settings, + prop=self.properties_train, + path_properties='properties.pkl') + self.logger.info('Cutting and interpolation completed') + # Open the netcdf file which contains the interpolated subsets of + # the dataset with the extent of the bounding boxes of the 
clusters + self.ds = nc.Dataset('tmp.nc') + + # Determine if one or more bounding boxes is + # outside of the extend of the dataset + if not s.cuttable: + self.logger.error( + 'Bounding box larger than dataset!\ + Please adapt bounding box!') + break + + df = [] + # Iterate over the clusters extract information from the dataset + self.logger.info('Extraction of gridded information started') + for num in range(len(self.bb)): + df.append(extract_data_from_dataset_subsets(num)) + self.logger.info('Extraction of gridded information completed') + + # Concatenate all subsets of the training dataset + df = np.concatenate(df, axis=0) + + # For first dataset set up final training dataset, + # for later datasets append information + if dataset_counter == 0: + df_ges = pd.DataFrame(df, columns=list( + self.df_train.columns)+[dataset]) + else: + df_ges[dataset] = df[:, -1] + + dataset_counter = dataset_counter + 1 + self.ds.close() + + self.progress_var.set(k) + self.master.update() + + self.label.config(text='All features have been successfully created') + Label(self.master, text='All datasets have been handled').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('All features have been successfully created') + + # Remove temporary netcdf file that + # contains interpolated subsets of the dataset + self.df_train = df_ges.copy() + self.df_train.reset_index(inplace=True, drop=True) + + self.clean_df() # Clean the DataFrame from no value rows + if not self.from_scratch and not self.delete: + self.handle_categorical_features(var=self.data_to_handle) + else: + self.handle_categorical_features() + + if self.from_scratch: + # Ensure that the 1:1 ratio of the training dataset is perserved + self.ensure_same_ratio() + + self.save_training_data() # Save the training dataset as csv file + + def main(self): + + self.logger.info('Training dataset is generated using interpolation') + + Label(self.master, + text="Training dataset is generated using interpolation").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.import_landslide_database() + self.determine_bb_for_clustering() + + self.logger.warning('Warning! Depending on the size of the area of\ + interest and the set resolution this might be\ + computationally expensive. Consider clustering.') + + Label(self.master, + text='Warning! Depending on the size of the area of\ + interest and the set resolution this might be\ + computationally expensive. 
Consider clustering.').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Generation if data cube started') + self.s = generate_data_matrix(from_scratch=True, + delete=False, + dataset='training', + bb=self.bb, + data_to_handle=self.data_to_handle, + geo_overview=self.overview, + settings_train_pred=self.properties_train, + settings=self.properties_settings) + Label(self.master, text="Data matrix was generated").grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + self.logger.info('Data cube generated') + + self.import_cube() + # Extract geospatial information at ls and non-ls locations + self.extract_gridded_info() + + # Concat final training dataset + self.df_train = pd.concat([self.df_train, self.df_features], axis=1) + + self.clean_df() # Clean the DataFrame from no value rows + self.handle_categorical_features() + self.ensure_same_ratio() + self.save_training_data() # Save the training dataset as csv file + + def import_raw_dataset(self, path, no_data): + + self.logger.info('Importing raw dataset') + if path.split('.')[-1] == 'tif': + data, x, y, _ = import_tif(path) + elif path.split('.')[-1] == 'nc': + data, x, y, _ = import_nc(path) + else: + self.logger.warning( + 'Not the right data format! Please provide tif or nc file') + + if y[0] < y[-1]: + y = np.flip(y) + + if no_data != 'None': + no_data = no_data.split(',') + no_data = [float(val) for val in no_data] + else: + no_data = no_data + if no_data != 'None': + for val in no_data: + data[data == val] = self.properties_settings['no_value'] + + self.logger.info('Raw dataset imported') + + return data, x, y + + def main_no_interpolation(self): + + Label(self.master, + text="Training dataset generation\ + is done without interpolation").grid(row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + self.logger.info('Training dataset generated without interpolation') + + def extract_gridded_info(row): + + x_indx = int((np.abs(np.array(x) - row['xcoord'])).argmin()) + y_indx = int((np.abs(np.array(y) - row['ycoord'])).argmin()) + + return data[y_indx, x_indx] + + if self.from_scratch: + self.logger.info('Training dataset is generated from scratch') + self.import_landslide_database() + + k = 0 + self.open_secondary_window(kind='string', + maximum=len(self.data_to_handle)) + self.logger.info('Iterate over datasets:') + for dataset in tqdm(self.data_to_handle): + self.logger.info('Dataset: ' + dataset) + self.progress_var.set(k) + self.label.config(text=dataset) + k += 1 + + index = self.datasets_summary['keys'].tolist().index(dataset) + data, x, y = self.import_raw_dataset( + self.datasets_summary['path'].tolist()[index], + self.datasets_summary['no_value'].tolist()[index]) + feat = [] + self.logger.info('Extraction of gridded information started') + for index, row in self.df_train.iterrows(): + feat.append(extract_gridded_info(row)) + self.logger.info('Extraction of gridded information completed') + + self.df_train[dataset] = feat + self.master.update() + self.progress_var.set(k) + self.master.update() + + self.label.config(text='All features have been successfully created') + Label(self.master, text='All datasets have been handled').grid( + row=self.row, column=1) + self.row = self.row + 1 + self.master.update() + + + self.clean_df() # Clean the DataFrame from no value rows + if not self.from_scratch and not self.delete: + self.handle_categorical_features(var=self.data_to_handle) + else: + self.handle_categorical_features() + 
self.ensure_same_ratio()
+        self.save_training_data()
+
+    def handle_categorical_features(self, var=None):
+
+        """
+        Calls the function that performs one-hot or ordinal encoding.
+        """
+
+        basic = ['ID', 'xcoord', 'ycoord', 'label']
+        self.df_train = handle_categorical_values(self.df_train,
+                                                  self.datasets_summary,
+                                                  self.properties_train['ohe'],
+                                                  basic,
+                                                  var)
+
+    def save_training_data(self):
+
+        """
+        Save the dataframe as csv. If necessary, the output folder is
+        created.
+        """
+
+        self.logger.info('Saving of training data in progress')
+
+        outfile = self.properties_train['train_path']
+
+        # Create the output folder if it does not exist yet
+        folder = os.path.dirname(outfile)
+        if folder and not os.path.exists(folder):
+            os.makedirs(folder)
+
+        # If outfile exists already, delete
+        if os.path.exists(outfile):
+            os.remove(outfile)
+
+        # Save dataframe as csv
+        self.df_train.to_csv(outfile, sep=',', index=False)
+
+        self.logger.info('Training dataset saved')
+
+        Label(self.master, text='Training dataset has been saved').grid(
+            row=self.row, column=1)
+        self.row = self.row + 1
+        self.master.update()
+
+        button_close = ttk.Button(
+            self.master,
+            text="Close window",
+            command=self.master.destroy
+        )
+        button_close.grid(row=self.row, column=0)
+        self.master.update()
diff --git a/src/gui_version/data_preprocessing_gui.py b/src/gui_version/data_preprocessing_gui.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7eb8e334088120e407de5eae1ce96da0cff6155
--- /dev/null
+++ b/src/gui_version/data_preprocessing_gui.py
@@ -0,0 +1,386 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import os
+import netCDF4 as nc
+
+from tqdm import tqdm
+
+from utilities.ncfile_generation import generate_3dncfile
+from utilities.cut_and_interpolate_gui import cut_and_interpolate
+from utilities.strings_for_ncfile import char_to_string, features_to_char
+
+class generate_data_matrix:
+
+    """
+    This class generates a nc-file containing all datasets,
+    a list of all contained features and their respective longitude and
+    latitude vectors. Provided are interpolated excerpts of the datasets
+    determined by the provided bounding box.
+
+    Input:
+        from_scratch: boolean, True if nc-file should be generated from
+                      scratch, otherwise false
+        delete: True if dataset/feature should be deleted from nc-file
+                False if dataset should be added to existing nc-file
+                (careful: from_scratch needs to be False!)
+ bb: list, bounding box in the format + [<y_max>, <y_min>, <x_min>, <x_max>] + data_to_handle: list of features that should be added/deleted + datasets need to be listed in list_of_raw_datasets + keys_already_included: list of already included features in the + training/prediction dataset + (from_scratch=False, delete=False) + + Output: + netCDF4 file + + """ + + def __init__(self, + from_scratch=True, + delete=False, + dataset='undefined', + bb=None, + data_to_handle=None, + geo_overview=None, + settings=None, + settings_train_pred=None, + keys_already_included=None): + self.from_scratch = from_scratch + self.dataset = dataset + self.bb = bb + self.keys_already_included = keys_already_included + self.settings_train_pred = settings_train_pred + self.keys = geo_overview['keys'].tolist() + self.raw_datasets_path = geo_overview['path'].tolist() + self.data_no_value = geo_overview['no_value'].tolist() + self.geo_overview = geo_overview + + self.category = [] + for x in geo_overview['categorical'].tolist(): + if x == 0: + self.category.append(False) + else: + self.category.append(True) + self.settings = settings + self.data_to_handle = data_to_handle + if not from_scratch: + self.delete = delete + self.import_cube() + + if delete: + # Delete dataset from cube + self.delete_dataset() + + else: + # Add dataset to cube + self.add_dataset() + else: + # Generate cube from scratch + self.main() + + def find_dataset(self): + + """ + Find the index of the features to handle in the list of features + contained in the nc-file. + Return: + idx: list of indices + """ + + return self.features.index(self.data_to_handle) + + def add_dataset(self): + + # Number of overlapping features between datasets in the cube + # and the datasets to add/delete + print('features') + print(self.features) + + if self.dataset == 'prediction': + for_prediction = True + else: + for_prediction = False + + # Define new cube in the size of existing cube of cut and interpolated + # datasets with the depth equaling the number of existing datasets plus + # the ones to add + ges = list(self.features) + [x for x in self.data_to_handle if x not in self.features] + cube = np.zeros((np.shape(self.cube)[0], + np.shape(self.cube)[1], + len(ges))) + + for feat in self.features: + cube[:, :, ges.index(feat)] = self.cube[:, :, self.features.index(feat)] + + for key in self.data_to_handle: + s = cut_and_interpolate( + key=key, + path=self.raw_datasets_path[self.keys.index(key)], + no_data_value=self.data_no_value[self.keys.index(key)], + categorical=list(self.geo_overview['categorical'])[self.keys.index(key)], + several=True, + several_same=False, + first=False, + #bb=self.bb, + for_prediction=for_prediction, + prop_settings=self.settings, + prop=self.settings_train_pred, + path_properties=self.folder + + '/data_combined_' + + self.dataset + + '_' + + str(self.settings['resolution']) + '.pkl') + array, _, _, cuttable = s.array, s.x, s.y, s.cuttable + + if not cuttable: + print('Error! 
Bounding box larger than dataset!\ + Please adapt bounding_box!') + break + else: + # Store it at respective position in cube + # Add cut and interpolated dataset to cube + cube[:, :, ges.index(key)] = array + + # Save the updated cube to nc file + self.determine_outfile() + self.char_features = features_to_char(ges) + generate_3dncfile(self.outfile, + self.x, + self.y, + cube, + len(ges), + self.char_features, + crs='wgs84', + data_unit=None, + missing_value=self.settings['no_value']) + + def delete_dataset(self): + + """ + Delte datasets from data_to_handle + from nc-file and save new nc-file + """ + + # Determine indices of the datasets that shall be removed + idx = [] + for data in self.data_to_handle: + idx.append(self.find_dataset) + + # Define new cube in the size of existing cube of + # cut and interpolated datasets + cube = np.zeros((np.shape(self.cube)[0], + np.shape(self.cube)[1], + np.shape(self.cube)[2]-len(self.data_to_handle))) + count = 0 + + # Go through the datasets and transfer + # all datasets except for them to be removed + for i in range(np.shape(self.cube)[2]): + if self.features[i] not in self.data_to_handle: + cube[:, :, count] = self.cube[:, :, i] + count = count + 1 + + # Update the feature list + for data in self.data_to_handle: + self.features.remove(data) + + # Save new data cube + self.determine_outfile() + self.char_features = features_to_char(self.features) + generate_3dncfile(self.outfile, + self.x, + self.y, + cube, + len(self.features), + self.char_features, + crs='wgs84', + data_unit=None, + missing_value=self.settings['no_value']) + + def import_cube(self): + + """ + Existing nc-file is imported for adding/deleting another feature. + """ + + self.determine_outfile() # Determine where cube is stored + + # Import cube + self.ds = nc.Dataset(self.outfile) + self.cube = self.ds['Result'][:, :, :].data + self.x = self.ds['Longitude'][:].data + self.y = self.ds['Latitude'][:].data + self.features = self.ds['features'][:].data + + self.features = char_to_string(self.features) + + def determine_outfile(self): + + """ + Determine whether folder to store the nc-file already exists. + If not, it is created. Outfile path is determined. 
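+
+        Illustrative file names (the resolution is taken from the general
+        settings; 25 is only an example value):
+
+            <dirname of train_path>/data_combined_training_25.nc
+            <dirname of pred_path>/data_combined_prediction_25.nc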
+ """ + + # Cube is stored in the same folder + # as the final training/prediction dataset + if self.dataset == 'training': + self.folder = os.path.dirname(self.settings_train_pred['train_path']) + self.outfile = self.folder + '/data_combined_training_' + str(self.settings['resolution']) + '.nc' + elif self.dataset == 'prediction': + self.folder = os.path.dirname(self.settings_train_pred['pred_path']) + self.outfile = self.folder + '/data_combined_prediction_' + str(self.settings['resolution']) + '.nc' + + # Create folder if it doesn't yet exist + isExist = os.path.exists(self.folder) + if not isExist: + os.makedirs(self.folder) + + def check_existence_datasets(self): + + """ + Check if dataset exists + """ + + # Check existance of all datasets that shall be pre-processed + for i in range(len(self.raw_datasets_path)): + self.all_exist = os.path.isfile(str(self.raw_datasets_path[i])) + + if not self.all_exist: + print('Path ' + + str(self.raw_datasets_path[i]) + + ' does not exist!') + break + + def main(self): + + """ + Routine to pre-process the datasets from scratch + """ + + #self.check_existence_datasets() # Check the existance of all datasets + #if self.all_exist: # If all datasets exist + # Go through all datasets that shall be pre-processed + for i in tqdm(range(len(self.data_to_handle))): + j = self.keys.index(self.data_to_handle[i]) + print(self.data_to_handle[i]) + + if i == 0: + if self.dataset == 'prediction': + # Cut and interpolate dataset to desired resolution. + # Check script for information on input parameters. + s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.raw_datasets_path[j], + no_data_value=self.data_no_value[j], + categorical=self.category[j], + several=True, + several_same=False, + first=True, + bb=self.bb, + for_prediction=True, + prop_settings=self.settings, + prop=self.settings_train_pred, + path_properties=self.settings[ + 'pred_path'].rsplit('/', 1)[0] + + '/data_combined_prediction_' + + str(self.settings['resolution']) + + '.pkl') + else: + # Cut and interpolate dataset to desired resolution. + # Check script for information on input parameters. + s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.raw_datasets_path[j], + no_data_value=self.data_no_value[j], + categorical=self.category[j], + several=True, + several_same=False, + first=True, + bb=self.bb, + prop_settings=self.settings, + prop=self.settings_train_pred, + path_properties=self.settings[ + 'train_path'].rsplit('/', 1)[0] + + '/data_combined_training_' + + str(self.settings['resolution']) + + '.pkl') + array = s.array + self.x = s.x + self.y = s.y + cuttable = s.cuttable + + if not cuttable: + print('Error! Bounding box larger than dataset!\ + Please adapt bounding_box!') + break + # Store cut and interpolated dataset in array + cube = np.zeros((np.shape(array)[0], + np.shape(array)[1], + len(self.data_to_handle))) + cube[:, :, 0] = array + + else: + if self.dataset == 'prediction': + s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.raw_datasets_path[j], + no_data_value=self.data_no_value[j], + categorical=self.category[j], + several=True, + several_same=False, + first=False, + bb=self.bb, + for_prediction=True, + prop_settings=self.settings, + prop=self.settings_train_pred, + path_properties=self.settings[ + 'pred_path'].rsplit('/', 1)[0] + + '/data_combined_prediction_' + + str(self.settings['resolution']) + + '.pkl') + else: + # Cut and interpolate dataset to desired resolution. + # Check script for information on input parameters. 
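+                    # first=False: the bounding box and interpolation
+                    # vectors stored in the pickle file (path_properties)
+                    # by the first dataset are reused here instead of
+                    # being recomputed for every dataset.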
+ s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.raw_datasets_path[j], + no_data_value=self.data_no_value[j], + categorical=self.category[j], + several=True, + several_same=False, + first=False, + bb=self.bb, + prop_settings=self.settings, + prop=self.settings_train_pred, + path_properties=self.settings[ + 'train_path'].rsplit('/', 1)[0] + + '/data_combined_training_' + + str(self.settings['resolution']) + + '.pkl') + array, cuttable = s.array, s.cuttable + + if not cuttable: + print('Error! Bounding box larger than dataset!\ + Please adapt bounding_box!') + print(self.data_to_handle[i]) + break + # Store cut and interpolated dataset in array + cube[:, :, i] = array + + # Store the array in a nc-file and meta data in pickle file + if cuttable: + self.determine_outfile() + self.char_features = features_to_char(self.data_to_handle) + + generate_3dncfile(self.outfile, + self.x, + self.y, + cube, + len(self.data_to_handle), + self.char_features, + crs='wgs84', + data_unit=None, + missing_value=self.settings['no_value']) diff --git a/src/gui_version/shire.py b/src/gui_version/shire.py new file mode 100644 index 0000000000000000000000000000000000000000..35cc318de3bd381162b7797e86032139c295b05d --- /dev/null +++ b/src/gui_version/shire.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import pickle +import os +import tkinter as tk + +from create_training_data_gui import * +from create_prediction_data_gui import * +from RandomForest_gui import * + +from check_user_input import check_general_settings +from utilities.initialise_log import save_log +from utilities.gui import * + +""" + This script controls the hazard mapping framework SHIRE. + Ensure a data summary csv file + and a csv file containing the keys to include have been prepared. + For more information please refer to the user manual. +""" + +if os.path.isfile('tmp_settings.pkl'): + os.remove('tmp_settings.pkl') + +if os.path.isfile('tmp_train.pkl'): + os.remove('tmp_train.pkl') + +if os.path.isfile('tmp_pred.pkl'): + os.remove('tmp_pred.pkl') + +if os.path.isfile('tmp_map.pkl'): + os.remove('tmp_map.pkl') + +#Get the general settings +master = tk.Tk() +general_settings(master) +master.mainloop() + + +s = check_general_settings() + +if os.path.exists('shire_run.log'): + os.remove('shire_run.log') +logger = save_log('shire_run.log') +logger.info('SHIRE has successfully been launched') +logger.info('User input required') +logger.info('General settings defined') + +if s.error: + logger.info('There is an error in the user input. 
For more infos check the check_user_input.log') +else: + if os.path.isfile('tmp_settings.pkl'): + with open('tmp_settings.pkl', 'rb') as handle: + properties_settings = pickle.load(handle) + + master = tk.Tk() + if properties_settings['train'] == 1: + logger.info('Training dataset generation started') + s = create_training_data(master=master, log=logger) + os.remove('tmp_train.pkl') + logger = s.logger + if properties_settings['pred'] != 1 and properties_settings['map'] != 1: + for handler in logger.handlers: + handler.close() + logger.removeHandler(handler) + master.destroy() + + master = tk.Tk() + if properties_settings['pred'] == 1: + logger.info('Prediction dataset generation started') + s = create_prediction_data(master=master, log=logger) + os.remove('tmp_pred.pkl') + logger = s.logger + if properties_settings['pred'] != 1 and properties_settings['map'] != 1: + for handler in logger.handlers: + handler.close() + logger.removeHandler(handler) + master.destroy() + + master = tk.Tk() + if properties_settings['map'] == 1: + logger.info('Map generation started') + with open('tmp_map.pkl', 'rb') as handle: + properties_map = pickle.load(handle) + + if properties_map['training'] == 1 and properties_map['prediction'] == 1: + for mode in ['train_test', 'prediction']: + if mode == 'train_test': + s = RandomForest(master, mode, log=logger) + else: + if properties_map['parallel'] == 1: + s = RandomForest(master, mode, parallel=True, log=logger) + else: + s = RandomForest(master, mode, log=logger) + elif properties_map['training'] == 1 and properties_map['prediction'] == 0: + s = RandomForest(master, 'train_test', log=logger) + elif properties_map['prediction'] == 1 and properties_map['training'] == 0: + if properties_map['parallel'] == 1: + s = RandomForest(master, 'prediction', parallel=True, log=logger) + else: + s = RandomForest(master, 'prediction', log=logger) + os.remove('tmp_map.pkl') + logger = s.logger + for handler in logger.handlers: + handler.close() + logger.removeHandler(handler) diff --git a/src/gui_version/utilities/cut_and_interpolate_gui.py b/src/gui_version/utilities/cut_and_interpolate_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..608750ef11c61c1c2264d9c0bdeb63aeac001ce8 --- /dev/null +++ b/src/gui_version/utilities/cut_and_interpolate_gui.py @@ -0,0 +1,1000 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pickle +import os +import netCDF4 as nc + +from LatLon23 import LatLon, Latitude, Longitude +from scipy.interpolate import interp2d, interp1d + +from utilities.import_raw_dataset import import_raw_dataset +from utilities.import_format import import_tif, import_nc +from utilities.ncfile_generation import generate_basic_ncfile + +class cut_and_interpolate: + + """ + This class imports a dataset, cuts it to the desired extent and + interpolates it to the desired resolution. + The settings are taken from the user input through the gui. + """ + + def __init__(self, key=None, path=None, no_data_value=None, + categorical=None, several=None, several_same=None, + first=None, bb=None, cluster=False, for_prediction=False, + prop_settings=None, prop=None, path_properties=None): + + """ + Input: + key: key belonging to the dataset (data_summary.csv) + path: path where the dataset is stored (data_summary.csv) + no_data_value: value representing no data (data_summary.csv) + categorical: boolean if dataset contains categorical + information (data_summary.csv) + several: boolean if class is called several times + e.g. 
in a for loop over several datasets + several_same: boolean if all datasets have the same + spatial extent and resolution + first: boolean, + only important if several or several_same is True + bb: list of bounding boxes, format list ymax, ymin, xmin, xmax + cluster: boolean, determines whether several sections of the + dataset are interpolated in a loop + for_pediction: boolean, + results used for creating the prediction dataset + prop_settings: dictionary containing general settings + prop: dictionary containing necessary + information for cutting and interpolation + path_properties: path to separate file storing information + on applied cutting extent and + interpolation vectors + """ + + # Import cutting and interpolation information if this is not the + # first dataset of several to be cut and interpolated + if several and not first: + with open(path_properties, 'rb') as handle: + self.properties = pickle.load(handle) + self.path = path + + self.key = key + self.cluster = cluster + self.for_prediction = for_prediction + self.prop = prop + self.prop_settings = prop_settings + + # Define bounding box + if cluster: + self.bb_ges = bb + self.to_cluster = True + elif self.for_prediction: + self.to_cluster = False + self.bb = [self.prop['north'], self.prop['south'], + self.prop['west'], self.prop['east']] + if several and not first: + self.bb = [self.properties['interp_vectors']['y'][0], + self.properties['interp_vectors']['y'][-1], + self.properties['interp_vectors']['x'][0], + self.properties['interp_vectors']['x'][-1]] + else: + self.to_cluster = False + if several and not first: + self.bb = [self.properties['interp_vectors']['y'][0], + self.properties['interp_vectors']['y'][-1], + self.properties['interp_vectors']['x'][0], + self.properties['interp_vectors']['x'][-1]] + else: + self.bb = bb + + self.path_properties = path_properties + + if no_data_value != 'None': + self.no_data = no_data_value.split(',') + self.no_data = [float(val) for val in self.no_data] + else: + self.no_data = no_data_value + + self.categorical = categorical + self.several = several + self.several_same = several_same + self.first = first + + # Define limits to determine interpolation approach for dataset + self.limit_org = 50000000 + self.limit_interp = 7500000 + self.size = 200 + self.overlap = 100 + + self.data, self.x_org, self.y_org = import_raw_dataset(self.path, self.no_data, prop_settings['no_value']) # Import raw datasets + + # If training locations are clustered + if self.to_cluster: + + self.x_raw = self.x_org + self.y_raw = self.y_org + self.data_raw = self.data + + def parallized_interpolation(num): + + # Interpolate the cut dataset + a = self.interpolate_dataset( + self.subsets[num], + self.y_orgs[num], + self.x_orgs[num], + self.ds['Longitude' + str(num)][:].data, + self.ds['Latitude' + str(num)][:].data) + + # Save the interpolated dataset in the nc file/Update the cut + # and interpolated dataset for the 2nd and following datasets + if self.first_dataset: + result = self.ds.createVariable( + 'Result' + str(num), + 'f4', + ('lat' + str(num), 'lon' + str(num))) + + result[:, :] = a + else: + self.ds['Result' + str(num)][:, :] = a + + self.subsets = [] + self.x_orgs = [] + self.y_orgs = [] + self.cuttables = [] + + self.first_dataset = False + + # Iterate over all bounding boxes of + # the clustered training locations + for count, self.bb in enumerate(self.bb_ges): + + self.x_org = self.x_raw + self.y_org = self.y_raw + self.data = self.data_raw + + # Check that all bounding boxes are + # covered by 
the extent of the dataset + self.compare_extends() + + self.cuttables.append(self.cuttable) + # Cut the original dataset to the + # currently considered bounding box + self.cut_to_boundingbox() + # Store cut properties to be used in the interpolation + self.subsets.append(self.data) + self.x_orgs.append(self.x_org) + self.y_orgs.append(self.y_org) + + if not os.path.isfile('tmp.nc') or self.first_dataset: + + if count == 0: + # Open temporarty file to store the + # interpolated subsets of the dataset + self.ds = generate_basic_ncfile('tmp.nc') + self.first_dataset = True + + # Determine the x and y vectors for interpolation + self.determine_reference_vectors() + # Saving the interpolation vectors to the temporary file + self.ds.createDimension('lat' + str(count), len(self.y)) + self.ds.createDimension('lon' + str(count), len(self.x)) + longitude = self.ds.createVariable( + 'Longitude' + str(count), + 'f4', + 'lon' + str(count)) + latitude = self.ds.createVariable( + 'Latitude' + str(count), + 'f4', + 'lat' + str(count)) + + longitude[:] = self.x + latitude[:] = self.y + + elif (os.path.isfile('tmp.nc') + and not self.first_dataset and count == 0): + # If it's not the first dataset to be cut, open the nc file + self.ds = nc.Dataset('tmp.nc', mode='a') + + self.one_go, self.as_chunks, self.as_cols = True, False, False + + # Final decision whether cutting and interpolation is possible + if False in self.cuttables: + self.cuttable = False + else: + self.cuttable = True + + # Interpolate all subsets in parallel + #Parallel(n_jobs=5, backend='threading', timeout=999999) + #(delayed(parallized_interpolation)(num) + # for num in range(len(self.bb_ges))) + for num in range(len(self.bb_ges)): + parallized_interpolation(num) + self.ds.close() + + elif self.for_prediction: + + def test_parallel_interpolation(i): + + ref = self.interpolate_dataset( + np.array(chunks_old[i]), + np.array(np.linspace( + self.y_org[pixels_old[i][0]], + self.y_org[pixels_old[i][1]], + abs(pixels_old[i][1]-pixels_old[i][0]))), + np.array(np.linspace( + self.x_org[pixels_old[i][2]], + self.x_org[pixels_old[i][3]], + abs(pixels_old[i][3]-pixels_old[i][2]))), + self.x_final[i], + self.y_final[i]) + + return ref + self.compare_extends() + + # If bounding box is within limits of dataset + if self.cuttable: + self.cut_to_boundingbox() # Cut to the bounding box + + # Determine interpolation vectors + self.determine_reference_vectors() + # Depending on dataset size determine interpolation approach + self.determine_interpolation_approach() + + if self.one_go: + # Interpolate dataset + self.array = self.interpolate_dataset( + self.data, + self.y_org, + self.x_org, + self.x, + self.y) + + # If original dataset has to be split into chunks + elif self.as_chunks: + # Split the dataset into chunks + chunks_old, pixels_old = self.split_into_chunks() + # Determine interpolation vectors for each chunk + self.determine_new_vector() + + #ref_tmp = Parallel(n_jobs=5, + # backend='threading', + # timeout=999999) + #(delayed(test_parallel_interpolation)(num) + # for num in range(len(self.x_final))) + ref_tmp = [] + for num in range(len(self.x_final)): + ref_tmp.append(test_parallel_interpolation(num)) + # Combine the individual interpolated + # chunks into one dataset + self.array = self.reshape_chunks(ref_tmp) + + elif self.as_cols: + + self.split_into_chunks() # Split the dataset into chunks + + ref_tmp = [] + # Go through all chunks and interpolate them individually + for i in range(len(self.x_final)): + ref = 
self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x_final[i], + self.y_final[i]) + ref_tmp.append(list(ref)) + # Combine the individual interpolated + # chunks into one dataset + self.array = self.reshape_chunks(ref_tmp) + + # If a path is provided, the cutting and interpolation + # information is saved in a pickle file + if self.path_properties is not None: + with open(self.path_properties, 'wb') as handle: + pickle.dump(self.properties, handle) + + else: + # Check if bounding box is covered by limits of dataset + self.compare_extends() + + # If bounding box is within limits of dataset + if self.cuttable: + + self.cut_to_boundingbox() # Cut to the bounding box + # Determine interpolation vectors + self.determine_reference_vectors() + # Depending on dataset size determine interpolation approach + self.determine_interpolation_approach() + + # If interpolation can be done in one go + if self.one_go: + + # Interpolate dataset + self.array = self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x, + self.y) + + # If original dataset has to be split into chunks + elif self.as_chunks: + # Split the dataset into chunks + chunks_old, pixels_old = self.split_into_chunks() + # Determine interpolation vectors for each chunk + self.determine_new_vector() + + ref_tmp = [] + # Go through all chunks and interpolate them individually + for i in range(len(chunks_old)): + ref = self.interpolate_dataset( + np.array(chunks_old[i]), + np.array(np.linspace( + self.y_org[pixels_old[i][0]], + self.y_org[pixels_old[i][1]], + abs(pixels_old[i][1]-pixels_old[i][0]))), + np.array(np.linspace( + self.x_org[pixels_old[i][2]], + self.x_org[pixels_old[i][3]], + abs(pixels_old[i][3]-pixels_old[i][2]))), + self.x_final[i], + self.y_final[i]) + ref_tmp.append(list(ref)) + # Combine the individual interpolated + # chunks into one dataset + self.array = self.reshape_chunks(ref_tmp) + + elif self.as_cols: + + self.split_into_chunks() # Split the dataset into chunks + + ref_tmp = [] + # Go through all chunks and interpolate them individually + for i in range(len(self.x_final)): + ref = self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x_final[i], + self.y_final[i]) + ref_tmp.append(list(ref)) + # Combine the individual interpolated + # chunks into one dataset + self.array = self.reshape_chunks(ref_tmp) + + # If a path is provided, the cutting and interpolation + # information is saved in a pickle file + if self.path_properties is not None: + with open(self.path_properties, 'wb') as handle: + pickle.dump(self.properties, handle) + + def compare_extends(self): + + """ + Determine if the bounding box to which the dataset shall be cut is + completely covered by the dataset. + If not, the execution of the script will be aborted. 
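+
+        The check shifts all coordinates into positive ranges before
+        comparing them, roughly (sketch):
+
+            y_shifted = 90 + lat    # 90 - abs(lat) for negative latitudes
+            x_shifted = 180 + lon   # 180 - abs(lon) for negative longitudes
+
+        so that extents crossing the equator or the prime meridian are
+        compared consistently.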
+ """ + + self.cuttable = True + self.left_too_short = False + self.right_too_short = False + self.bottom_too_short = False + self.top_too_short = False + y, x = [], [] + for coord in [self.y_org[0], self.y_org[-1], self.bb[0], self.bb[1]]: + + if coord >= 0: + y.append(90 + coord) + + if coord < 0: + y.append(90 - abs(coord)) + + for coord in [self.x_org[0], self.x_org[-1], self.bb[2], self.bb[3]]: + + if coord >= 0: + x.append(180 + coord) + + if coord < 0: + x.append(180 - abs(coord)) + + if y[2] > y[0]: + self.top_too_short = True + if y[3] < y[1]: + self.bottom_too_short = True + if x[2] < x[0]: + self.left_too_short = True + if x[3] > x[1]: + self.right_too_short = True + + if (self.bottom_too_short or self.top_too_short + or self.left_too_short or self.right_too_short): + self.cuttable = False + self.array = None + self.x = None + self.y = None + + return self.cuttable + + def cut_to_boundingbox(self): + + """ + Cut the dataset to the bounding box + """ + + if self.several_same and not self.first: + + # Load the indices of the bounding box from the properties file + self.top = self.properties['boundaries']['top'] + self.bottom = self.properties['boundaries']['bottom'] + self.left = self.properties['boundaries']['left'] + self.right = self.properties['boundaries']['right'] + + else: + # If several datasets shall be interpolated after another and the + # current run is the first dataset + if (self.several and self.first) or (self.several_same and self.first): + # Open empty dictionary to store the cutting and + # interpolation information in + self.properties = {} + + # Determine if the coordinate vectors + # contain both pos and neg values + if (all(val >= 0 for val in self.x_org) + or all(val <= 0 for val in self.x_org)): + + # Determine pixel index of left and right edge of bounding box + self.left = int((np.abs(self.x_org - self.bb[2])).argmin()) + self.right = int((np.abs(self.x_org - self.bb[3])).argmin()) + + else: + + if self.bb[2] <= 0: + tmp = [x for x in self.x_org if x <= 0] + else: + tmp = [x for x in self.x_org if x >= 0] + + self.left = list(self.x_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[2])).argmin())]) + + if self.bb[3] <= 0: + tmp = [x for x in self.x_org if x <= 0] + else: + tmp = [x for x in self.x_org if x >= 0] + + self.right = list(self.x_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[3])).argmin())]) + + if (all(val >= 0 for val in self.y_org) + or all(val <= 0 for val in self.y_org)): + + # Determine pixel index of top and bottom edge of bounding box + self.top = int((np.abs(self.y_org - self.bb[0])).argmin()) + self.bottom = int((np.abs(self.y_org - self.bb[1])).argmin()) + + else: + + if self.bb[0] <= 0: + tmp = [y for y in self.y_org if y <= 0] + else: + tmp = [y for y in self.y_org if y >= 0] + + self.top = list(self.y_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[0])).argmin())]) + + if self.bb[1] <= 0: + tmp = [y for y in self.y_org if y <= 0] + else: + tmp = [y for y in self.y_org if y >= 0] + + self.bottom = list(self.y_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[1])).argmin())]) + + # Add pixel in all directions to account for rounding issues + + if not self.for_prediction: + if self.left-100 >= 0: + self.left = self.left - 100 + if self.top-100 >= 0: + self.top = self.top - 100 + if self.bottom+100 <= np.shape(self.data)[0]: + self.bottom = self.bottom + 100 + if self.right+100 <= np.shape(self.data)[1]: + self.right = self.right + 100 + + if self.several_same and self.first: + # Store the indices to be used 
again with the next dataset + self.properties['boundaries'] = {} + self.properties['boundaries']['top'] = self.top + self.properties['boundaries']['bottom'] = self.bottom + self.properties['boundaries']['left'] = self.left + self.properties['boundaries']['right'] = self.right + + # Cut the dataset and x, y vectors to the determined extent + self.data = self.data[self.top:self.bottom, self.left:self.right] + + self.x_org = self.x_org[self.left:self.right] + self.y_org = self.y_org[self.top:self.bottom] + + def determine_reference_vectors(self): + + """ + Determine interpolation vectors x and y. + """ + + # If several datasets shall be interpolated after another and the + # current run is the first dataset + if self.several and self.first: + + # Determine distance in meters in x and y + # direction between bounds of dataset + point1_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[-1])) + distance_x = point1_x.distance(point2_x)*1000 + + point1_y = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_y = LatLon(Latitude(self.y_org[-1]), + Longitude(self.x_org[0])) + distance_y = point1_y.distance(point2_y)*1000 + + # Determine interpolation vector with desired resolution + self.x = np.linspace( + self.x_org[0], + self.x_org[-1], + int(distance_x/self.prop_settings['resolution'])) + self.y = np.linspace( + self.y_org[0], + self.y_org[-1], + int(distance_y/self.prop_settings['resolution'])) + + # Store interpolation vector in properties file + self.properties['interp_vectors'] = {} + self.properties['interp_vectors']['x'] = self.x + self.properties['interp_vectors']['y'] = self.y + + # If only one dataset shall be interpolated + elif not self.several: + + # Determine distance in meters in x and y + # direction between bounds of dataset + point1_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[-1])) + distance_x = point1_x.distance(point2_x)*1000 + + point1_y = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_y = LatLon(Latitude(self.y_org[-1]), + Longitude(self.x_org[0])) + distance_y = point1_y.distance(point2_y)*1000 + + # Determine interpolation vector with desired resolution + self.x = np.linspace( + self.x_org[0], + self.x_org[-1], + int(distance_x/self.prop_settings['resolution'])) + self.y = np.linspace( + self.y_org[0], + self.y_org[-1], + int(distance_y/self.prop_settings['resolution'])) + + # If several datasets shall be interpolated after another and the + # current run is not the first dataset + elif self.several and not self.first: + + self.x = np.array(self.properties['interp_vectors']['x']) + self.y = np.array(self.properties['interp_vectors']['y']) + + def determine_new_vector(self): + + """ + Determine interpolation vectors for the chunks. 
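+
+        For every chunk the matching slice of the global interpolation
+        vectors is selected via a nearest-neighbour index search (sketch
+        for the x direction):
+
+            i0 = int(np.abs(self.x - x_ref[j][0]).argmin())
+            i1 = int(np.abs(self.x - x_ref[j][1]).argmin())
+            x_chunk = self.x[i0:i1]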
+ """ + + # For each chunk determine the original x and y vectors + x_ref = [[self.x_org[self.x_limits[i][0]], + self.x_org[self.x_limits[i][1]]] + for i in range(len(self.x_limits))] + y_ref = [[self.y_org[self.y_limits[i][0]], + self.y_org[self.y_limits[i][1]]] + for i in range(len(self.y_limits))] + + self.x_final = [] + self.y_final = [] + + for j in range(np.shape(x_ref)[0]): + ind_min_x = int((np.abs(self.x - x_ref[j][0])).argmin()) + ind_max_x = int((np.abs(self.x - x_ref[j][1])).argmin()) + + self.x_final.append(self.x[ind_min_x:ind_max_x]) + + for j in range(np.shape(y_ref)[0]): + ind_min_y = int((np.abs(self.y - y_ref[j][0])).argmin()) + ind_max_y = int((np.abs(self.y - y_ref[j][1])).argmin()) + + self.y_final.append(self.y[ind_min_y:ind_max_y]) + + def split_into_chunks(self): + + """ + Split the dataset into chunks for interpolation + """ + + # If the dataset needs to be split into chunks + if self.as_chunks: + + y_len, x_len = np.shape(self.data)[0], np.shape(self.data)[1] + + # Split in equal sized chunks and treat the bottom and right + # differently that have different shape than the equal sized chunks + + plus_y = self.data.shape[0] % self.size + plus_x = self.data.shape[1] % self.size + + # Number of equal sized chunks in x and y direction + num_y = int(self.data.shape[0] / self.size) + num_x = int(self.data.shape[1] / self.size) + + # If final columns and row too small to be called individual + # chunks, combine with second to last row and column + if plus_y < 2/3*self.size: + num_y = num_y - 1 + + if plus_x < 2/3*self.size: + num_x = num_x - 1 + + self.num_y = num_y + self.num_x = num_x + + chunks = [] # Store the chunks + pixels = [] # Store the pixel limits to acces original coordinates + count = 0 + + # Store the coord limits to acces original coordinates + self.x_limits = [] + self.y_limits = [] + + # Save the chunks in a list + count_ges = 0 + tmpy = 0 + for i in range(num_y): + tmpx = 0 + for j in range(num_x): + if ((i+1)*self.size-1+self.overlap <= self.data.shape[0]) and ((j+1)*self.size-1+self.overlap <= self.data.shape[1]): + chunks.append( + list(self.data[i*self.size:(i+1)*self.size-1+self.overlap, + j*self.size:(j+1)*self.size-1+self.overlap])) + pixels.append( + [i*self.size, (i+1)*self.size-1+self.overlap, + j*self.size, (j+1)*self.size-1+self.overlap]) + + self.x_limits.append([j*self.size, (j+1)*self.size-1+self.overlap]) + self.y_limits.append([i*self.size, (i+1)*self.size-1+self.overlap]) + + elif ((i+1)*self.size-1+self.overlap > self.data.shape[0]) and ((j+1)*self.size-1+self.overlap <= self.data.shape[1]): + chunks.append( + list(self.data[i*self.size:, + j*self.size:(j+1)*self.size-1+self.overlap])) + pixels.append( + [i*self.size, np.shape(self.data)[0]-1, + j*self.size, (j+1)*self.size-1+self.overlap]) + elif ((j+1)*self.size-1+self.overlap > self.data.shape[1]) and ((i+1)*self.size-1+self.overlap <= self.data.shape[0]): + chunks.append( + list(self.data[i*self.size:(i+1)*self.size-1+self.overlap, + j*self.size:])) + pixels.append( + [i*self.size, (i+1)*self.size-1+self.overlap, + j*self.size, np.shape(self.data)[1]-1]) + elif ((j+1)*self.size-1+self.overlap > self.data.shape[1]) and ((i+1)*self.size-1+self.overlap > self.data.shape[0]): + chunks.append( + list(self.data[i*self.size:, + j*self.size:])) + pixels.append( + [i*self.size, np.shape(self.data)[0]-1, + j*self.size, np.shape(self.data)[1]-1]) + tmpy = tmpy + 1 + + # Chunks most bottom column + tmpx = 0 + for j in range(num_x): + if ((j+1)*self.size-1+self.overlap <= 
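+                    # i.e. the right edge of this chunk, including
+                    # the overlap, still fits within the dataset width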
self.data.shape[1]): + chunks.append( + list(self.data[(num_y)*self.size:-1, + j*self.size:(j+1)*self.size-1+self.overlap])) + pixels.append( + [(num_y)*self.size, np.shape(self.data)[0]-1, + j*self.size, (j+1)*self.size-1+self.overlap]) + self.x_limits.append([j*self.size, (j+1)*self.size-1+self.overlap]) + self.y_limits.append([(num_y)*self.size, np.shape(self.data)[0]-1]) + else: + chunks.append( + list(self.data[(num_y)*self.size:-1, + j*self.size:])) + pixels.append( + [(num_y)*self.size, np.shape(self.data)[0]-1, + j*self.size, np.shape(self.data)[1]-1]) + self.x_limits.append([j*self.size, (j+1)*self.size-1]) + + # Chunks most right column + tmpy = 0 + for j in range(num_y): + if ((j+1)*self.size-1+self.overlap <= self.data.shape[0]): + chunks.append( + list(self.data[j*self.size:(j+1)*self.size-1+self.overlap, + (num_x)*self.size:-1])) + pixels.append( + [j*self.size, (j+1)*self.size-1+self.overlap, + (num_x)*self.size, x_len-1]) + self.y_limits.append([j*self.size, (j+1)*self.size-1+self.overlap]) + self.x_limits.append([(num_x)*self.size, x_len-1]) + else: + chunks.append( + list(self.data[j*self.size:-1, + (num_x)*self.size:-1])) + pixels.append( + [j*self.size, np.shape(self.data)[0]-1, + (num_x)*self.size, x_len-1]) + self.y_limits.append([j*self.size, (j+1)*self.size-1]) + + # Chunk bottom right + chunks.append( + list(self.data[num_y*self.size:-1, + num_x*self.size:-1])) + pixels.append( + [num_y*self.size, y_len-1, + num_x*self.size, x_len-1]) + + # Save corner indices for the chunks + self.x_limits.append([num_x*self.size, x_len-1]) + self.y_limits.append([num_y*self.size, y_len-1]) + + return chunks, pixels + + # If dataset is interpolated columns-wise + elif self.as_cols: + + chunks, pixels = None, None + self.x_limits = [[], [], [], [], [], [], [], []] + + # Determine columns to be interpolated in each chunk + i = 0 + while i <= len(self.x): + for j in range(len(self.x_limits)): + if i+j <= len(self.x)-1: + self.x_limits[j].append(i + j) + i = i + j + 1 + + # Determine the coordinates in the interpolation vector + self.x_final = [[], [], [], [], [], [], [], []] + self.y_final = [] + + for i in range(len(self.x_limits)): + for j in self.x_limits[i]: + self.x_final[i].append(self.x[j]) + self.y_final.append(list(self.y)) + + def determine_interpolation_approach(self): + + """ + Depending on the siz of the original dataset and the size of the + dataset after the interpolation, the computational + power might be exceeded and the dataset needs to be + split up to be interpolated. + + Different cases are covered in this function and depending + on the sizes, the approach is determined. 
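+        The manually defined limits limit_interp and limit_org are
+        compared against the number of pixels after and before the
+        interpolation, respectively.
+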
+ Approaches: + one_go: dataset before and after interpolation small + enough to be interpolated in one go + as_chunks: dataset before interpolation already so large that + it needs to be split into chunks which then + area interpolated independently + as_cols: dataset after interpolation so large, + that interpolation is done columnwise + + """ + + # If several datasets shall be interpolated after another and the + # current run is the first dataset + if (self.several and self.first) or not self.several: + if len(self.x_org) < 2*self.size and len(self.y_org) < 2*self.size: + self.one_go, self.as_chunks, self.as_cols = True, False, False + else: + # Assessment of the size of the dataset before and after + # interpolation and comparison with manually defined limit + # to decide for interpolation approach + if ((len(self.x) * len(self.y) < self.limit_interp) + and (len(self.x_org) * len(self.y_org) < self.limit_org)): + self.one_go, self.as_chunks, self.as_cols = True, False, False + + elif len(self.x_org) * len(self.y_org) >= self.limit_org: + self.one_go, self.as_chunks, self.as_cols = False, True, False + + elif (len(self.x) * len(self.y) > self.limit_interp): + self.one_go, self.as_chunks, self.as_cols = False, False, True + + if self.several and self.first: + # Store the interpolation approach in the properties file + self.properties['interp_approach'] = {} + self.properties['interp_approach']['one_go'] = self.one_go + self.properties['interp_approach']['as_chunks'] = self.as_chunks + self.properties['interp_approach']['as_cols'] = self.as_cols + + # If several datasets shall be interpolated after another and the + # current run is not the first dataset + elif self.several and not self.first: + + # Load the interpolation approach from the properties file + self.one_go = self.properties['interp_approach']['one_go'] + self.as_chunks = self.properties['interp_approach']['as_chunks'] + self.as_cols = self.properties['interp_approach']['as_cols'] + + def interpolate_dataset(self, data, y, x, x_new, y_new): + + """ + Interpolate dataset. 
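+        Continuous data is interpolated linearly (interp2d), and
+        pixels affected by no-data values are reset to the no-data
+        value afterwards.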
Categorical data is interpolated using + nearest neighbor first into x direction then into y direction + + Input: + data: data to interpolate, depending on the interpolation + appraoch the whole dataset or a chunk + y: original y vector + x: original x vector + x_new: interpolation vector x + y_new: interpolation vector y + + Return: + data_interp: interpolated data + """ + + # Interpolation vectors + x_new = np.array(x_new) + y_new = np.array(y_new) + + # Make sure that no data values do not corrupt the interpolation + data = data.astype(float) + data[data == self.prop_settings['no_value']] = np.nan + + + if self.categorical==False: + data = np.flipud(data) + if self.prop_settings['no_value'] != None: + nan_map = np.zeros_like(data) + nan_map[np.isnan(data)] = 1 + filled_z = data.copy() + filled_z[np.isnan(data)] = 0 + # Interpolation + f = interp2d(x, np.flip(y), filled_z, kind='linear') + data_interp = f(x_new, y_new) + if self.prop_settings['no_value'] != None: + f_nan = interp2d(x, np.flip(y), nan_map, kind='linear') + nan_new = f_nan(x_new, y_new) + + # Set all by nan values affected pixels to no data value + data_interp[nan_new > 0] = self.prop_settings['no_value'] + + return np.flipud(data_interp) + + # If data is categorical + elif self.categorical==True: + # Define empty arrays to be filled + if self.prop_settings['no_value'] != None: + nan_map = np.zeros_like(data) + nan_map[np.isnan(data)] = 1 + filled_z = data.copy() + filled_z[np.isnan(data)] = 0 + + data_interp_x = np.zeros((len(y), len(x_new))) + nan_interp_x = np.zeros((len(y), len(x_new))) + + # Interpolate first in x direction + for i in range(len(y)): + + tmp = filled_z[i, :] + f = interp1d(x, tmp, kind='nearest', fill_value="extrapolate") + data_interp_x[i, :] = f(x_new) + + if self.prop_settings['no_value'] != None: + tmp = nan_map[i, :] + f = interp1d(x, tmp, kind='nearest', fill_value="extrapolate") + nan_interp_x[i, :] = f(x_new) + nan_interp = np.zeros((len(y_new), len(x_new))) + + # Define empty arrays to be filled + data_interp = np.zeros((len(y_new), len(x_new))) + + # Then interpolate in y direction + for i in range(len(x_new)): + + tmp = data_interp_x[:, i] + f = interp1d(y, tmp, kind='nearest', fill_value="extrapolate") + data_interp[:, i] = f(y_new) + if self.prop_settings['no_value'] != None: + tmp = nan_interp_x[:, i] + f = interp1d(y, tmp, kind='nearest', fill_value="extrapolate") + nan_interp[:, i] = f(y_new) + + # Set all by nan values affected pixels to no data value + data_interp[nan_interp > 0] = self.prop_settings['no_value'] + + return data_interp + + def reshape_chunks(self, chunks): + + """ + Interpolated chunks are attached to form the interpolated dataset. + The chunks overlap and for categorical features, only one version + is used. For continuous features, the overlapping parts are averaged. 
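+        Pixels where the already assembled array is still empty, or
+        where either chunk holds the hard-coded no-data value (-999),
+        are taken directly from the current chunk instead of being
+        averaged.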
+ + Input: + chunks: interpolated chunks, list of lists + """ + + if self.as_chunks: + array = np.zeros((len(self.y), len(self.x))) + aa = np.zeros((len(self.y), len(self.x))) + test = np.zeros((len(self.y), len(self.x))) + + shape_x, shape_y = [], [] + for chunk in chunks: + shape_x.append(np.shape(np.array(chunk))[1]) + shape_y.append(np.shape(np.array(chunk))[0]) + + count = 0 + for count, chunk in enumerate(chunks): + xt = int((np.abs(self.x - self.x_final[count][0])).argmin()) + yt = int((np.abs(self.y - self.y_final[count][0])).argmin()) + + tmp = np.array(chunks[count]) + tmp1 = array[yt:yt+shape_y[count], xt:xt+shape_x[count]] + aa[yt:yt+shape_y[count], xt:xt+shape_x[count]] = tmp + + mask = (tmp1 == 0) | (tmp1 == -999) | (tmp == -999) + + if not self.categorical: + # Calculate the element-wise average only where mask is False + average_array = np.zeros_like(tmp, dtype=float) # Initialize array for the result + average_array[~mask] = (tmp[~mask] + tmp1[~mask]) / 2 + + # Assign elements from arr2 where arr1 is equal to zero + average_array[mask] = tmp[mask] + + array[yt:yt+shape_y[count], xt:xt+shape_x[count]] = average_array + + tmp = np.ones_like(tmp, dtype=float)*count + 1 + tmp1 = test[yt:yt+shape_y[count], xt:xt+shape_x[count]] + + mask = (tmp1 == 0) + + # Calculate the element-wise average only where mask is False + average_array = np.zeros_like(tmp, dtype=float) # Initialize array for the result + average_array[~mask] = (tmp[~mask] + tmp1[~mask]) / 2 + + # Assign elements from arr2 where arr1 is equal to zero + average_array[mask] = tmp[mask] + + test[yt:yt+shape_y[count], xt:xt+shape_x[count]] = average_array + + elif self.categorical: + + average_array = np.zeros_like(tmp, dtype=float) # Initialize array for the result + average_array[~mask] = (tmp[~mask] + tmp1[~mask]) / 2 + + # Assign elements from arr2 where arr1 is equal to zero + average_array[mask] = tmp[mask] + + array[yt:yt+shape_y[count], xt:xt+shape_x[count]] = tmp + test[yt:yt+shape_y[count], xt:xt+shape_x[count]] = average_array + self.test = test.copy() + elif self.as_cols: + # Final array to be filled + array = np.zeros((len(self.y), len(self.x))) + + # Insert the columns of the individual + # chunks into the final dataset + for i in range(len(chunks)): + array[:, self.x_limits[i]] = np.array(chunks[i]) + + return array + diff --git a/src/gui_version/utilities/gui.py b/src/gui_version/utilities/gui.py new file mode 100644 index 0000000000000000000000000000000000000000..8caf53993238a9d029238cf6564247fa3c3f7f65 --- /dev/null +++ b/src/gui_version/utilities/gui.py @@ -0,0 +1,1408 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pickle +import tkinter as tk +import os + +from tkinter import filedialog, Label, Checkbutton, StringVar, font +from PIL import Image, ImageTk + +""" + This script constructs SHIRE's user interface' +""" + +class general_settings: + + def __init__(self, master): + super().__init__() + self.master = master + self.settings_import = False + self.row = 0 + self.master.winfo_toplevel().title( + "SHIRE - Susceptibility and Hazard mappIng fRamEwork") + + default_font = font.nametofont("TkDefaultFont") + heading_font = font.Font(family=default_font.cget("family"), size=14, weight="bold") + + impath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + '/images/Logo.png' + pil_image = Image.open(impath) + resized_image = pil_image.resize((450, 120), Image.LANCZOS) + image = ImageTk.PhotoImage(resized_image) + + image_label = tk.Label(self.master, image=image, 
width=450, height=120) + image_label.image = image + image_label.grid(row=self.row, column=0, columnspan=2) + self.row = self.row + 1 + + self.dic_change = {} + + b_chooseFile = tk.Button(self.master, text="Import settings", + command=self.import_file) + b_chooseFile.grid(row=self.row, column=0, columnspan=4, sticky='ew') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="Intention of this run", anchor='w', justify='left', font=heading_font).grid( + row=self.row, column=0, sticky='w') + self.train = tk.IntVar() + self.row = self.row + 1 + Label(self.master, text="Generation of the...", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.train = tk.IntVar() + + b1 = Checkbutton(self.master, text="Training dataset", + variable=self.train) + b1.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + self.pred = tk.IntVar() + b2 = Checkbutton(self.master, text="Prediction dataset", + variable=self.pred) + b2.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + self.map = tk.IntVar() + b3 = Checkbutton(self.master, text="Susceptibility or hazard map", + variable=self.map) + b3.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="General settings", anchor='w', justify='left', font=heading_font).grid(row=self.row, column=0, sticky='w') + self.row = self.row + 1 + Label(self.master, text="Resolution [m]", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.res = tk.IntVar() + i = 'resolution' + self.res.trace_add("write", lambda name, index, mode, var=self.res, + i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.res) + entry.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="No data value", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.no_value = tk.IntVar() + i = 'no_value' + self.no_value.trace_add("write", lambda name, index, mode, + var=self.no_value, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.no_value) + entry.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="CRS", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.crs = tk.StringVar() + i = 'crs' + self.crs.trace_add("write", lambda name, index, mode, var=self.crs, + i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.crs) + entry.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Random seed", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.random_seed = tk.IntVar() + i = 'random_seed' + self.random_seed.trace_add( + "write", lambda name, index, mode, var=self.random_seed, + i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.random_seed) + entry.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + self.save = tk.IntVar() + b4 = Checkbutton(self.master, text="Save the settings for later use", + variable=self.save, anchor='w', justify='left', font=font.Font(family=default_font.cget("family"), weight="bold")) + b4.grid(row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, 
column=0) + self.row = self.row + 1 + + btn = tk.Button(self.master, text="Submit", command=self.on_submit) + btn.grid(row=self.row, column=0, columnspan=4, sticky='ew') + + def import_file(self): + sourceFile = filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + + with open(sourceFile, 'rb') as handle: + self.properties = pickle.load(handle) + + self.settings_import = True + + def callback(self, var, i): + self.dic_change[i] = var.get() + + def on_submit(self): + + if self.train.get() == 0: + self.train = False + else: + self.train = True + + if self.pred.get() == 0: + self.pred = False + else: + self.pred = True + + if self.map.get() == 0: + self.map = False + else: + self.map = True + + if self.settings_import: + self.res = int(self.properties['resolution']) + self.no_value = int(self.properties['no_value']) + self.crs = self.properties['crs'] + self.random_seed = int(self.properties['random_seed']) + else: + self.res = int(self.res.get()) + self.no_value = int(self.no_value.get()) + self.crs = self.crs.get() + self.random_seed = self.random_seed.get() + + if not self.settings_import: + + dic = {} + dic['resolution'] = self.res + dic['random_seed'] = self.random_seed + dic['crs'] = self.crs + dic['no_value'] = self.no_value + dic['train'] = self.train + dic['pred'] = self.pred + dic['map'] = self.map + + elif self.settings_import: + + dic = {} + dic['resolution'] = self.properties['resolution'] + dic['random_seed'] = self.properties['random_seed'] + dic['crs'] = self.properties['crs'] + dic['no_value'] = self.properties['no_value'] + dic['train'] = self.train + dic['pred'] = self.pred + dic['map'] = self.map + + for key in self.dic_change: + dic[key] = self.dic_change[key] + + with open('tmp_settings.pkl', 'wb') as handle: + pickle.dump(dic, handle) + + if self.save.get() == 1: + + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings.pkl' + + with open(sourceFile, 'wb') as handle: + pickle.dump(dic, handle) + + dic = {} + dic['resolution'] = self.res + dic['random_seed'] = self.random_seed + dic['crs'] = self.crs + dic['no_value'] = self.no_value + dic['train'], dic['pred'], dic['map'] = self.train, self.pred, self.map + + self.master.destroy() + + if self.train: + settings_train(self.pred, self.map) + elif self.pred: + settings_pred(self.map) + elif self.map: + settings_map() + +class settings_pred: + + def __init__(self, maps): + super().__init__() + self.master = tk.Tk() + self.map = maps + self.settings_import = False + default_font = font.nametofont("TkDefaultFont") + heading_font = font.Font(family=default_font.cget("family"), size=14, weight="bold") + self.row = 0 + + self.master.winfo_toplevel().title( + "Prediction dataset generation - general settings") + + impath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + '/images/Logo.png' + pil_image = Image.open(impath) + resized_image = pil_image.resize((600, 180), Image.LANCZOS) + image = ImageTk.PhotoImage(resized_image) + + image_label = tk.Label(self.master, image=image, width=600, height=180) + image_label.image = image + image_label.grid(row=self.row, column=0, columnspan=2) + self.row = self.row + 1 + + b_chooseFile = tk.Button( + self.master, text="Import settings", command=self.import_file) + b_chooseFile.grid(row=self.row, column=0, columnspan=4, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="General information", anchor='w', justify='left', 
font=heading_font).grid( + row=self.row, column=0, sticky='w') + self.row = self.row + 1 + + Label(self.master, text="Path to summary on geospatial data", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.b_gs = tk.Button(self.master, text="Choose path to dataset_summary.csv", + command=lambda: self.choose_path('geo_path')) + self.b_gs.grid(row=self.row, column=1, columnspan=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Features to include", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + + self.b_feat = tk.Button(self.master, text="Choose path to keys_to_include.csv", + command=lambda: self.choose_path('feat_path')) + self.b_feat.grid(row=self.row, column=1, columnspan=1, sticky='ew') + self.row = self.row + 1 + + self.dic_path = {} + self.dic_change = {} + + Label(self.master, + text="Path to directory for storing\n the prediction dataset", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.b_train = tk.Button(self.master, + text="Choose directory", + command=lambda: self.choose_dir('pred_path')) + self.b_train.grid(row=self.row, column=1, columnspan=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + self.ohe = tk.IntVar() + b11 = Checkbutton(self.master, text="One-hot encoding of categorical variables", + variable=self.ohe, anchor='w', justify='left')#, command=lambda: button_pressed.set('cluster')) + b11.grid(row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="Area of interest", anchor='w', justify='left', font=heading_font).grid( + row=self.row, column=0, sticky='w') + self.row = self.row + 1 + Label(self.master, text="Bounding box", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.east = tk.StringVar() + i = 'east' + self.east.trace_add( + "write", lambda name, index, mode, + var=self.east, i=i: self.callback(var, i)) + self.entry1 = tk.Entry(self.master, textvariable=self.east, fg='grey') + placeholder1 = 'East [decimal degrees]' + self.entry1.insert(0, placeholder1) # Insert placeholder text initially + self.entry1.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry1, placeholder=placeholder1)) + self.entry1.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry1, placeholder=placeholder1)) + self.entry1.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + self.west = tk.StringVar() + i = 'west' + self.east.trace_add( + "write", lambda name, index, mode, + var=self.west, i=i: self.callback(var, i)) + self.entry2 = tk.Entry(self.master, textvariable=self.west, fg='grey') + placeholder2 = 'West [decimal degrees]' + self.entry2.insert(0, placeholder2) # Insert placeholder text initially + self.entry2.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry2, placeholder=placeholder2)) + self.entry2.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry2, placeholder=placeholder2)) + self.entry2.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + self.north = tk.StringVar() + i = 'north' + self.east.trace_add( + "write", lambda name, index, mode, + var=self.north, i=i: self.callback(var, i)) + self.entry3 = tk.Entry(self.master, textvariable=self.north, fg='grey') + placeholder3 = 'North [decimal degrees]' + self.entry3.insert(0, 
placeholder3) # Insert placeholder text initially + self.entry3.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry3, placeholder=placeholder3)) + self.entry3.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry3, placeholder=placeholder3)) + self.entry3.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + self.south = tk.StringVar() + i = 'south' + self.east.trace_add( + "write", lambda name, index, mode, + var=self.south, i=i: self.callback(var, i)) + self.entry4 = tk.Entry(self.master, textvariable=self.south, fg='grey') + placeholder4 = 'South [decimal degrees]' + self.entry4.insert(0, placeholder4) # Insert placeholder text initially + self.entry4.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry4, placeholder=placeholder4)) + self.entry4.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry4, placeholder=placeholder4)) + self.entry4.grid(row=self.row, column=1, sticky='ew') + self.row = self.row + 1 + + self.save = tk.IntVar() + b4 = Checkbutton(self.master, + text="Save above settings for later use", + variable=self.save, anchor='w', justify='left', font=font.Font(family=default_font.cget("family"), weight="bold")) + b4.grid(row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="What do you want to do?", anchor='w', justify='left', font=heading_font).grid( + row=self.row, column=0, sticky='w') + self.row = self.row + 1 + Label(self.master, text="Choose from:", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + button_pressed = StringVar() + self.from_scratch = tk.IntVar() + b1 = tk.Radiobutton(self.master, + text="Generate prediction dataset\n from scratch", + variable=button_pressed, + command=lambda: button_pressed.set('from_scratch'), anchor='w', justify='left') + b1.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + self.delete = tk.IntVar() + b2 = tk.Radiobutton(self.master, + text="Delete feature(s) from\n existing prediction dataset", + variable=button_pressed, + command=lambda: button_pressed.set('delete'), anchor='w', justify='left') + b2.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + self.add = tk.IntVar() + b3 = tk.Radiobutton(self.master, + text="Add feature(s) from\n existing prediction dataset", + variable=button_pressed, + command=lambda: button_pressed.set('add'), anchor='w', justify='left') + b3.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + all_placeholders = [] + all_placeholders.append(placeholder1) + all_placeholders.append(placeholder2) + all_placeholders.append(placeholder3) + all_placeholders.append(placeholder4) + + b2.wait_variable(button_pressed) + + if button_pressed.get() == 'from_scratch': + self.delete.set(0) + self.add.set(0) + self.from_scratch.set(1) + elif button_pressed.get() == 'add': + self.delete.set(0) + self.add.set(1) + self.from_scratch.set(0) + elif button_pressed.get() == 'delete': + self.delete.set(1) + self.add.set(0) + self.from_scratch.set(0) + + btn = tk.Button(self.master, text="Submit", command=lambda: self.submit(all_placeholders)) + btn.grid(row=self.row, column=0, columnspan=3, sticky='ew') + + def on_entry_click(self, event, entry, placeholder): + """Function that handles the event when the entry field is clicked.""" + if entry.get() == placeholder: + entry.delete(0, tk.END) # Delete all the text in the 
entry + entry.config(fg='black') # Change the text color to black + + def on_focus_out(self, event, entry, placeholder): + """Function that handles the event when the entry field loses focus.""" + if entry.get() == '': + entry.insert(0, placeholder) # Insert the placeholder text + entry.config(fg='grey') # Change the text color to grey + + def import_file(self): + source = filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + + with open(source, 'rb') as handle: + self.properties = pickle.load(handle) + + self.settings_import = True + + def choose_path(self, cont): + sourceFile = filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + self.dic_path[cont] = sourceFile + + if self.settings_import: + self.dic_change[cont] = sourceFile + + def choose_dir(self, cont): + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + self.dic_path[cont] = sourceDir + + if self.settings_import: + self.dic_change[cont] = sourceDir + + def callback(self, var, i): + self.dic_change[i] = var.get() + + def submit(self, placeholders): + + if not self.settings_import: + + if self.east.get() in placeholders: + self.east.set(-999) + if self.west.get() in placeholders: + self.west.set(-999) + if self.north.get() in placeholders: + self.north.set(-999) + if self.south.get() in placeholders: + self.south.set(-999) + + dic = {} + dic['pred_path'] = self.dic_path['pred_path'] + '/prediction.nc' + dic['geo_path'] = self.dic_path['geo_path'] + dic['feat_path'] = self.dic_path['feat_path'] + dic['east'] = float(self.east.get()) + dic['west'] = float(self.west.get()) + dic['north'] = float(self.north.get()) + dic['south'] = float(self.south.get()) + dic['from_scratch'] = self.from_scratch.get() + dic['add'] = self.add.get() + dic['delete'] = self.delete.get() + dic['ohe'] = self.ohe.get() + + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings_pred.pkl' + + elif self.settings_import: + + dic = {} + + dic['pred_path'] = self.properties['pred_path'] + dic['geo_path'] = self.properties['geo_path'] + dic['feat_path'] = self.properties['feat_path'] + dic['east'] = self.properties['east'] + dic['west'] = self.properties['west'] + dic['north'] = self.properties['north'] + dic['south'] = self.properties['south'] + dic['ohe'] = self.properties['ohe'] + dic['from_scratch'] = self.from_scratch.get() + dic['add'] = self.add.get() + dic['delete'] = self.delete.get() + + for key in self.dic_change: + if key == 'pred_path': + dic[key] = self.dic_change[key] + '/prediction.nc' + else: + if self.dic_change[key] not in placeholders: + dic[key] = self.dic_change[key] + + with open('tmp_pred.pkl', 'wb') as handle: + pickle.dump(dic, handle) + + if self.save.get() == 1: + + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings_pred.pkl' + + with open(sourceFile, 'wb') as handle: + pickle.dump(dic, handle) + + self.master.destroy() + if self.map: + settings_map() + + +class settings_map: + + def __init__(self): + + def toggle_checkbox(): + if self.pred.get() == 1: + b5.config(state="normal") + else: + b5.config(state="disabled") + b5.deselect() + + self.master = tk.Tk() + self.settings_import = False + self.row = 0 + + self.dic_path = {} + self.dic_change = {} + + self.master.winfo_toplevel().title("Map generation - general settings") + + default_font = 
font.nametofont("TkDefaultFont") + heading_font = font.Font(family=default_font.cget("family"), size=14, weight="bold") + + impath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + '/images/Logo.png' + pil_image = Image.open(impath) + resized_image = pil_image.resize((750, 200), Image.LANCZOS) + image = ImageTk.PhotoImage(resized_image) + + #image = tk.PhotoImage(file=impath) + image_label = tk.Label(self.master, image=image, width=750, height=200) + image_label.image = image + image_label.grid(row=self.row, column=0, columnspan=4) + self.row = self.row + 1 + + b_chooseFile = tk.Button( + self.master, text="Import settings", command=self.import_file) + b_chooseFile.grid(row=self.row, column=0, columnspan=4, sticky='ew') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text='General settings', font=font.Font(family=default_font.cget("family"), weight='bold')).grid(row=self.row, column=0, sticky='w') + self.row = self.row + 1 + + Label(self.master, text="Path to training and prediction dataset", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.b_train = tk.Button( + self.master, + text="Choose path to training dataset", + command=lambda: self.choose_path('train_path')) + self.b_train.grid(row=self.row, column=1, columnspan=1, sticky='ew') + + self.b_pred = tk.Button(self.master, text="Choose path to prediction dataset", + command=lambda: self.choose_path('pred_path')) + self.b_pred.grid(row=self.row, column=2, columnspan=2, sticky='ew') + self.row = self.row + 1 + + Label(self.master, + text="Where do you want the\n models to be stored?", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.b_model = tk.Button(self.master, text="Choose directory", + command=lambda: self.choose_dir('model_path')) + self.b_model.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Foldername", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + + self.model_to_load = tk.StringVar() + i = 'model_to_load' + self.model_to_load.trace_add( + "write", lambda name, index, mode, + var=self.model_to_load, i=i: self.callback(var, i)) + self.entry3 = tk.Entry(self.master, textvariable=self.model_to_load, fg='grey') + placeholder3 = '...for loading' + self.entry3.insert(0, placeholder3) # Insert placeholder text initially + self.entry3.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry3, placeholder=placeholder3)) + self.entry3.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry3, placeholder=placeholder3)) + self.entry3.grid(row=self.row, column=2, columnspan=2, sticky='ew') + + self.model_to_save = tk.StringVar() + i = 'model_to_save' + self.model_to_save.trace_add( + "write", lambda name, index, mode, + var=self.model_to_save, i=i: self.callback(var, i)) + self.entry4 = tk.Entry(self.master, textvariable=self.model_to_save, fg='grey') + placeholder4 = '...for saving' + self.entry4.insert(0, placeholder4) # Insert placeholder text initially + self.entry4.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry4, placeholder=placeholder4)) + self.entry4.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry4, placeholder=placeholder4)) + self.entry4.grid(row=self.row, column=1, sticky='ew') + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, + text='Set-up 
of the training and prediction dataset', + font=font.Font(family=default_font.cget("family"), + weight='bold')).grid(row=self.row, + column=0, + columnspan=2, + sticky='w') + self.row = self.row + 1 + + Label(self.master, text="Features not to consider", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.drop_train = tk.StringVar() + i = 'drop_train' + self.drop_train.trace_add( + "write", lambda name, index, mode, + var=self.drop_train, i=i: self.callback(var, i)) + self.entry1 = tk.Entry(self.master, textvariable=self.drop_train, fg='grey') + placeholder1 = 'Drop from training dataset' + self.entry1.insert(0, placeholder1) # Insert placeholder text initially + self.entry1.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry1, placeholder=placeholder1)) + self.entry1.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry1, placeholder=placeholder1)) + self.entry1.grid(row=self.row, column=1, sticky='ew') + + self.drop_pred = tk.StringVar() + i = 'drop_pred' + self.drop_pred.trace_add( + "write", lambda name, index, mode, + var=self.drop_pred, i=i: self.callback(var, i)) + self.entry2 = tk.Entry(self.master, textvariable=self.drop_pred, fg='grey') + placeholder2 = 'Drop from prediction dataset' + self.entry2.insert(0, placeholder2) # Insert placeholder text initially + self.entry2.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry2, placeholder=placeholder2)) + self.entry2.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry2, placeholder=placeholder2)) + self.entry2.grid(row=self.row, column=2, columnspan=2, sticky='ew') + self.row = self.row + 1 + + Label(self.master, + text="Name of label column in training dataset", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.name_label = tk.StringVar() + i = 'name_label' + self.name_label.trace_add( + "write", lambda name, index, mode, + var=self.name_label, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.name_label) + entry.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + global all_buttons + all_buttons = [] + + Label(self.master, text="How to treat mismatching categories?", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.button_pressed = tk.StringVar() + self.keep = tk.IntVar() + self.keep.set(0) + i = 'keep' + self.keep.trace_add("write", lambda name, index, mode, + var=self.keep, i=i: self.callback(var, i)) + self.b1 = tk.Radiobutton(self.master, + text="Keep instances and\n matching classes", + variable=self.keep, + value=1, + command=lambda: self.combined_command(self.b1, 'keep'), + anchor='w', justify='left') + self.b1.grid(row=self.row, column=1, columnspan=1, sticky='w') + + + self.remove_instances = tk.IntVar() + self.remove_instances.set(0) + i = 'remove_instances' + self.remove_instances.trace_add("write", lambda name, index, mode, + var=self.remove_instances, i=i: self.callback(var, i)) + self.b2 = tk.Radiobutton(self.master, + text="Remove instances of\n mismatching classes", + variable=self.remove_instances, + value=1, + command=lambda: self.combined_command(self.b2, 'remove'), anchor='w', justify='left') + self.b2.grid(row=self.row, column=2, columnspan=1, sticky='w') + + all_buttons.append(self.b1) + all_buttons.append(self.b2) + + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text='Random Forest training and 
prediction', font=font.Font(family=default_font.cget("family"), weight='bold')).grid(row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master, text="Number of trees:").grid(row=self.row, column=0, sticky='w') + self.num_trees = tk.IntVar() + i = 'num_trees' + self.num_trees.trace_add( + "write", lambda name, index, mode, + var=self.num_trees, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.num_trees) + entry.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Depth of the trees:", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.depth_trees = tk.IntVar() + i = 'depth_trees' + self.depth_trees.trace_add( + "write", lambda name, index, mode, + var=self.depth_trees, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.depth_trees) + entry.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Evaluation criterion:", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.criterion = tk.StringVar() + i = 'criterion' + self.criterion.trace_add( + "write", lambda name, index, mode, + var=self.criterion, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.criterion) + entry.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Size of validation dataset (0...1):", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.size_val = tk.DoubleVar() + i = 'size_val' + self.size_val.trace_add( + "write", lambda name, index, mode, + var=self.size_val, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.size_val) + entry.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + self.save = tk.IntVar() + b4 = Checkbutton(self.master, + text="Save above settings for later use", + variable=self.save, anchor='w', + font=font.Font(family=default_font.cget("family"), weight="bold")) + b4.grid(row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="What do you want to do?", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.train = tk.IntVar() + b1 = Checkbutton(self.master, + text="Model training", + variable=self.train, anchor='w') + b1.grid(row=self.row, column=1, sticky='w') + + self.pred = tk.IntVar() + b2 = Checkbutton(self.master, + text="Mapping", + variable=self.pred, + command=toggle_checkbox, anchor='w') + b2.grid(row=self.row, column=2, sticky='w') + + self.yes = tk.IntVar() + b5 = Checkbutton(self.master, + text="Predict in parallel", + variable=self.yes, + state="disabled", anchor='w') + b5.grid(row=self.row, column=3, sticky='w') + self.row = self.row + 1 + + all_placeholders = [] + all_placeholders.append(placeholder1) + all_placeholders.append(placeholder2) + all_placeholders.append(placeholder3) + all_placeholders.append(placeholder4) + + btn = tk.Button(self.master, text="Submit", command=lambda: self.submit(all_placeholders)) + btn.grid(row=self.row, column=0, columnspan=4, sticky='ew') + + def combined_command(self, selected_button, option): + + self.disable_other_buttons(selected_button) + self.button_pressed = option + + def 
disable_other_buttons(self, selected_button): + for button in all_buttons: + if button != selected_button: + button.config(state=tk.DISABLED) + + def on_entry_click(self, event, entry, placeholder): + """Function that handles the event when the entry field is clicked.""" + if entry.get() == placeholder: + entry.delete(0, tk.END) # Delete all the text in the entry + entry.config(fg='black') # Change the text color to black + + def on_focus_out(self, event, entry, placeholder): + """Function that handles the event when the entry field loses focus.""" + if entry.get() == '': + entry.insert(0, placeholder) # Insert the placeholder text + entry.config(fg='grey') # Change the text color to grey + + def callback(self, var, i): + self.dic_change[i] = var.get() + + def choose_path(self, cont): + sourceFile = filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + self.dic_path[cont] = sourceFile + + if self.settings_import: + self.dic_change[cont] = sourceFile + + def choose_dir(self, cont): + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + self.dic_path[cont] = sourceDir + + if self.settings_import: + self.dic_change[cont] = sourceDir + + def import_file(self): + self.settings_import = True + print(self.settings_import) + source = filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + + with open(source, 'rb') as handle: + self.properties = pickle.load(handle) + + def submit(self, placeholders): + + if not self.settings_import: + + if self.drop_train in placeholders: + self.drop_train.set('') + if self.pred_train in placeholders: + self.drop_pred.set('') + if self.model_to_load in placeholders: + self.model_to_load.set('') + if self.model_to_save in placeholders: + self.model_to_save.set('') + + dic = {} + dic['train_path'] = self.dic_path['train_path'] + dic['pred_path'] = self.dic_path['pred_path'] + dic['model_path'] = self.dic_path['model_path'] + dic['drop_train'] = self.drop_train.get() + dic['drop_pred'] = self.drop_pred.get() + dic['model_to_load'] = self.model_to_load.get() + dic['model_to_save'] = self.model_to_save.get() + dic['num_trees'] = self.num_trees.get() + dic['size_val'] = self.size_val.get() + dic['depth_trees'] = self.depth_trees.get() + dic['name_label'] = self.name_label.get() + dic['criterion'] = self.criterion.get() + dic['keep'] = self.keep.get() + dic['remove_instances'] = self.remove_instances.get() + + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings_map.pkl' + + elif self.settings_import: + + + dic = {} + dic['train_path'] = self.properties['train_path'] + dic['pred_path'] = self.properties['pred_path'] + dic['model_path'] = self.properties['model_path'] + dic['drop_train'] = self.properties['drop_train'] + dic['drop_pred'] = self.properties['drop_pred'] + dic['model_to_load'] = self.properties['model_to_load'] + dic['model_to_save'] = self.properties['model_to_save'] + dic['num_trees'] = self.properties['num_trees'] + dic['size_val'] = self.properties['size_val'] + dic['depth_trees'] = self.properties['depth_trees'] + dic['name_label'] = self.properties['name_label'] + dic['criterion'] = self.properties['criterion'] + dic['keep'] = self.properties['keep'] + dic['remove_instances'] = self.properties['remove_instances'] + + for key in self.dic_change: + if self.dic_change[key] not in placeholders: + dic[key] = self.dic_change[key] + + if self.save.get() == 1: + 
sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings_map.pkl' + + with open(sourceFile, 'wb') as handle: + pickle.dump(dic, handle) + + if self.yes == 0: + dic['training'] = self.train.get() + dic['prediction'] = self.pred.get() + dic['parallel'] = self.yes + else: + dic['training'] = self.train.get() + dic['prediction'] = self.pred.get() + dic['parallel'] = self.yes.get() + with open('tmp_map.pkl', 'wb') as handle: + pickle.dump(dic, handle) + + self.master.destroy() + + +class settings_train: + + def __init__(self, pred, maps): + super().__init__() + self.pred = pred + self.map = maps + self.master = tk.Tk() + self.row = 0 + + self.master.winfo_toplevel().title( + "Training dataset generation - general settings") + + default_font = font.nametofont("TkDefaultFont") + heading_font = font.Font(family=default_font.cget("family"), size=14, weight="bold") + + global all_buttons + all_buttons = [] + + impath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) + '/images/Logo.png' + pil_image = Image.open(impath) + resized_image = pil_image.resize((600, 150), Image.LANCZOS) + image = ImageTk.PhotoImage(resized_image) + + #image = tk.PhotoImage(file=impath) + image_label = tk.Label(self.master, image=image, width=600, height=150) + image_label.image = image + image_label.grid(row=self.row, column=0, columnspan=4) + self.row = self.row + 1 + + self.settings_import = False + b_chooseFile = tk.Button(self.master, + text="Import settings", + command=self.import_file) + b_chooseFile.grid(row=self.row, column=0, columnspan=4, sticky='ew') + + self.dic_path = {} + self.dic_change = {} + + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="General settings", anchor='w', justify='left', font=heading_font).grid( + row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master, text="Provide path to:", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.b_ls = tk.Button(self.master, + text="Landslide data", + command=lambda: self.choose_path('ls_path')) + self.b_ls.grid(row=self.row, column=1, columnspan=1, sticky='ew') + + self.b_gs = tk.Button(self.master, + text="dataset_summary.csv", + command=lambda: self.choose_path('geo_path')) + self.b_gs.grid(row=self.row, column=2, columnspan=1, sticky='ew') + + self.b_nonls = tk.Button( + self.master, + text="Absence locations", + command=lambda: self.choose_path('nonls_path')) + self.b_nonls.grid(row=self.row, column=3, columnspan=1, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Features to include", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.b_feat = tk.Button(self.master, + text="Path to keys_to_include.csv", + command=lambda: self.choose_path('feat_path')) + self.b_feat.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, + text="Path to directory for storing\n the training dataset", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + self.b_train = tk.Button(self.master, + text="Choose directory", + command=lambda: self.choose_dir('train_path')) + self.b_train.grid(row=self.row, column=1, columnspan=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Number of absence locations\n in the training dataset:", anchor='w', justify='left').grid( + row=self.row, column=0, 
sticky='w') + self.num_nonls = tk.IntVar() + i = 'num_nonls' + self.num_nonls.trace_add( + "write", lambda name, index, mode, + var=self.num_nonls, i=i: self.callback(var, i)) + entry = tk.Entry(self.master, textvariable=self.num_nonls) + entry.grid(row=self.row, column=1, columnspan=1, sticky='ew') + + self.ohe = tk.IntVar() + b11 = Checkbutton(self.master, text="One-hot encoding of categorical variables", + variable=self.ohe)#, command=lambda: button_pressed.set('cluster')) + b11.grid(row=self.row, column=2, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="Information on the landslide and absence locations inventory", anchor='w', justify='left', font=heading_font).grid( + row=self.row, column=0, columnspan=3, sticky='w') + self.row = self.row + 1 + Label(self.master, text="Column name in landslide\n inventory containing...", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.x = tk.StringVar() + i = 'x' + self.x.trace_add( + "write", lambda name, index, mode, + var=self.x, i=i: self.callback(var, i)) + self.entry3 = tk.Entry(self.master, textvariable=self.x, fg='grey') + placeholder3 = 'Longitude values' + self.entry3.insert(0, placeholder3) # Insert placeholder text initially + self.entry3.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry3, placeholder=placeholder3)) + self.entry3.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry3, placeholder=placeholder3)) + self.entry3.grid(row=self.row, column=1, sticky='ew') + + self.y = tk.StringVar() + i = 'y' + self.y.trace_add( + "write", lambda name, index, mode, + var=self.y, i=i: self.callback(var, i)) + self.entry4 = tk.Entry(self.master, textvariable=self.y, fg='grey') + placeholder4 = 'Latitude values' + self.entry4.insert(0, placeholder4) # Insert placeholder text initially + self.entry4.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry4, placeholder=placeholder4)) + self.entry4.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry4, placeholder=placeholder4)) + self.entry4.grid(row=self.row, column=2, sticky='ew') + + #Label(self.master, text="Column name of the landslide ID", anchor='w', justify='left').grid(row=self.row, column=0, columnspan=2, sticky='w') + self.id = tk.StringVar() + i = 'id' + self.id.trace_add( + "write", lambda name, index, mode, + var=self.id, i=i: self.callback(var, i)) + self.entry5 = tk.Entry(self.master, textvariable=self.id, fg='grey') + placeholder5 = 'Landslide ID' + self.entry5.insert(0, placeholder5) # Insert placeholder text initially + self.entry5.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry5, placeholder=placeholder5)) + self.entry5.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry5, placeholder=placeholder5)) + self.entry5.grid(row=self.row, column=3, sticky='ew') + self.row = self.row + 1 + + Label(self.master, text="Variable in absence\n locations inventory containing...", anchor='w', justify='left').grid(row=self.row, column=0, sticky='w') + self.x_nonls = tk.StringVar() + i = 'x_nonls' + self.x_nonls.trace_add( + "write", lambda name, index, mode, + var=self.x_nonls, i=i: self.callback(var, i)) + self.entry1 = tk.Entry(self.master, textvariable=self.x_nonls, fg='grey') + placeholder1 = 'Longitude values' + self.entry1.insert(0, placeholder1) # Insert placeholder text initially + self.entry1.bind('<FocusIn>', lambda 
event:self.on_entry_click(event, entry=self.entry1, placeholder=placeholder1)) + self.entry1.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry1, placeholder=placeholder1)) + self.entry1.grid(row=self.row, column=1, sticky='ew') + + self.y_nonls = tk.StringVar() + i = 'y_nonls' + self.y_nonls.trace_add( + "write", lambda name, index, mode, + var=self.y_nonls, i=i: self.callback(var, i)) + self.entry2 = tk.Entry(self.master, textvariable=self.y_nonls, fg='grey') + placeholder2 = 'Latitude values' + self.entry2.insert(0, placeholder2) # Insert placeholder text initially + self.entry2.bind('<FocusIn>', lambda event:self.on_entry_click(event, entry=self.entry2, placeholder=placeholder2)) + self.entry2.bind('<FocusOut>', lambda event:self.on_focus_out(event, entry=self.entry2, placeholder=placeholder2)) + self.entry2.grid(row=self.row, column=2, sticky='ew') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + self.save = tk.IntVar() + b4 = Checkbutton(self.master, + text="Save above settings for later use", + variable=self.save, font=font.Font(family=default_font.cget("family"), weight="bold")) + b4.grid(row=self.row, column=0, columnspan=2, sticky='w') + self.row = self.row + 1 + + Label(self.master).grid(row=self.row, column=0) + self.row = self.row + 1 + + Label(self.master, text="Choose from:", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + button_pressed = StringVar() + self.from_scratch = tk.IntVar() + self.from_scratch.set(0) + b1 = tk.Radiobutton(self.master, + text="Generate training dataset from scratch", + variable=button_pressed, + value='from_scratch', + command=lambda: self.disable_other_buttons(b1)) + b1.grid(row=self.row, column=1, columnspan=2, sticky='w') + self.row = self.row + 1 + + self.delete = tk.IntVar() + self.delete.set(0) + b2 = tk.Radiobutton(self.master, + text="Delete feature(s) from existing training dataset", + variable=button_pressed, + value='delete', + command=lambda: self.disable_other_buttons(b2)) + b2.grid(row=self.row, column=1, columnspan=2, sticky='w') + self.row = self.row + 1 + + self.add = tk.IntVar() + self.add.set(0) + b3 = tk.Radiobutton(self.master, + text="Add feature(s) from existing training dataset", + variable=button_pressed, + value='add', + command=lambda: self.disable_other_buttons(b3)) + b3.grid(row=self.row, column=1, columnspan=2, sticky='w') + self.row = self.row + 1 + + all_buttons.append(b1) + all_buttons.append(b2) + all_buttons.append(b3) + + all_placeholders = [] + all_placeholders.append(placeholder1) + all_placeholders.append(placeholder2) + all_placeholders.append(placeholder3) + all_placeholders.append(placeholder4) + all_placeholders.append(placeholder5) + + b2.wait_variable(button_pressed) + + if button_pressed.get() == 'from_scratch': + + Label(self.master, + text="Compilation using:", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.cluster = tk.IntVar() + self.cluster.set(0) + b11 = tk.Radiobutton(self.master, text="Clustering", + variable=self.cluster, + value=1, + command= lambda: self.disable_other_buttons(b11)) + b11.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + self.interpolation = tk.IntVar() + self.interpolation.set(0) + b22 = tk.Radiobutton( + self.master, + text="Interpolation", + variable=self.interpolation, + value=1, + command= lambda: self.disable_other_buttons(b22)) + b22.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + 
self.no_interpolation = tk.IntVar() + self.no_interpolation.set(0) + b33 = tk.Radiobutton( + self.master, + text="No interpolation", + variable=self.no_interpolation, + value=1, + command= lambda: self.disable_other_buttons(b33)) + b33.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + all_buttons = [] + all_buttons.append(b11) + all_buttons.append(b22) + all_buttons.append(b33) + + self.add.set(0) + self.delete.set(0) + self.from_scratch.set(1) + if self.cluster.get() == 1: + self.interpolation.set(0) + self.no_interpolation.set(0) + elif self.interpolation.get() == 1: + self.cluster.set(0) + self.no_interpolation.set(0) + if self.no_interpolation.get() == 1: + self.cluster.set(0) + self.interpolation.set(0) + + btn = tk.Button(self.master, text="Submit", command=lambda: self.submit(all_placeholders)) + btn.grid(row=self.row, column=0, columnspan=4, sticky='ew') + + elif button_pressed.get() == 'add': + + Label(self.master, + text="Compilation using:", anchor='w', justify='left').grid( + row=self.row, column=0, sticky='w') + + self.cluster = tk.IntVar() + self.cluster.set(0) + + b111 = tk.Radiobutton(self.master, text="Clustering", + variable=self.cluster, + value=1, + command= lambda: self.disable_other_buttons(b111)) + b111.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + self.interpolation = tk.IntVar() + self.interpolation.set(0) + b222 = tk.Radiobutton( + self.master, + text="Interpolation", + variable=self.interpolation, + value=1, + command= lambda: self.disable_other_buttons(b222)) + b222.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + self.no_interpolation = tk.IntVar() + self.no_interpolation.set(0) + b333 = tk.Radiobutton( + self.master, + text="No interpolation", + variable=self.no_interpolation, + value=1, + command= lambda: self.disable_other_buttons(b333)) + b333.grid(row=self.row, column=1, sticky='w') + self.row = self.row + 1 + + all_buttons = [] + all_buttons.append(b111) + all_buttons.append(b222) + all_buttons.append(b333) + + self.add.set(1) + self.delete.set(0) + self.from_scratch.set(0) + + btn = tk.Button(self.master, text="Submit", command=lambda: self.submit(all_placeholders)) + btn.grid(row=self.row, column=0, columnspan=4, sticky='ew') + + elif button_pressed.get() == 'delete': + self.add.set(0) + self.delete.set(1) + self.from_scratch.set(0) + self.cluster = tk.IntVar() + self.cluster.set(0) + self.no_interpolation = tk.IntVar() + self.no_interpolation.set(0) + self.interpolation = tk.IntVar() + self.interpolation.set(0) + + btn = tk.Button(self.master, text="Submit", command=lambda: self.submit(all_placeholders)) + btn.grid(row=self.row, column=0, columnspan=4, sticky='ew') + + def disable_other_buttons(self, selected_button): + for button in all_buttons: + if button != selected_button: + button.config(state=tk.DISABLED) + + def on_entry_click(self, event, entry, placeholder): + """Function that handles the event when the entry field is clicked.""" + if entry.get() == placeholder: + entry.delete(0, tk.END) # Delete all the text in the entry + entry.config(fg='black') # Change the text color to black + + def on_focus_out(self, event, entry, placeholder): + """Function that handles the event when the entry field loses focus.""" + if entry.get() == '': + entry.insert(0, placeholder) # Insert the placeholder text + entry.config(fg='grey') # Change the text color to grey + + def callback(self, var, i): + self.dic_change[i] = var.get() + + def choose_path(self, cont): + sourceFile = 
filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + self.dic_path[cont] = sourceFile + + if self.settings_import: + self.dic_change[cont] = sourceFile + + def choose_dir(self, cont): + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + self.dic_path[cont] = sourceDir + + if self.settings_import: + if cont == 'train_path': + self.dic_change[cont] = sourceDir + '/training.csv' + else: + self.dic_change[cont] = sourceDir + + def import_file(self): + self.settings_import = True + print(self.settings_import) + source = filedialog.askopenfilename( + parent=self.master, initialdir="/", title='Choose path') + + with open(source, 'rb') as handle: + self.properties = pickle.load(handle) + + def submit(self, placeholders): + + if not self.settings_import: + + if self.x.get() in placeholders: + self.x.set('') + if self.y.get() in placeholders: + self.y.set('') + if self.x_nonls.get() in placeholders: + self.x_nonls.set('') + if self.y_nonls.get() in placeholders: + self.y_nonls.set('') + if self.id.get() in placeholders: + self.id.set('') + + dic = {} + dic['ls_path'] = self.dic_path['ls_path'] + dic['nonls_path'] = self.dic_path['nonls_path'] + dic['train_path'] = self.dic_path['train_path'] + '/training.csv' + dic['geo_path'] = self.dic_path['geo_path'] + dic['feat_path'] = self.dic_path['feat_path'] + dic['x'] = self.x.get() + dic['y'] = self.y.get() + dic['id'] = self.id.get() + dic['x_nonls'] = self.x_nonls.get() + dic['y_nonls'] = self.y_nonls.get() + dic['num_nonls'] = int(self.num_nonls.get()) + dic['from_scratch'] = self.from_scratch.get() + dic['delete'] = self.delete.get() + dic['add'] = self.add.get() + dic['cluster'] = self.cluster.get() + dic['interpolation'] = self.interpolation.get() + dic['no_interpolation'] = self.no_interpolation.get() + dic['ohe'] = self.ohe.get() + + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings_train.pkl' + + elif self.settings_import: + + dic = {} + dic['ls_path'] = self.properties['ls_path'] + dic['nonls_path'] = self.properties['nonls_path'] + dic['train_path'] = self.properties['train_path'] + dic['geo_path'] = self.properties['geo_path'] + dic['feat_path'] = self.properties['feat_path'] + dic['x'] = self.properties['x'] + dic['y'] = self.properties['y'] + dic['id'] = self.properties['id'] + dic['x_nonls'] = self.properties['x_nonls'] + dic['y_nonls'] = self.properties['y_nonls'] + dic['num_nonls'] = self.properties['num_nonls'] + dic['from_scratch'] = self.from_scratch.get() + dic['delete'] = self.delete.get() + dic['add'] = self.add.get() + dic['cluster'] = self.cluster.get() + dic['no_interpolation'] = self.no_interpolation.get() + dic['interpolation'] = self.interpolation.get() + dic['ohe'] = self.properties['ohe'] + + for key in self.dic_change: + if self.dic_change[key] not in placeholders: + dic[key] = self.dic_change[key] + + with open('tmp_train.pkl', 'wb') as handle: + pickle.dump(dic, handle) + + if self.save.get() == 1: + sourceDir = filedialog.askdirectory( + parent=self.master, initialdir="/", title='Choose path') + sourceFile = sourceDir + '/settings_train.pkl' + + with open(sourceFile, 'wb') as handle: + pickle.dump(dic, handle) + + self.master.destroy() + if self.pred: + settings_pred(self.map) + elif not self.pred and self.map: + settings_map() diff --git a/src/gui_version/utilities/handle_categorical_values.py b/src/gui_version/utilities/handle_categorical_values.py new 
file mode 100644 index 0000000000000000000000000000000000000000..100a276b59e467f99d28a84f11abfed7d81b4f3b --- /dev/null +++ b/src/gui_version/utilities/handle_categorical_values.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pandas as pd +import numpy as np + +from sklearn.preprocessing import OneHotEncoder + +def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): + + """ + Categorical features in the training dataset are either one hot + encoded or ordinal encoded + + Input: + df: DataFrame containing continuous and categorical features, Pandas DataFrame + datasets_summary: Information on the datasets from which the values in df have been extracted, Pandas DataFrame + ohe: True for One-hot encoding, False for ordinal encoding, Boolean + basic: columns in df not to be considered such as coordinates, ID and label, list + var: specific features to consider only, list + """ + + if var == None: + cat = [] + for feat in df.columns.tolist(): + if feat not in basic: + index = datasets_summary['keys'].tolist().index(feat) + if bool(datasets_summary['categorical'].tolist()[index]) == True: + cat.append(feat) + else: + cat = [] + for feat in var: + index = datasets_summary['keys'].tolist().index(feat) + if bool(datasets_summary['categorical'].tolist()[index]) == True: + cat.append(feat) + + if len(cat) > 0: + if ohe: + encoder = OneHotEncoder(sparse=False) + encoded_data = encoder.fit_transform(df[cat]) + unique_categories = {col: df[col].unique() for col in cat} + print(unique_categories) + custom_column_names = [] + for col in cat: + for unique_value in unique_categories[col]: + if isinstance(unique_value, (float, np.float32)): + unique_value = int(unique_value) + custom_column_names.append(f'{col}_{str(unique_value)}_encode') + encoded_df = pd.DataFrame(encoded_data, columns=custom_column_names) + df = pd.concat([df.drop(columns=cat), encoded_df], axis=1) + else: + columns_to_encode = df.select_dtypes(include=['object', 'category']).columns.tolist() + encoder = OrdinalEncoder() + encoded_data = encoder.fit_transform(df[columns_to_encode]) + encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encoded" for col in columns_to_encode]) + df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) + + return df + + \ No newline at end of file diff --git a/src/gui_version/utilities/import_format.py b/src/gui_version/utilities/import_format.py new file mode 100644 index 0000000000000000000000000000000000000000..a94456582517c761c96a718cc11118e1ac23e1a6 --- /dev/null +++ b/src/gui_version/utilities/import_format.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +import rasterio +import numpy as np +import netCDF4 as nc +import pandas as pd + +def import_tif(path): + + """ + Import a geotiff file + + Input: + path: Path to the tif file to open, string + missing_value = no data value of the + """ + + raster = rasterio.open(path, 'r') + data = raster.read()[0, :, :] + + if np.dtype(data[0, 0]) == 'uint8': + data = np.int32(data) + + bounds = raster.bounds + x = np.linspace(bounds[0], bounds[2], np.shape(data)[1]) + y = np.linspace(bounds[1], bounds[3], np.shape(data)[0]) + crs = raster.crs + + if y[0] < y[-1]: + y = np.flip(y) + + return data, x, y, crs + + +def import_nc(path): + + """ + Import a netCDF4 file and contained metadata + + Input: + path: Path to the netCDF4 file to open, string + """ + + ds = nc.Dataset(path) + x = ds['Longitude'][:] + y = ds['Latitude'][:] + + if 'Result' in ds.variables.keys(): + data 
= ds['Result'][:][:] + data = np.float64(data) + data = data.data + else: + data = None + + if 'Time' in ds.variables.keys(): + data = ds['Result'][:][:] + data = data.data + + if hasattr(ds.variables['Longitude'], 'units'): + crs = ds['Longitude'].units + else: + crs = None + + x = x.data + y = y.data + + if y[0] < y[-1]: + y = np.flip(y) + + return data, x, y, crs + + +def import_cvs(path): + + """ + Import a csv file + + Input: + path: Path to the csv file to open, string + """ + + df = pd.read_csv(path) + + return df diff --git a/src/gui_version/utilities/import_raw_dataset.py b/src/gui_version/utilities/import_raw_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..67fcf1b36422efab7aab7194318042c9f16f57f2 --- /dev/null +++ b/src/gui_version/utilities/import_raw_dataset.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np + +from utilities.import_format import import_tif, import_nc + +def import_raw_dataset(path, no_data, no_value): + + """ + Import geotiff or netCDF4 file + Input: + path: path to the dataset, string + no_data: no data values, list + no_value: general no data value, int or float + + Output: + data: dataset, numpy array + x_org: longitude coordinates, list + y_org: latitude coordinates, list + + """ + + warning = False + if path.split('.')[-1] == 'tif': + data, x_org, y_org, _ = import_tif(path) + elif path.split('.')[-1] == 'nc': + data, x_org, y_org, _ = import_nc(path) + else: + warning = True + + if y_org[0] < y_org[-1]: + y_org = np.flip(y_org) + + if no_data != 'None': + for val in no_data: + data[data == val] = no_value + data[np.isnan(data)] = no_value + + if warning: + return None, None, None + else: + return data, x_org, y_org + + \ No newline at end of file diff --git a/src/gui_version/utilities/initialise_log.py b/src/gui_version/utilities/initialise_log.py new file mode 100644 index 0000000000000000000000000000000000000000..91f4b59f54c7a4cd230132643c7dcaf4b9629855 --- /dev/null +++ b/src/gui_version/utilities/initialise_log.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import logging + +def save_log(path): + + """ + Initialisation of a log file using the python package logging to store + information, warnings and errors + + Input: + path: Path where to store the log file + Output: + logger: Logger + + """ + + path_log = os.path.dirname(path) + logger = logging.getLogger() + logger.setLevel(logging.INFO) + formatter = logging.Formatter( + '%(asctime)s | %(levelname)s | %(message)s') + + file_handler = logging.FileHandler(path) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(formatter) + + logger.addHandler(file_handler) + + return logger \ No newline at end of file diff --git a/src/gui_version/utilities/ncfile_generation.py b/src/gui_version/utilities/ncfile_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..4d4c038936de632cf892db8a0cdaeee339733697 --- /dev/null +++ b/src/gui_version/utilities/ncfile_generation.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import netCDF4 as nc +import settings + +def generate_basic_ncfile(outfile, crs=None): + + """ + Initialise basic netCDF4 file + + Input: + Outfile: path to store the netcdf file, string + crs: coordinate reference system, string + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + + return ds + + +def 
generate_ncfile(outfile, x, y, data, crs=None, + data_unit=None, missing_value=settings.no_value): + + """ + Save 2D dataset as netCDF4 file + + Input: + Outfile: path to store the netcdf file, string + x: longitude vector, list + y: latitude vector, list + data: 2D data array + crs: coordinate reference system, string + data_unit: data unit, string + missing_value: no data value, integer or float + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + ds.createDimension('lon', len(x)) + ds.createDimension('lat', len(y)) + longitude = ds.createVariable('Longitude', 'f4', 'lon') + latitude = ds.createVariable('Latitude', 'f4', 'lat') + result = ds.createVariable('Result', 'f4', ('lat', 'lon')) + + longitude[:] = x + latitude[:] = y + result[:, :] = data + + # Provide global information in output-file + if crs is not None: + longitude.units = crs + latitude.units = crs + if data_unit is not None: + result.units = data_unit + ds.missing_value = missing_value + ds.close() + + +def generate_3dncfile(outfile, x, y, data, dim, features, crs='wgs84', + data_unit=None, missing_value=settings.no_value): + + """ + Save 3D dataset as netCDF4 file, e.g. data cube + + Input: + Outfile: path to store the netcdf file, string + x: longitude vector, list + y: latitude vector, list + dim: number of 2D datasets, integer + data: 2D data array + features: contained features in prediction dataset, list of chars + crs: coordinate reference system, string + data_unit: data unit, string + missing_value: no data value, integer or float + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + ds.createDimension('lon', len(x)) + ds.createDimension('lat', len(y)) + ds.createDimension('dim', dim) + ds.createDimension('feat', len(features)) + longitude = ds.createVariable('Longitude', 'f4', 'lon') + latitude = ds.createVariable('Latitude', 'f4', 'lat') + result = ds.createVariable('Result', 'f4', ('lat', 'lon', 'dim')) + Features = ds.createVariable('features', 'S1', 'feat') + + longitude[:] = x + latitude[:] = y + result[:, :, :] = data + Features[:] = features + + # Provide global information in output-file + if crs is not None: + longitude.units = crs + latitude.units = crs + if data_unit is not None: + result.units = data_unit + ds.missing_value = missing_value + ds.close() + + +def generate_2dncfile(outfile, x, y, data, features, crs='wgs84', + data_unit=None, missing_value=settings.no_value): + + """ + Save 2D dataset as netCDF4 file, e.g. 
Prediction dataset + + Input: + Outfile: path to store the netcdf file, string + x: longitude vector, list + y: latitude vector, list + data: 2D data array + features: contained features in prediction dataset, list of chars + crs: coordinate reference system, string + data_unit: data unit, string + missing_value: no data value, integer or float + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + ds.createDimension('lon', len(x)) + ds.createDimension('lat', len(y)) + ds.createDimension('feat', len(features)) + longitude = ds.createVariable('Longitude', 'f4', 'lon') + latitude = ds.createVariable('Latitude', 'f4', 'lat') + result = ds.createVariable('Result', 'f4', ('lat', 'lon')) + Features = ds.createVariable('features', 'S1', 'feat') + + longitude[:] = x + latitude[:] = y + result[:, :] = data + Features[:] = features + + # Provide global information in output-file + if crs is not None: + longitude.units = crs + latitude.units = crs + if data_unit is not None: + result.units = data_unit + ds.missing_value = missing_value + ds.close() diff --git a/src/gui_version/utilities/properties_user_input.csv b/src/gui_version/utilities/properties_user_input.csv new file mode 100644 index 0000000000000000000000000000000000000000..80f4fefdc4f9b10559fc53344533c68bf772a46f --- /dev/null +++ b/src/gui_version/utilities/properties_user_input.csv @@ -0,0 +1,48 @@ +key,type,range,extension,path +ls_path,str,None,csv,1 +nonls_path,str,None,nc,1 +train_path,str,None,csv,1 +geo_path,str,None,csv,1 +feat_path,str,None,csv,1 +x,str,None,None,0 +y,str,None,None,0 +id,str,None,None,0 +x_nonls,str,None,None,0 +y_nonls,str,None,None,0 +num_nonls,"int,float",None,None,0 +from_scratch,"int,bool",None,"0,1",0 +delete,"int,bool",None,"0,1",0 +add,"int,bool",None,"0,1",0 +cluster,"int,bool",None,"0,1",0 +data_to_handle,str,None,None,0 +preprocess,str,None,None,0 +no_interpolation,"int,bool",None,"0,1",0 +interpolation,"int,bool",None,"0,1",0 +resolution,int,"1,inf",None,0 +random_seed,int,"1,inf",None,0 +crs,str,None,None,0 +no_value,int,None,None,0 +train,bool,None,None,0 +pred,bool,None,None,0 +map,bool,None,None,0 +pred_path,str,None,nc,1 +east,"int,float","-180,180",None,0 +west,"int,float","-180,180",None,0 +north,"int,float","-90,90",None,0 +south,"int,float","-90,90",None,0 +model_path,str,None,None,1 +drop_train,str,None,None,0 +drop_pred,str,None,None,0 +model_to_load,str,None,None,0 +model_to_save,str,None,None,0 +num_trees,int,"1,inf",None,0 +size_val,"int,float","0,1",None,0 +depth_trees,int,"1,inf",None,0 +name_label,str,None,None,0 +criterion,str,None,None,0 +training,"int,bool",None,None,0 +prediction,"int,bool",None,None,0 +parallel,"int,bool",None,None,0 +keep,"int,bool","0,1",None,0 +remove_instances,"int,bool","0,1",None,0 +ohe,"int,bool","0,1",None,0 \ No newline at end of file diff --git a/src/gui_version/utilities/strings_for_ncfile.py b/src/gui_version/utilities/strings_for_ncfile.py new file mode 100644 index 0000000000000000000000000000000000000000..783ed4d9f157ad84b873ef5a5dd8b185b35b2c22 --- /dev/null +++ b/src/gui_version/utilities/strings_for_ncfile.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +def features_to_char(feat): + + """ + Turn list of features to chars so it can be stored in the nc-file. 
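For orientation, the 2D layout written by generate_ncfile above (dimensions lon/lat, variables Longitude, Latitude and Result, plus a missing_value attribute) can be produced and read back with netCDF4 alone. A small sketch with an arbitrary file name and toy data:

# Self-contained sketch of the Longitude/Latitude/Result layout produced by the
# helpers above, written and read back with netCDF4 only (file name is arbitrary).
import numpy as np
import netCDF4 as nc

x = np.linspace(10.0, 11.0, 5)          # longitudes
y = np.linspace(47.0, 46.0, 4)          # latitudes, descending as in import_nc
data = np.random.rand(len(y), len(x))

ds = nc.Dataset('example.nc', 'w', format='NETCDF4')
ds.createDimension('lon', len(x))
ds.createDimension('lat', len(y))
ds.createVariable('Longitude', 'f4', 'lon')[:] = x
ds.createVariable('Latitude', 'f4', 'lat')[:] = y
ds.createVariable('Result', 'f4', ('lat', 'lon'))[:, :] = data
ds.missing_value = -999
ds.close()

ds = nc.Dataset('example.nc')
result = ds['Result'][:, :].data         # same access pattern used elsewhere in this diff
print(result.shape)                      # (4, 5)
ds.close()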
+ """ + + char_features = [] + for feature in feat: + for letter in feature: + char_features.append(letter) + char_features.append('/') + char_features = char_features[0:-1] + + return char_features + + +def char_to_string(features): + + """ + Input: + features: list of features as chars + + Return: + features as strings + + Turns list of chars into strings providing information on + contained features in nc-file. Feature names have to be separated + by '/'. + """ + + features_decode = [] + for feature in features: + + features_decode.append(feature.decode('UTF-8')) + + tmp = ''.join(features_decode) + + return tmp.split('/') diff --git a/src/plain_scripts/.gitignore b/src/plain_scripts/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/plain_scripts/RandomForest.py b/src/plain_scripts/RandomForest.py new file mode 100644 index 0000000000000000000000000000000000000000..6e2e6880bd40dea4a86b96bd088dc5dadcd51d84 --- /dev/null +++ b/src/plain_scripts/RandomForest.py @@ -0,0 +1,564 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import netCDF4 as nc +import pickle as pkl +import os +import pickle + +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, fbeta_score +from joblib import Parallel, delayed + +import settings +from utilities.ncfile_generation import generate_ncfile +from utilities.strings_for_ncfile import char_to_string + + +class prepare_data: + + """ + This class prepares the data to be + used in the Random Forest classifier. + """ + + def __init__(self, aim, logger): + + invalid = False + self.aim = aim + self.logger = logger + if aim == 'train_test': + print('Train the model') + invalid = False + elif aim == 'prediction': + print('Prepare the hazard map') + invalid = False + else: + print('Not a valid command. Enter train_test or prediction.') + invalid = True + + if not invalid: + self.test_size = settings.size # Size of validation dataset + # Column name of label in training dataset + self.label_name = 'label' + self.xy = pd.DataFrame() # Array to save coordinates for reshaping + + if aim == 'train_test': + self.import_features_labels() # Prepare the training + # Generate train/validation dataset + self.split_training_testing() + elif aim == 'prediction': + self.import_features() # Import prediction dataset + + def import_features(self): + + """ + Imports the features for prediction. 
+ """ + + # Import prediction dataset either as csv file or nc file + if settings.path_pred.split('.')[-1] == 'csv': + self.features = pd.read_csv(settings.path_pred) + + elif settings.path_pred.split('.')[-1] == 'nc': + ds = nc.Dataset(settings.path_pred) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.feature_list = char_to_string(pred_features) + + if 'xcoord' in self.feature_list and 'ycoord' in self.feature_list: + self.features = pd.DataFrame(pred, columns=self.feature_list) + else: + self.features = pd.DataFrame(pred, columns=['xcoord', 'ycoord']+self.feature_list) + + self.dropped = ds['Dropped'][:].data + self.dropped = [int(x) for x in self.dropped] + + # Save the prediction coordinates in the prediction dataset + self.xy['ycoord'] = self.features['ycoord'] + self.xy['xcoord'] = self.features['xcoord'] + + # Remove all features that shall not be included in + # prediction from DataFrame (see settings!) + if len(settings.not_included_pred_data) > 0: + for dataset in settings.not_included_pred_data: + self.features = self.features.drop(dataset, axis=1) + + # Determine which classes are contained in the categorical features + # It is distinguished between one-hot and ordinal encoded features + self.categorical_classes = {} + cat_subset = [feat for feat in self.features.columns.tolist() if '_encode' in feat] + df_sub = self.features[cat_subset] + cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()] + for feat in list(set(cat_feat)): + classes = [] + if cat_feat.count(feat)>1: + classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) + else: + classes = list(set(df_sub[feat + '_encode'].tolist())) + self.categorical_classes[feat] = {} + self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] + self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) + + self.feature_list = list(self.features.columns) + self.features_org = self.features.copy() + self.logger.info('Features for prediction were imported') + self.logger.info('The following ' + + str(len(self.feature_list)) + + ' features are included in the prediction dataset: ' + + str(self.feature_list)) + + def import_features_labels(self): + + """ + Imports the features for training. + """ + + # Import training dataset as csv file + self.features = pd.read_csv(settings.path_train) + # Extract and remove labels from training dataset + self.labels = np.array(self.features[self.label_name]).reshape( + [np.shape(self.features[self.label_name])[0], 1]) + self.features = self.features.drop(self.label_name, axis=1) + + # Store coordinates from training data + self.xy['ycoord'] = self.features['ycoord'] + self.xy['xcoord'] = self.features['xcoord'] + + # Drop ID from training data + self.features = self.features.drop('ID', axis=1) + self.features = self.features.drop(['xcoord', 'ycoord'], axis=1) + + # Remove all features that shall not be included in + # training from DataFrame (see settings!) 
+ if len(settings.not_included_train_data) > 0: + for dataset in settings.not_included_train_data: + self.features = self.features.drop(dataset, axis=1) + + # Determine which classes are contained in the categorical features + # It is distinguished between one-hot and ordinal encoded features + self.categorical_classes = {} + cat_subset = [feat for feat in self.features.columns.tolist() if '_encode' in feat] + df_sub = self.features[cat_subset] + cat_feat = ['_'.join(col.split('_')[:-2]) for col in df_sub.columns.tolist()] + for feat in list(set(cat_feat)): + classes = [] + if cat_feat.count(feat)>1: + classes.append([f.split('_')[-2] for f in df_sub.columns.tolist() if feat in f]) + else: + classes = list(set(df_sub[feat + '_encode'].tolist())) + self.categorical_classes[feat] = {} + self.categorical_classes[feat]['classes'] = [item for sublist in classes for item in sublist] + self.categorical_classes[feat]['num_cols'] = cat_feat.count(feat) + + + self.feature_list = list(self.features.columns) + self.logger.info('Features for training were imported') + self.logger.info('The following ' + str(len(self.feature_list)) + + ' features are included in the training dataset: ' + + str(self.feature_list)) + self.features = np.array(self.features) + + def split_training_testing(self): + + """ + Splits the training data into training and validation data. + """ + + self.train_features, self.test_features, self.train_labels, self.test_labels = \ + train_test_split(self.features, + self.labels, + test_size=self.test_size, + random_state=settings.random_seed, + stratify=self.labels) + print('Data split') + self.logger.info('Training data split in training and test dataset') + + +class RandomForest(prepare_data): + + def __init__(self, aim, parallel=False, log=None): + + super().__init__(aim, log) + + self.aim = aim + self.parallel = parallel + self.logger = log + self.num_chunks = 10 + # Random Forest settings + self.criterion = settings.criterion + self.n_estimators = settings.num_trees + self.max_depth = settings.depth + + if aim == 'prediction': + self.model_dir = settings.model_database_dir + self.model_to_load = settings.model_to_load + else: + self.model_dir = settings.model_database_dir + self.output_dir = None + + if aim == 'train_test': + print('Model is trained') + self.define() + self.train() + self.predict() + self.evaluate() + self.create_output_dir() + self.save_model() + self.save_parameters() + self.feature_importance() + + elif aim == 'prediction': + print('Prediction is performed') + self.create_output_dir() + self.load_model() + if not self.error: + self.predict() + self.extract_pos_neg_predictions() + self.reshape_prediction() + self.save_prediction() + + def define(self): + + """ + Define the Random Forest Classifier model. + """ + + self.model = RandomForestClassifier(n_estimators=self.n_estimators, + max_depth=self.max_depth, + random_state=settings.random_seed) + self.logger.info('Model is defined') + + def train(self): + + """ + Train the Random Forest Classifier model. + """ + + self.model.fit(self.train_features, np.ravel(self.train_labels)) + self.logger.info('Model is trained') + + def predict(self): + + """ + Make the prediction. 
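The categorical-class bookkeeping in import_features and import_features_labels above rests on the column-naming convention '<feature>_<class>_encode' for one-hot encoded columns. A standalone sketch of how that convention is parsed, using hypothetical column names:

# Standalone sketch (hypothetical column names) of parsing the
# '<feature>_<class>_encode' convention used by the class-detection blocks above.
import pandas as pd

df = pd.DataFrame(columns=['slope', 'lithology_1_encode',
                           'lithology_2_encode', 'lithology_5_encode'])

encode_cols = [c for c in df.columns if c.endswith('_encode')]
cat_feat = ['_'.join(c.split('_')[:-2]) for c in encode_cols]

categorical_classes = {}
for feat in set(cat_feat):
    classes = [c.split('_')[-2] for c in encode_cols
               if '_'.join(c.split('_')[:-2]) == feat]
    categorical_classes[feat] = {'classes': classes,
                                 'num_cols': cat_feat.count(feat)}

print(categorical_classes)
# {'lithology': {'classes': ['1', '2', '5'], 'num_cols': 3}}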
+ """ + + print('Predicting...') + self.logger.info('Predicting...') + if self.aim == 'prediction': + pred = self.features + elif self.aim == 'train_test': + pred = self.test_features + + if self.parallel: + self.split_array_into_chunks() + prediction = Parallel(n_jobs=10)( + delayed(self.model.predict)(chunk) for chunk in self.chunks) + self.prediction = np.concatenate(prediction, axis=0) + else: + self.prediction = self.model.predict(pred) + + def split_array_into_chunks(self): + + """ + Split a NumPy array into chunks without changing the number of columns. + + """ + self.logger.info('Prediction dataset is split in chunks') + # Calculate the number of rows in each chunk + rows_per_chunk = self.features.shape[0] // self.num_chunks + remaining_rows = self.features.shape[0] % self.num_chunks + + # Create chunks + self.chunks = [] + start = 0 + for i in range(self.num_chunks): + end = start + rows_per_chunk + (1 if i < remaining_rows else 0) + chunk = self.features[start:end, :] + self.chunks.append(chunk) + start = end + + def evaluate(self): + + """ + Evaluate the validation dataset. + """ + + self.logger.info('Model is evaluated') + y_pred_prob = self.model.predict_proba(self.test_features)[:, 1] + self.fpr, self.tpr, self.thresholds = roc_curve(self.test_labels, y_pred_prob) + + # Calculate AUC (Area Under the Curve) + self.roc_auc = auc(self.fpr, self.tpr) + + diff = [abs(pred-test_labels) for pred, test_labels + in zip(list(self.prediction), list(self.test_labels))] + self.acc = str(diff.count(1)) + '/' + str(len(diff)) + self.mae = round(np.mean(diff), 2) + print('Mean absolute error: ' + str(self.mae)) + print('Wrongly predicted: ' + + str(np.count_nonzero(diff)) + + '/' + str(len(diff))) + self.mse = mean_squared_error(self.test_labels, self.prediction) + self.f1 = f1_score(self.test_labels, self.prediction) + self.fbeta = fbeta_score(self.test_labels, self.prediction, beta=2) + print('Mean squared error: ' + str(self.mse)) + + def create_output_dir(self): + + """ + Define and create the output directory. + """ + + self.output_dir = self.model_dir + settings.model_to_save + + if not os.path.isdir(self.output_dir): + os.mkdir(self.output_dir) + + def save_model(self): + + """ + Save the Random Forest Classifier model. + """ + + with open(self.output_dir + 'saved_model.pkl', 'wb') as file: + pkl.dump(self.model, file) + self.logger.info('Model is saved') + + def save_parameters(self): + + """ + Save the metadata associated with the prediction. + """ + + params = {'Area': settings.bounding_box, + 'criterion': [self.criterion], + 'n_estimators': [self.n_estimators], + 'max_depth': [self.max_depth], + 'features': self.feature_list, + 'mse': self.mse, + 'mae': self.mae, + 'f1': self.f1, + 'roc_threshold': self.thresholds, + 'roc_fpr': self.fpr, + 'roc_tpr': self.tpr, + 'roc_auc': self.roc_auc, + 'accuracy': self.acc, + 'fbeta': self.fbeta, + 'categories': self.categorical_classes + } + + with open(settings.model_database_dir + + settings.model_to_save + + 'model_params.pkl', 'wb') as file: + pkl.dump(params, file) + + self.logger.info('Parameters are saved') + + def adapt_categorical_features(self, train_classes): + + """ + The encoded features in the training and prediction dataset are + compared regarding the contained classes. Depending on the user + input, instances in the prediction dataset with classes that are + not included in the training dataset are either set to no_value or + nevertheless considered in the prediction. 
The surplus additional + features are removed either way to achieve the same set of features + as in the training dataset + """ + + self.instances_to_drop = [] + for feat in list(train_classes.keys()): + if feat not in list(self.categorical_classes.keys()): + print('Categorical feature ' + feat + ' not in prediction dataset') + print('Error: cannot proceed with mapping') + self.logger.error('Error: Categorical feature ' + feat + ' not in prediction dataset') + self.error = True + else: + if (train_classes[feat]['num_cols'] < self.categorical_classes[feat]['num_cols']) or (set(train_classes[feat]['classes']) != set(self.categorical_classes[feat]['classes'])): + print(feat + ': Prediction dataset contains more or other classes than training dataset') + print('Apply user defined handling approach') + self.logger.warning(feat + ': Prediction dataset contains more classes than training dataset') + self.logger.info('Apply user defined handling approach') + common_elements = set(train_classes[feat]['classes']).intersection(set(self.categorical_classes[feat]['classes'])) + if settings.keep_cat_features: + if len(common_elements) == 0: + print('Error: no common classes for ' + feat + ' in training and prediction dataset') + self.logger.error('Error: no common classes for ' + feat + ' in training and prediction dataset') + self.error = True + else: + to_drop = [feat + '_' + f + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] + self.features = self.features.drop(to_drop, axis=1) + self.feature_list = self.features.columns.tolist() + elif settings.remove_instances: + to_drop_col = [feat + '_' + f + '_encode' for f in self.categorical_classes[feat]['classes'] if f not in common_elements] + to_drop_row = [] + for col in to_drop_col: + to_drop_row = to_drop_row + self.features.index[self.features[col] == 1].tolist() + self.features = self.features.drop(to_drop_col, axis=1) + print('Not matching features have been removed') + self.logger.info('Not matching features have been removed') + self.feature_list = self.features.columns.tolist() + self.instances_to_drop = self.instances_to_drop + to_drop_row + print('Instances to consider during mapping have been adapted') + self.logger.info('Instances to consider during mapping have been adapted') + print('Categorical features have been handled and hamonised') + self.logger.info('Categorical features have been handled and hamonised') + self.logger.info('Remaining features: ' + str(self.feature_list)) + + + def load_model(self): + + """ + Load the Random Forest Classifier model and the metadata. + Make sure to compare features of training and prediction dataset + as well as their order. 
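predict() and split_array_into_chunks() above parallelise inference by slicing the feature matrix row-wise and dispatching the slices with joblib. A reduced sketch of the same idea; the classifier and data here are placeholders, not the trained SHIRE model, and numpy's array_split is used as a shortcut for the manual chunking shown above:

# Reduced sketch of chunked, parallel prediction with joblib (placeholder model/data).
import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(42)
X_train, y_train = rng.random((200, 5)), rng.integers(0, 2, 200)
X_pred = rng.random((1000, 5))

model = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, y_train)

num_chunks = 10
chunks = np.array_split(X_pred, num_chunks, axis=0)   # near-equal row slices
parts = Parallel(n_jobs=4)(delayed(model.predict)(c) for c in chunks)
prediction = np.concatenate(parts, axis=0)
print(prediction.shape)                               # (1000,)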
+ """ + + print('Loading model ' + self.model_dir + + self.model_to_load + 'saved_model.pkl') + self.logger.info('Loading model ' + self.model_dir + + self.model_to_load + 'saved_model.pkl') + with open(self.model_dir + + self.model_to_load + 'saved_model.pkl', 'rb') as file: + self.model = pkl.load(file) + + with open(settings.model_database_dir + + settings.model_to_save + + 'model_params.pkl', 'rb') as f: + params = pkl.load(f) + features = params['features'] + self.error = False + self.adapt_categorical_features(params['categories']) + + if not self.error: + if len(self.feature_list) == len(features): + if set(self.feature_list) != set(features): + print('Error: Not all features of the model are contained in the prediction dataset') + self.logger.error('Error: Not all features of the model are contained in the prediction dataset') + self.error = True + elif self.feature_list != features: + print('The order or features differs. Prediction features are reordered') + self.logger.info('The order or features differs. Prediction features are reordered') + self.features = self.features[features] + #print(self.features_org.columns.tolist(), features) + if self.features.columns.tolist() != features: + print('There is still something wrong with the order of the features!') + elif self.feature_list == features: + print('Prediction and training dataset have the same order') + self.logger.info('Prediction and training dataset have the same order') + elif len(self.feature_list) < len(features): + print('Error: Not all features of the model are contained in the prediction dataset') + self.logger.error('Error: Not all features of the model are contained in the prediction dataset') + self.error = True + elif len(self.feature_list) > len(features): + if set(features).issubset(self.feature_list): + to_drop = list(set(self.feature_list)-set(features)) + self.features_org = self.features_org.drop(to_drop, axis=1) + self.features_org = self.features_org[features] + if self.features_org.columns.tolist() != features: + print('There is still something wrong with the order of the features!') + else: + self.features = self.features_org.to_numpy() + self.feature_list = self.features_org.columns.tolist() + print('Features in the prediction dataset which were not used for training were removed') + print('Features in the prediction dataset were sorted to match the training features') + self.logger.warning('Features in the prediction dataset which were not used for training were removed') + self.logger.info('Features left: ' + str(self.feature_list)) + self.logger.info('Features in the prediction dataset were sorted to match the training features') + else: + print('Error: Not all features of the model are contained in the prediction dataset') + self.logger.error('Error: Not all features of the model are contained in the prediction dataset') + self.error = True + if not self.error: + self.feature_list = self.features.columns.tolist() + self.features = self.features.to_numpy() + + def save_prediction(self): + + """ + Save the prediction. 
+ """ + + if self.aim == 'prediction': + output_dir = self.model_dir + self.model_to_load + + self.xy.to_csv(output_dir + 'prediction_results.csv', + columns=['xcoord', 'ycoord', 'pred'], + index=True) + self.df_pos.to_csv(output_dir + 'pos_prediction_results.csv', + columns=['xcoord', 'ycoord', 'pred'], + index=True) + self.df_neg.to_csv(output_dir + 'neg_prediction_results.csv', + columns=['xcoord', 'ycoord', 'pred'], + index=True) + print('Predictions saved in ' + output_dir) + self.logger.info('Prediction saved in ' + output_dir) + + def reshape_prediction(self): + + """ + Reshape the individual predictions into a map. + """ + + dropped = list(set(self.dropped + self.instances_to_drop)) + arr_xy = np.array(self.xy) + arr_xy[dropped, :] = settings.no_value#*np.shape(arr_xy)[1] + + result = np.reshape(list(arr_xy[:, 2]), + (len(list(set(self.xy['ycoord']))), + len(list(set(self.xy['xcoord']))))) + self.logger.info('Prediction is reshaped into the final map') + + self.save_prediction_as_nc(result) + + def extract_pos_neg_predictions(self): + + """ + Distinguish between the classes of the Classifier. + """ + + print('Extract pos and neg predictions...') + self.logger.info('Extract positive and negative predictions...') + self.xy['pred'] = self.prediction + self.df_pos = self.xy[self.xy.pred == 1] + self.df_neg = self.xy[self.xy.pred == 0] + + def save_prediction_as_nc(self, prediction): + + """ + Save the hazard map to a netCDF4 file. + """ + + print('Saving as nc-File') + outfile_name = self.model_dir + self.model_to_load + 'prediction.nc' + + if os.path.exists(outfile_name): + os.remove(outfile_name) + + generate_ncfile(outfile_name, + np.array(list(set(self.xy['xcoord']))), + np.array(list(set(self.xy['ycoord']))), + prediction, + crs=settings.crs, + missing_value=settings.no_value) + self.logger.info('Map is saved as nc-file') + + def feature_importance(self): + + """ + Access feature importance information from the Random Forest. 
+ """ + + feature_imp = pd.Series(self.model.feature_importances_, + index=self.feature_list).sort_values( + ascending=False) + + feature_imp.to_csv(self.model_dir + + settings.model_to_load + + 'feature_importance.csv') + self.logger.info('Feature importance is saved') diff --git a/src/plain_scripts/check_user_input.py b/src/plain_scripts/check_user_input.py new file mode 100644 index 0000000000000000000000000000000000000000..470dbe0f7eb1202c33812422f3249a3690c7d861 --- /dev/null +++ b/src/plain_scripts/check_user_input.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import pandas as pd + +from settings import * +from utilities.initialise_log import save_log + + +class check_general_settings(): + + def __init__(self): + + if training_dataset or map_generation: + save_path = os.path.dirname(path_train) + '/check_user_input.log' + elif prediction_dataset: + save_path = os.path.dirname(path_pred) + '/check_user_input.log' + else: + save_path = 'check_user_input.log' + + if os.path.exists(save_path): + os.remove(save_path) + + self.logger = save_log(save_path) + self.logger.info("Start checking user input") + + self.error = False + + self.set_up_dic() + self.check_bools() + self.check_list() + self.check_int() + self.check_int_float() + self.check_string() + self.check_path() + self.check_bb() + + self.check_if_right_params_are_set() + self.check_extension() + self.check_path_extension_geosummary() + + for handler in self.logger.handlers: + handler.close() + self.logger.removeHandler(handler) + + def check_if_right_params_are_set(self): + + if training_dataset is None and prediction_dataset is None and map_generation is None: + self.logger.error('Specify a purpose of the run! Set either training_dataset, prediction_dataset and/or map_generation') + self.error = True + + if None in [crs, no_value, random_seed, resolution]: + self.logger.error('Set the general settings crs, no_value, random_seed and resolution!') + self.error = True + + if training_dataset: + if train_from_scratch is None and train_delete is None: + self.logger.error('Speciy whether you want to generate training dataset from scratch or add/remove feature(s)') + self.error = True + else: + if None in [cluster, interpolation, data_summary_path, key_to_include_path, path_train, path_landslide_database, ID, landslide_database_x, landslide_database_y, path_nonls_locations, num_nonls, nonls_database_x, nonls_database_y]: + self.logger.error('Speciy all necessary parameters for training dataset generation!') + self.error = True + + if prediction_dataset: + if pred_from_scratch is None and pred_delete is None: + self.logger.error('Speciy whether you want to generate prediction dataset from scratch or add/remove feature(s)') + self.error = True + else: + if None in [data_summary_path, key_to_include_path, bounding_box, path_pred]: + self.logger.error('Speciy all necessary parameters for prediction dataset generation!') + self.error = True + + if map_generation: + if None in [path_ml, size, not_included_pred_data, not_included_train_data, num_trees, criterion, depth, model_to_save, model_to_load, model_database_dir, parallel]: + self.logger.error('Speciy all necessary parameters for map generation!') + self.error = True + + def set_up_dic(self): + + self.dic = {} + self.dic['bool'] = [training_dataset, train_from_scratch, train_delete, prediction_dataset, pred_from_scratch, pred_delete, map_generation, parallel] + self.dic['path'] = [path_ml, data_summary_path, key_to_include_path, path_train, 
path_landslide_database, path_nonls_locations, path_pred, model_database_dir] + self.dic['str'] = [crs, ID, landslide_database_x, landslide_database_y, nonls_database_x, nonls_database_y, criterion, model_to_save, model_to_load] + self.dic['int'] = [resolution, random_seed, num_nonls, num_trees, depth] + self.dic['int_float'] = [size, no_value] + self.dic['list'] = [bounding_box, not_included_pred_data, not_included_train_data] + + self.dic_steps = {} + self.dic_steps['general'] = [] + self.dic_steps['run_purpose'] = [training_dataset, prediction_dataset, map_generation] + + def check_extension(self): + for path in [data_summary_path, key_to_include_path, path_landslide_database, path_train]: + if path is not None: + if len(path.split('.')) != 2: + self.logger.error(path + ': Paths must not contain full stops!') + self.error = True + else: + if path.split('.')[1] != 'csv': + self.logger.error(path + ': wrong file format! Needs to be csv') + self.error = True + + for path in [path_pred, path_nonls_locations]: + if path is not None: + if len(path.split('.')) != 2: + self.logger.error(path + ': Paths must not contain full stops!') + self.error = True + else: + if path.split('.')[1] != 'nc': + self.logger.error(path + ': wrong file format! Needs to be nc') + self.error = True + + def check_bools(self): + self.logger.info("Start checking bools") + for key in self.dic['bool']: + if key is not None: + if type(key) is not bool: + self.logger.info(key + ': not a bool') + self.error = True + + def check_list(self): + self.logger.info("Start checking list") + for key in self.dic['list']: + if key is not None: + if type(key) is not list: + self.logger.info(key + ': not a list') + self.error = True + + def check_int(self): + self.logger.info("Start checking integers") + for key in self.dic['int']: + if key is not None: + if type(key) is not int: + self.logger.info(key + ': not an integer') + self.error = True + + def check_int_float(self): + self.logger.info("Start checking integers and floats") + for key in self.dic['int_float']: + if key is not None: + if type(key) is not int and type(key) is not float: + self.logger.info(key + ': not an integer or float') + self.error = True + + def check_string(self): + self.logger.info("Start checking strings") + for key in self.dic['str']: + if key is not None: + if type(key) is not str: + self.logger.info(key + ': not a string') + self.error = True + + def check_path(self): + self.logger.info("Start checking paths") + for key in self.dic['path']: + if key is not None: + if type(key) is not str: + self.logger.info(key + ': path is not a string') + self.error = True + else: + if key == path_train and training_dataset is True: + pass + elif key == path_pred and prediction_dataset is True: + pass + else: + if not os.path.exists(key): + self.logger.error(key + ': path could not be found!') + self.error = True + + def check_bb(self): + + if bounding_box is not None: + if bounding_box[1] >= bounding_box[0]: + self.logger.error('Careful! South coordinate north of north coordinate!') + self.error = True + + if bounding_box[2] >= bounding_box[3]: + if (((bounding_box[2] < 0 and bounding_box[2] > -10) and (bounding_box[3] > 0 and bounding_box[3] < 10)) + or ((bounding_box[2] > 0 and bounding_box[2] > 170) and (bounding_box[3] < 0 and bounding_box[3] < -170))): + self.logger.warning('Careful! Please check east and west coordinates!') + else: + self.logger.error('Careful! 
West coordinate east of east coordinate!') + + def check_path_extension_geosummary(self): + + self.logger.info('Start checking paths in geospatial data summary') + if data_summary_path is not None and key_to_include_path is not None: + if os.path.exists(data_summary_path) and os.path.exists(key_to_include_path): + if data_summary_path.split('.')[1] != 'csv' and key_to_include_path.split('.')[1] != 'csv': + summary = pd.read_csv(data_summary_path) + keys_to_include = pd.read_csv(key_to_include_path) + for key in list(keys_to_include['keys_to_include']): + idx = list(summary['keys']).index(key) + + if summary.at[idx, 'path'].split('.')[1] not in ['nc', 'tif', 'tiff']: + self.logger.error(key + ': Wrong file format!') + self.error = True + + if not os.path.exists(summary.at[idx, 'path']): + self.logger.error(key + ': File cannot be found!') + self.error = True diff --git a/src/plain_scripts/create_prediction_data.py b/src/plain_scripts/create_prediction_data.py new file mode 100644 index 0000000000000000000000000000000000000000..820adaebec8bba17489da985a436ea17ba4b626b --- /dev/null +++ b/src/plain_scripts/create_prediction_data.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import netCDF4 as nc +import os +import pickle + +from tqdm import tqdm + +import settings +from data_preprocessing import generate_data_matrix +from utilities.ncfile_generation import generate_basic_ncfile +from utilities.strings_for_ncfile import char_to_string, features_to_char +from utilities.handle_categorical_values import handle_categorical_values + + +class create_prediction_data: + + """ + This class creates the prediction data + for the Random Forest classifier. + + Input: + from_scratch: boolean, True if prediction dataset should + be generated from scratch, otherwise false + delete: True if dataset/feature should be deleted + from prediction dataset + False if dataset should be added to existing + prediction dataset + (careful: from_scratch needs to be False!) 
+ + Output: + netCDF4 file + """ + + def __init__(self, from_scratch=True, delete=False, log=None): + + self.from_scratch = from_scratch + self.delete = delete + self.bb = settings.bounding_box + self.logger = log + + if self.from_scratch: + self.logger.info('Prediction dataset is generated from scratch') + self.s = generate_data_matrix(from_scratch=True, + delete=False, + dataset='prediction', + bb=self.bb) + self.import_cube() # Import data cube + if not self.no_dataset_found: + # Add coordinate information to prediction + # dataset for map generation + self.add_coordinates() + # Flatten data cube for efficient information extraction + self.flatten_cube() + # Determine no value instances in DataFrame + self.clean_df() + self.import_data_summary() + self.handle_categorical_features() + # Save prediction dataset + self.save_as_nc() + + elif not self.from_scratch and not self.delete: + self.logger.info( + 'Feature(s) will be added to the prediction dataset') + # Import information on the geospatial datasets + self.import_data_summary() + # Import existing prediction dataset + self.import_prediction_dataset() + # Import data cube that contains cut and + # interpolate dataset to be added + self.import_cube() + + # Check if datasets to be added are contained in the data cube + not_included = False + for feature in self.data_to_handle: + if feature not in self.features: + print(str(feature) + + ' not included in cube, it has to be added first') + not_included = True + + if not_included: + print('not included') + self.s = generate_data_matrix( + from_scratch=False, + delete=False, + dataset='prediction', + bb=self.bb, + data_to_handle=self.data_to_handle, + keys_already_included=self.features) + self.import_cube() + + self.add_feature() # Add feature + # Save prediction dataset + self.clean_df() + self.handle_categorical_features(var=self.data_to_handle) + self.save_as_nc() + + elif not self.from_scratch and self.delete: + + self.logger.info( + 'Feature(s) will be removed from the prediction dataset') + # Import existing prediction dataset + self.import_data_summary() + self.import_prediction_dataset() + self.delete_features() # Delete features from prediction dataset + # Save prediction dataset + self.save_as_nc() + + def import_data_summary(self): + """ + Import the information on the geospatial datasets and the + keys to include in the prediction dataset + """ + + self.data_properties = pd.read_csv(settings.data_summary_path) + self.keys_to_include = pd.read_csv(settings.key_to_include_path)['keys_to_include'].tolist() + self.data_to_handle = self.keys_to_include + self.logger.info('Data summary in keys to include have been imported') + + def handle_categorical_features(self, var=None): + + """ + Function is called which performs one-hot or ordinal encoding + """ + + basic = ['xcoord', 'ycoord'] + self.df_pred = handle_categorical_values(self.df_pred, + self.data_properties, + settings.ohe, + basic, + var) + + to_drop = [] + for col in self.df_pred.columns.tolist(): + if str(settings.no_value) in col: + to_drop.append(col) + + self.df_pred = self.df_pred.drop(to_drop, axis=1) + + def add_feature(self): + + """ + Add feature to the prediction dataset + """ + + for count, key in enumerate(self.data_to_handle): + # Delete feature if it already exists in training dataset + if key in self.df_pred.columns: + print( + 'Feature already exists in prediction dataset.\ + Existing feature is deleted') + self.logger.info('Feature already exists in prediction dataset.\ + Existing feature is deleted') + + 
self.df_pred = self.df_pred.drop(key, axis=1) + + print('Adding ' + key + '...') + self.logger.info('Adding ' + key) + + # Create empty DataFrame + if count == 0: + self.df_features = pd.DataFrame( + index=range(len(self.df_pred)), + columns=self.data_to_handle) + + data_flat = self.cube[:, :, self.features.index(key)].flatten() + self.df_features[key] = data_flat + + # Combine old training dataset with additional features + self.df_pred = pd.concat([self.df_pred, self.df_features], axis=1) + # Adapt column order + #self.df_pred = self.df_pred[ref] + + print('Prediction dataset contains following features: ' + + str(list(self.df_pred.columns))) + self.logger.info('Prediction dataset contains the following features: ' + + str(list(self.df_pred.columns))) + + def import_cube(self): + + """ + Import cut and interpolated data cube + which was created in pre-processing.py + """ + self.logger.info('Import cube with interpolated datasets') + + # Path to the stored data cube (see data_preprocessing.py) + folder = settings.path_pred.rsplit('/', 1)[0] + path = folder + '/data_combined_prediction_' + str(settings.resolution) + '.nc' + + # Check if path exists and import the cube + # as well as list of datasets it contains + if not os.path.exists(path): + print('Error: Dataset not found!') + self.logger.error('Error! Cube not found!') + self.no_dataset_found = True + else: + self.no_dataset_found = False + ds = nc.Dataset(path) + self.cube = ds['Result'][:, :, :].data + self.x = ds['Longitude'][:].data + self.y = ds['Latitude'][:].data + self.pred_features = ds['features'][:].data + self.features = char_to_string(self.pred_features) + + print('Features included in cube: ' + str(self.features)) + self.logger.info('Features included in cube ' + str(self.features)) + + def flatten_cube(self): + + """ + Flatten the individual datasets of the data cube + """ + + print('Flatten cube...') + self.logger.info('Flatten cube...') + # Go through all datasets in the data cube + for i in tqdm(range(np.shape(self.cube)[2])): + data = self.cube[:, :, i] + data_flat = data.flatten() # Flatten the dataset + # Save it to the DataFrame + self.df_pred[self.features[i]] = data_flat + + def add_coordinates(self): + + """ + Add coordinate for which the model shall + make an prediction to the DataFrame. 
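flatten_cube() above, together with add_coordinates() defined next, turns the interpolated data cube into one DataFrame row per pixel: coordinates from a meshgrid plus one column per feature. A self-contained sketch with a toy cube and arbitrary feature names:

# Self-contained sketch (toy cube, arbitrary feature names) of converting a
# (lat, lon, feature) cube into one DataFrame row per pixel.
import numpy as np
import pandas as pd

x = np.array([10.0, 10.1, 10.2])
y = np.array([47.2, 47.1])
features = ['slope', 'aspect']
cube = np.random.rand(len(y), len(x), len(features))

X, Y = np.meshgrid(x, y)
df_pred = pd.DataFrame({'xcoord': X.flatten(), 'ycoord': Y.flatten()})
for i, feat in enumerate(features):
    df_pred[feat] = cube[:, :, i].flatten()

print(df_pred.shape)        # (6, 4): one row per pixel, coordinates plus features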
+ """ + + print('Add coordinates...') + self.logger.info('Adding coordinates...') + self.df_pred = pd.DataFrame(columns=['xcoord', 'ycoord'] + + self.features) + self.X, self.Y = np.meshgrid(self.x, self.y) + + data_flat = self.X.flatten() + self.df_pred['xcoord'] = data_flat + data_flat = self.Y.flatten() + self.df_pred['ycoord'] = data_flat + + def clean_df(self): + + """ + Clean the DataFrame from rows with no data values + """ + + self.logger.info('Check prediction dataset for no value rows') + # Go through the DataFrame column by column and + # check for indices of no value data + ind = [] + for feature in tqdm(list(self.df_pred.columns)): + tmp = [i for i, x in + enumerate(self.df_pred[feature].tolist()) + if x == settings.no_value] + ind.append(tmp) + + # Remove duplicates + self.idx = [] + for i in range(len(ind)): + self.idx = self.idx + ind[i] + self.idx = list(set(self.idx)) + + # Save information on invalid locations so that they + # can masked out during hazard map generation + print(str(len(list(set(self.idx)))) + ' rows will be saved to be\ + handled during map generation') + self.logger.info(str(len(list(set(self.idx)))) + ' rows will be saved to be\ + handled during map generation') + + def delete_features(self): + + """ + Delete feature from prediction dataset + """ + + to_drop = [] + for feat in self.data_to_handle: + for col in self.df_pred.columns.tolist(): + if feat in col: + to_drop.append(col) + + self.df_pred.drop(columns=to_drop, inplace=True) + + print('Features now included in prediction dataset: ' + + str(list(self.df_pred.columns))) + self.logger.info('Features now included in prediction dataset: ' + + str(list(self.df_pred.columns))) + + def import_prediction_dataset(self): + + """ + Import existing prediction dataset + """ + + if 'prediction.nc' in settings.path_pred: + path = settings.path_pred + else: + path = settings.path_pred + 'prediction.nc' + + + if not os.path.exists(path): + print('Error: existing prediction dataset could not be found') + else: + ds = nc.Dataset(path) + pred = ds['Result'][:, :].data + pred_features = ds['features'][:].data + self.features = char_to_string(pred_features) + self.idx = ds['Dropped'][:].data + if 'xcoord' in self.features and 'ycoord' in self.features: + cols = self.features + else: + cols = ['xcoord', 'ycoord'] + self.features + self.df_pred = pd.DataFrame(pred, columns=cols) + + print('Features included in dataset: ' + str(self.features)) + + def save_as_nc(self): + + """ + Save prediction dataset and information on dropped rows as nc-file + """ + + df_pred = self.df_pred.to_numpy() + self.char_features = features_to_char(self.df_pred.columns.tolist()) + + if 'prediction.nc' not in settings.path_pred: + outfile = settings.path_pred + 'prediction.nc' + else: + outfile = settings.path_pred + + isExist = os.path.exists(os.path.dirname(outfile)) + if not isExist: + os.makedirs(os.path.dirname(outfile)) + + ds = generate_basic_ncfile(outfile, crs=None) + ds.createDimension('lat', (np.shape(df_pred)[0])) + ds.createDimension('lon', (np.shape(df_pred)[1])) + ds.createDimension('ix', (len(self.idx))) + ds.createDimension('feat', len(self.char_features)) + result = ds.createVariable('Result', 'f4', ('lat', 'lon')) + dropped = ds.createVariable('Dropped', 'f4', 'ix') + Features = ds.createVariable('features', 'S1', 'feat') + result[:, :] = df_pred + dropped[:] = self.idx + Features[:] = self.char_features + ds.close() diff --git a/src/plain_scripts/create_training_data.py b/src/plain_scripts/create_training_data.py new file 
mode 100644 index 0000000000000000000000000000000000000000..bef20ce71f6133b6fe392aa3d6380ec37850366c --- /dev/null +++ b/src/plain_scripts/create_training_data.py @@ -0,0 +1,897 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import netCDF4 as nc +import os +import itertools + +from tqdm import tqdm +from LatLon23 import LatLon, Latitude, Longitude +from sklearn.cluster import KMeans + +import settings +from data_preprocessing import generate_data_matrix +from utilities.import_format import import_nc, import_tif +from utilities.cut_and_interpolate import cut_and_interpolate +from utilities.strings_for_ncfile import char_to_string +from utilities.initialise_log import save_log +from utilities.import_raw_dataset import import_raw_dataset +from utilities.handle_categorical_values import handle_categorical_values + + +class create_training_data: + + """ + This class generates the training dataset + for the Random Forest Classifier. + + Input: + from_scratch: boolean, True if training dataset should be + generated from scratch, otherwise False + delete: True if dataset/feature should be deleted from csv-file + False if dataset should be added to existing csv-file + (careful: from_scratch needs to be False!) + data_to_handle: list of features that should be added/deleted + datasets need to be listed in list_of_raw_datasets + cluster: boolean, True if training locations are to be clustered + interpolation: boolean, True if datasets are supposed to be + interpolated before extracting information + preprocessing: list of length equal to data_to_handle, + preprocessing methods for the individual datasets, + can be either 'no_interpolation', + 'interpolation' or 'cluster' + log: logger to add information to shire_run.log + + Output: + csv file + """ + + def __init__(self, + from_scratch=True, + delete=True, + data_to_handle=None, + cluster=False, + interpolation=True, + preprocessing=None, + log=None): + + self.from_scratch = from_scratch + self.delete = delete + self.data_to_handle = data_to_handle + self.num_clusters = 15 + self.cluster = cluster + self.interpolation = interpolation + self.how_to_preprocess = preprocessing + self.df_train = 'not_set_yet' + self.logger = log + + self.import_data_summary() + self.data_to_handle = self.keys_to_include + + if self.from_scratch: + if cluster is True: + self.main_cluster() + elif interpolation is True: + self.main() + elif interpolation is False and cluster is False: + self.main_no_interpolation() + elif not self.from_scratch and self.delete: + # Import existing training dataset + self.import_input_training_dataset() + self.delete_feature() # Delete feature from training dataset + self.save_training_data() # Save training dataset as csv + + elif not self.from_scratch and not self.delete: + # Import existing training dataset + self.import_input_training_dataset() + self.add_feature() # Add feature to the training dataset + self.save_training_data() # Save training dataset as csv + + def import_data_summary(self): + + """ + Import the information on the geospatial datasets and the + keys to include in the training dataset + """ + + self.data_properties = pd.read_csv(settings.data_summary_path) + self.keys_to_include = pd.read_csv(settings.key_to_include_path)['keys_to_include'].tolist() + self.logger.info('Data summary in keys to include have been imported') + + def handle_categorical_features(self, var=None): + + """ + Function is called which performs one-hot or ordinal encoding + """ + + basic = ['ID', 
'xcoord', 'ycoord', 'label'] + self.df_train = handle_categorical_values(self.df_train, + self.data_properties, + settings.ohe, + basic, + var) + + def sort_into_preprocessing_groups(self): + + """ + Sort the features in data_to_handle according to their + preprocessing method into a dictionary + """ + self.logger.info('Sort datasets into preprocessing groups') + data = self.data_to_handle + + if len(data) != len(self.how_to_preprocess): + self.logger.error( + 'Not all datasets have been assigned a preprocessing method!') + print( + 'Not all datasets have been assigned a preprocessing method!') + error = True + + else: + error = False + self.preprocessing_dic = {} + no_interp, cluster, interp = [], [], [] + for i in range(len(data)): + if self.how_to_preprocess[i] == 'no_interpolation': + no_interp.append(data[i]) + elif self.how_to_preprocess[i] == 'cluster': + cluster.append(data[i]) + if self.how_to_preprocess[i] == 'interpolation': + interp.append(data[i]) + self.preprocessing_dic['no_interpolation'] = no_interp + self.preprocessing_dic['interpolation'] = interp + self.preprocessing_dic['cluster'] = cluster + self.logger.info('Datasets sorted into preprocessing groups') + return error + + def delete_feature(self): + + """ + Features in data_to_handle are deleted from the training dataset + """ + self.logger.info( + 'Feature(s) will be deleted') + to_drop = [] + for feat in self.data_to_handle: + for col in self.df_train.columns.tolist(): + if feat in col: + to_drop.append(col) + + self.df_train.drop(columns=to_drop, inplace=True) + + self.logger.info( + 'Feature has been deleted') + self.logger.info( + 'Training dataset contains the following features: ' + + str(self.df_train.columns)) + print('Training dataset contains following features: ' + + str(self.df_train.columns)) + + def import_cube(self): + + """ + Import data cube created in data_preprocessing.py + """ + self.logger.info( + 'Import the cube with interpolated geospatial data') + + if not os.path.exists(self.s.outfile): + self.logger.error( + 'Cube could not be found!') + + self.ds = nc.Dataset(self.s.outfile) + self.x = self.ds['Longitude'][:].data + self.y = self.ds['Latitude'][:].data + self.cube = self.ds['Result'][:, :, :].data + + features = self.ds['features'][:].data + + self.features = char_to_string(features) + self.logger.info( + 'Cube contains the following features: ' + + str(self.features)) + print('Cube contains following features: ' + str(self.features)) + + def import_landslide_database(self): + + """ + Import training dataset which needs to be provided as csv file + """ + + self.df_train = pd.read_csv(settings.path_landslide_database) + self.logger.info('Landslide inventory imported') + self.check_ls_inventory() + + for column in list(self.df_train.columns): + if column not in [settings.ID, + settings.landslide_database_x, + settings.landslide_database_y]: + self.df_train.drop(inplace=True, labels=column, axis=1) + self.add_nonls_locations() + self.length_before_cleaning = len(self.df_train) + self.df_train = pd.concat([self.df_train, self.df_absence], + axis=0, + ignore_index=True) + self.logger.info('Absence locations added') + self.df_train = self.df_train.rename( + columns={settings.landslide_database_x: 'xcoord', + settings.landslide_database_y: 'ycoord', + settings.ID: 'ID'}) + self.df_train['label'] = self.label_training_data() + self.logger.info('Label added') + + def add_nonls_locations(self): + + """ + Supplement presence data with absence data. It needs to be + pre-generated. 
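sort_into_preprocessing_groups() pairs each feature with its preprocessing method and refuses to continue when the two lists differ in length. A short sketch of that grouping with made-up feature names:

data_to_handle = ['slope', 'aspect', 'lithology']          # hypothetical features
how_to_preprocess = ['interpolation', 'no_interpolation', 'cluster']

if len(data_to_handle) != len(how_to_preprocess):
    raise ValueError('Not all datasets have been assigned a preprocessing method!')

preprocessing_dic = {'no_interpolation': [], 'interpolation': [], 'cluster': []}
for feat, method in zip(data_to_handle, how_to_preprocess):
    preprocessing_dic[method].append(feat)

print(preprocessing_dic)
# {'no_interpolation': ['aspect'], 'interpolation': ['slope'], 'cluster': ['lithology']}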
+ """ + + ds = nc.Dataset(settings.path_nonls_locations) + + x = ds[settings.nonls_database_x][:].data + y = ds[settings.nonls_database_y][:].data + + self.df_absence = pd.DataFrame(index=range(len(x)), + columns=list(self.df_train.columns)) + + self.df_absence[settings.ID] = ['nonls_event_' + str(i) + for i in range(len(x))] + self.df_absence[settings.landslide_database_x] = list(x) + self.df_absence[settings.landslide_database_y] = list(y) + + def label_training_data(self): + + """ + Provide labels to the training data + """ + + label = [1 for i in range(np.shape(self.df_train)[0])] + label[ + self.length_before_cleaning:len(label)] = \ + [0 for i in range(len(label)-self.length_before_cleaning)] + + return label + + def import_input_training_dataset(self): + + """ + Existing training dataset is imported. + """ + + self.logger.info('Existing training dataset will be imported') + # Check if training dataset exists and import it + if not os.path.exists(settings.path_train): + print('Training dataset does not exist yet.\ + Please generate from scratch.') + self.logger.error('Training dataset could not be found!') + else: + self.df_train = pd.read_csv(settings.path_train) + self.features = list(self.df_train.columns) + print('Training dataset consists the following features: ' + + str(self.features)) + self.logger.info('Training dataset contains the following features: ' + + str(self.features)) + self.length_before_cleaning = len(self.df_train[self.df_train['label'] == 1]) + + def add_feature(self): + + """ + Add feature to the training dataset + """ + + self.logger.info('Feature adding started') + + # Features contained in the training dataset + cols = list(self.df_train.columns) + # List of labels for basic information + basic = ['Ereignis-Nr', 'xcoord', 'ycoord', 'label'] + print(self.data_to_handle) + if self.interpolation: + # Call data_preprocessing.py + self.s = generate_data_matrix( + from_scratch=False, + delete=False, + dataset='training', + keys_already_included=[x for x in cols if x not in basic]) + + # Import generated cube of cut and interpolated datasets + self.import_cube() + + for count, key in enumerate(self.data_to_handle): + # Delete feature if it already exists in training dataset + if key in self.df_train.columns: + self.logger.warning('Feature already exists in Dataset.\ + Existing feature is deleted') + self.df_train = self.df_train.drop(key, axis=1) + + self.logger.info('Adding ' + key + '...') + # Create empty DataFrame + if count == 0: + self.df_features = pd.DataFrame( + index=range(len(self.df_train)), + columns=self.data_to_handle) + + for index, row in self.df_train.iterrows(): + x_ind = int( + (np.abs( + self.x + - row[settings.landslide_database_x])).argmin()) + y_ind = int( + (np.abs( + self.y + - row[settings.landslide_database_y])).argmin()) + + self.df_features.at[index, key] = self.cube[ + y_ind, + x_ind, + self.features.index(key)] + # Combine old training dataset with additional features + self.df_train = pd.concat([self.df_train, self.df_features], + axis=1) + # Adapt column order + self.logger.info('Feature successfully added') + + self.clean_df() + self.handle_categorical_features(var=self.data_to_handle) + self.logger.info('One-hot encoding completed') + + elif self.cluster: + for key in self.data_to_handle: + if key in self.df_train.columns: + self.logger.info( + 'Feature already exists in dataset.\ + Existing feature is deleted') + self.df_train = self.df_train.drop(key, axis=1) + + self.main_cluster() + self.logger.info('Feature successfully 
added') + elif not self.cluster and not self.interpolation: + self.main_no_interpolation() + self.logger.info('Feature successfully added') + + self.logger.info('Training dataset contains following features:') + self.logger.info(str(self.df_train.columns)) + + def extract_gridded_info(self): + + """ + Extraction of the information of the geospatial datasets at all + elements of the training dataset. + + If training data is located within prediction area and if area is + small enough no further interpolation is necessary. + """ + + self.df_features = pd.DataFrame(index=range(len(self.df_train)), + columns=self.features) + + # Iterate over all instances of the training dataset + # and extract geospatial information + for index, row in tqdm(self.df_train.iterrows()): + # Indices of training data elements are determined + x_ind = int((np.abs(self.x - row['xcoord'])).argmin()) + y_ind = int((np.abs(self.y - row['ycoord'])).argmin()) + + tmp = list(self.cube[y_ind, x_ind, :]) + self.df_features.loc[index] = tmp + + def check_ls_inventory(self): + + """ + Rows are removed with missing values or nan + """ + + self.logger.info('Landslide inventory is check for missing and nan values') + rows_with_missing_values = self.df_train[self.df_train.isnull().any(axis=1)].index.tolist() + rows_with_nan_values = self.df_train[self.df_train.isna().any(axis=1)].index.tolist() + + if len(set(rows_with_missing_values + rows_with_nan_values)) > 0: + self.df_train.drop(index=set(rows_with_missing_values + +rows_with_nan_values), inplace=True) + self.logger.info(str(len(set(rows_with_missing_values + rows_with_nan_values))) + + ' rows are removed due to missing or nan values') + + def clean_df(self): + + """ + Rows are removed from the dataset where no_value given in settings + occurs. + """ + + self.logger.info('Clean the dataframe from no data instances') + count = 0 + ind = [] + for col in self.df_train: + ind.append([i for i, x in enumerate(list(self.df_train[col])) + if x == settings.no_value]) + count = count + 1 + + ind = list(itertools.chain.from_iterable(ind)) + ind = list(set(ind)) + + self.logger.info(str(len(ind)) + ' rows will be removed') + print(str(len(ind)) + ' rows will be removed due to invalid data') + + self.df_train.drop(index=ind, inplace=True) + + if not self.from_scratch and len(ind) > 0: + print('Careful! Ratio might be obscured!') + self.logger.warning('Careful! 
Ratio might be obscured!') + + def ensure_same_ratio(self): + + """ + Ensure that the desired ratio of presence to absence data is + kept even after removing instances with no data values + """ + + self.logger.info('Ensure that the ratio of presence to absence is kept') + len_pres = np.shape(self.df_train[self.df_train['label'] == 1])[0] + len_abs = np.shape(self.df_train[self.df_train['label'] == 0])[0] + + if settings.num_nonls == self.length_before_cleaning: + if len_abs > len_pres: + print('Number of absence locations is reduced') + self.logger.info('Number of absence locations is reduced') + df_abs = self.df_train[self.df_train['label'] == 0] + + df_abs = df_abs.iloc[:len_abs-(len_abs-len_pres)] + + self.df_train = pd.concat( + [self.df_train[self.df_train['label'] == 1], df_abs], + axis=0) + elif len_abs < len_pres: + print( + 'Undefined error in the number\ + of absence and presence data') + self.logger.error('Undefined error in the number of absence and presence instances') + else: + if len_abs > settings.num_nonls: + df_abs = self.df_train[self.df_train['label'] == 0] + + df_abs = df_abs.iloc[:len_abs + - (len_abs + - settings.num_nonls)] + + self.df_train = pd.concat( + [self.df_train[self.df_train['label'] == 1], df_abs], + axis=0) + self.logger.info('Surplus absence locations have been removed') + + def cluster_landslide_locations(self, re=False, num=None): + + """ + Cluster the landslide locations. If clusters are too large this + functions reclusters these clusters into smaller ones. + + Input: + re: boolean, True if reclustering + num: list, numbers of clusters to be reclustered + + Output: + re_cluster_name: list, clusters for every entry + """ + + if re: + re_cluster_name = [] + count, len_org = 0, len(self.bb) + + # Number of clusters to split too large cluster into + num_cluster = 4 + for i in num: + + df_sub = self.df_train[self.df_train.cluster == i].copy() + # Clustering + kmeans = KMeans(init="random", + n_clusters=num_cluster, + n_init=10, + max_iter=300, + random_state=42) + kmeans.fit(np.column_stack((list(df_sub['xcoord']), + list(df_sub['ycoord'])))) + tmp = kmeans.labels_[:] + + # Rename clusters to fit into naming convention + for c, j in enumerate(tmp): + if j == 0: + tmp[c] = i + else: + tmp[c] = len_org + count*(num_cluster-1) + (j-1) + + df_sub['cluster'] = tmp + self.df_train = pd.concat( + [self.df_train[self.df_train.cluster != i], df_sub], + axis=0) + # Store cluster numbers to be returned + re_cluster_name.append(set(tmp)) + count = count + 1 + + return re_cluster_name + + else: + print('Start clustering...') + self.logger.info('Start clustering...') + # Clustering + kmeans = KMeans(init="random", + n_clusters=self.num_clusters, + n_init=10, + max_iter=300, + random_state=42) + kmeans.fit(np.column_stack((list(self.df_train['xcoord']), + list(self.df_train['ycoord'])))) + self.df_train['cluster'] = kmeans.labels_[:] + + def determine_bb_for_clustering(self, re=False, re_num=None): + + """ + Determine bounding box for the individual clusters. 
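cluster_landslide_locations() groups the training coordinates with k-means so that each cluster can later be cut and interpolated on its own, smaller bounding box. A toy version of the initial clustering step with synthetic coordinates (the class defaults to 15 clusters; 5 is used here only to keep the example small):

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
df_train = pd.DataFrame({'xcoord': rng.uniform(11.0, 12.0, 200),
                         'ycoord': rng.uniform(46.0, 47.0, 200)})

kmeans = KMeans(init='random', n_clusters=5, n_init=10,
                max_iter=300, random_state=42)
kmeans.fit(np.column_stack((df_train['xcoord'], df_train['ycoord'])))
df_train['cluster'] = kmeans.labels_

print(df_train['cluster'].value_counts())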
+ + Input: + re: boolean, True if reclustering + num: list, numbers of clusters to be reclustered + """ + + print('Determine bounding boxes...') + self.logger.info('Determine bounding box(es)...') + + if self.cluster: + # Initial clustering + if not re: + self.bb = [] + if self.from_scratch: + to_enter = range(self.num_clusters) + else: + # When adding a feature + if 'cluster' in self.df_train: + to_enter = list(set(self.df_train.cluster.to_list())) + else: + to_enter = range(self.num_clusters) + for num in to_enter: + df_tmp = self.df_train[ + self.df_train['cluster'] == num].copy() + + df_tmp = df_tmp.reset_index(drop=True) + max_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmax()] + min_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmin()] + max_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmax()] + min_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmin()] + + self.bb.append([max_y, min_y, min_x, max_x]) + # Reclustering + else: + bb_new = [0 for i in range( + len(set(self.df_train.cluster.to_list())))] + bb_new[:len(self.bb)] = self.bb + + for num in re_num: + + df_tmp = self.df_train[ + self.df_train['cluster'] == num].copy() + df_tmp = df_tmp.reset_index(drop=True) + max_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmax()] + min_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmin()] + max_y = df_tmp['ycoord'].loc[df_tmp['ycoord'].idxmax()] + min_x = df_tmp['xcoord'].loc[df_tmp['xcoord'].idxmin()] + + # Make sure that the order is preserved + # to match bounding boxes properly + if num >= self.num_clusters: + bb_new[num] = [max_y, min_y, min_x, max_x] + else: + bb_new[num] = [max_y, min_y, min_x, max_x] + + self.bb = bb_new.copy() + + else: + max_x = self.df_train['xcoord'].loc[ + self.df_train['xcoord'].idxmax()] + min_y = self.df_train['ycoord'].loc[ + self.df_train['ycoord'].idxmin()] + max_y = self.df_train['ycoord'].loc[ + self.df_train['ycoord'].idxmax()] + min_x = self.df_train['xcoord'].loc[ + self.df_train['xcoord'].idxmin()] + + self.bb = [max_y, min_y, min_x, max_x] + print(self.bb) + self.logger.info('Bounding box of training samples: ' + + str(max_y) + ', ' + str(min_y) + ', ' + + str(min_x) + ', '+ str(max_x)) + + def determine_if_reclustering(self): + + """ + Determine if the extent of one or several + clusters are too large for local interpolation + + Output: + num_bb: list, names of clusters that need reclustering + """ + + self.reclustering = False + num_bb = [] + + # Check extend of individual clusters + for count, bb in enumerate(self.bb): + point1_x = LatLon(Latitude(bb[0]), Longitude(bb[2])) + point2_x = LatLon(Latitude(bb[0]), Longitude(bb[3])) + distance_x = point1_x.distance(point2_x)*1000 + + point1_y = LatLon(Latitude(bb[0]), Longitude(bb[2])) + point2_y = LatLon(Latitude(bb[-1]), Longitude(bb[2])) + distance_y = point1_y.distance(point2_y)*1000 + + num_px_x = int(np.round((distance_x/settings.resolution))) + num_px_y = int(np.round((distance_y/settings.resolution))) + + if num_px_x or num_px_y > 10000: + num_bb.append(count) + self.reclustering = True + + return num_bb + + def main_cluster(self): + + """ + Main function to generate training dataset if training + locations shall be clustered. + """ + + def extract_data_from_dataset_subsets(num): + + """ + Extract information from the interpolated geospatial dataset. 
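determine_if_reclustering() converts each cluster's bounding box into an approximate pixel count at the target resolution and flags clusters whose extent would make local interpolation too expensive. A sketch of that check with a made-up bounding box and resolution; the threshold comparison is written out explicitly for both axes:

import numpy as np
from LatLon23 import LatLon, Latitude, Longitude

resolution = 25                       # metres per pixel (placeholder value)
bb = [47.2, 46.8, 10.9, 11.6]         # [y_max, y_min, x_min, x_max]

p_top_left = LatLon(Latitude(bb[0]), Longitude(bb[2]))
p_top_right = LatLon(Latitude(bb[0]), Longitude(bb[3]))
p_bottom_left = LatLon(Latitude(bb[1]), Longitude(bb[2]))

distance_x = p_top_left.distance(p_top_right) * 1000    # km -> m
distance_y = p_top_left.distance(p_bottom_left) * 1000

num_px_x = int(np.round(distance_x / resolution))
num_px_y = int(np.round(distance_y / resolution))

needs_reclustering = num_px_x > 10000 or num_px_y > 10000
print(num_px_x, num_px_y, needs_reclustering)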
+ + Input: + num: int, number of cluster + + Output: + df_clust: dataframe, subset of the training dataset + supplemented with the information from the training dataset + """ + + df_clust = self.df_train[self.df_train.cluster == num].copy() + #print(len(df_clust[df_clust['label'] == 0]), len(df_clust[df_clust['label'] == 1])) + + # Access interpolated subset of the dataset + arr = self.ds['Result' + str(num)][:, :].data + + # Access interpolation x and y vector + yvector = self.ds['Latitude' + str(num)][:].data + xvector = self.ds['Longitude' + str(num)][:].data + + # Extract information at training location + feature = [] + for index, row in df_clust.iterrows(): + x_indx = int((np.abs(np.array(xvector) + - row['xcoord'])).argmin()) + y_indx = int((np.abs(np.array(yvector) + - row['ycoord'])).argmin()) + + feature.append(arr[y_indx, x_indx]) + + df_clust[dataset] = feature + + return df_clust + + self.logger.info('Approach: clustering') + # If previous run failed dataset might already exist causing + # current run to crash again therefore it is deleted if it exists + if os.path.isfile('tmp.nc'): + os.remove('tmp.nc') + + if self.from_scratch: + self.logger.info('Training dataset generated from scratch') + if not isinstance(self.df_train, pd.DataFrame): + self.import_landslide_database() # Import landslide database + self.cluster_landslide_locations() + + if not self.from_scratch and 'cluster' not in self.df_train: + # Cluster the landslide locations + self.logger.info('Existing training dataset has not been generated using clustering') + self.cluster_landslide_locations() + # Determine the bounding boxes for the individual clusters + self.determine_bb_for_clustering() + + if self.from_scratch: + # Determine if bounding boxes are too large for local interpolation + num_bb = self.determine_if_reclustering() + + if self.reclustering: + print('Reclustering necessary...') + self.logger.info('Reclustering necessary...') + re_cluster_name = self.cluster_landslide_locations(True, + num_bb) + re_cluster_name = [item for sublist in re_cluster_name + for item in sublist] + # Determine bounding boxes for new clusters + self.determine_bb_for_clustering(True, re_cluster_name) + + dataset_counter = 0 + # Iterate over the dataset to inclue in the training dataset + for dataset in tqdm(self.data_to_handle): + self.logger.info('Currently handling: ' + dataset) + index = list(self.data_properties['keys']).index(dataset) + # Call cut_and_interpolate class to cut + # and interpolate the current dataset + + s = cut_and_interpolate( + key=dataset, + path=list(self.data_properties['path'])[index], + no_data_value=list(self.data_properties['no_value'])[index], + categorical=bool(list(self.data_properties['categorical'])[index]), + several=False, + several_same=False, + first=False, + bb=self.bb, + cluster=self.cluster, + path_properties=settings.path_ml + 'properties.pkl') + # Open the netcdf file which contains the interpolated subsets of + # the dataset with the extent of the bounding boxes of the clusters + self.logger.info('Interpolation completed') + self.ds = nc.Dataset('tmp.nc') + + # Determine if one or more bounding boxes is + # outside of the extend of the dataset + if not s.cuttable: + print('Error! Bounding box larger than dataset!\ + Please adapt bounding_box!') + self.logger.error('Error! 
Bounding box larger than dataset!') + break + + df = [] + # Iterate over the clusters extract information from the dataset + for num in range(len(self.bb)): + df.append(extract_data_from_dataset_subsets(num)) + + # Concatenate all subsets of the training dataset + df = np.concatenate(df, axis=0) + self.logger.info('Extraction of values from dataset completed') + # For first dataset set up final training dataset, + # for later datasets append information + if dataset_counter == 0: + df_ges = pd.DataFrame(df, columns=list(self.df_train.columns) + + [dataset]) + else: + df_ges[dataset] = df[:, -1] + + + dataset_counter = dataset_counter + 1 + self.ds.close() + + # Remove temporary netcdf file that contains + # interpolated subsets of the dataset + os.remove('tmp.nc') + + self.df_train = df_ges.copy() + self.df_train.reset_index(inplace=True, drop=True) + + self.clean_df() # Clean the DataFrame from no value rows + if not self.from_scratch and not self.delete: + self.handle_categorical_features(var=self.data_to_handle) + else: + self.handle_categorical_features() + if self.from_scratch: + # Ensure that the 1:1 ratio of the training dataset is perserved + self.ensure_same_ratio() + + self.save_training_data() # Save the training dataset as csv file + + def main(self): + + self.logger.info('Approach: interpolation') + if not isinstance(self.df_train, pd.DataFrame): + self.import_landslide_database() # Import landslide database + + print('Warning! Depending on the size of the area of interest and\ + the set resolution this might be computationally expensive.\ + Consider clustering.') + self.logger.warning('Warning! Depending on the size of the AOI and\ + the set resolution this might be computationally\ + expensive. Consider clustering') + self.logger.info('Cube of interpolated datasets will be generated') + self.determine_bb_for_clustering(re=False, re_num=None) + self.s = generate_data_matrix(from_scratch=True, + delete=False, + dataset='training', + bb=self.bb) + self.logger.info('Cube of interpolated datasets successfully generated') + self.import_cube() + # Extract geospatial information at ls and non-ls locations + self.extract_gridded_info() + self.logger.info('Gridded information has been extraced') + + # Concat final training dataset + self.df_train = pd.concat([self.df_train, self.df_features], axis=1) + + self.clean_df() # Clean the DataFrame from no value rows + if not self.from_scratch and not self.delete: + self.handle_categorical_features(var=self.data_to_handle) + elif self.from_scratch: + print('ohe') + self.handle_categorical_features() + self.ensure_same_ratio() + self.save_training_data() # Save the training dataset as csv file + + def raw_dataset(self, path, no_data): + + if no_data != 'None': + no_data = no_data.split(',') + no_data = [float(val) for val in no_data] + + data, x, y = import_raw_dataset(path, no_data, settings.no_value) + if isinstance(data, np.ndarray): + self.logger.info('Raw dataset imported') + else: + print( + 'Not the right data format! 
Please provide tif or nc file') + self.logger.info('Wrong file format!') + + return data, x, y + + def main_no_interpolation(self): + + print('no interpolation...') + self.logger.info('Approach: no interpolation') + + def extract_gridded_info(row): + + x_indx = int((np.abs(np.array(x) - row['xcoord'])).argmin()) + y_indx = int((np.abs(np.array(y) - row['ycoord'])).argmin()) + + return data[y_indx, x_indx] + + if self.from_scratch: + self.import_landslide_database() + else: + if not isinstance(self.df_train, pd.DataFrame): + self.import_landslide_database() # Import landslide database + + for dataset in tqdm(self.data_to_handle): + self.logger.info('Currently handling: ' + dataset) + index = list(self.data_properties['keys']).index(dataset) + data, x, y = self.raw_dataset(self.data_properties['path'][index], + self.data_properties['no_value'][index]) + feat = [] + for index, row in self.df_train.iterrows(): + feat.append(extract_gridded_info(row)) + + self.df_train[dataset] = feat + self.logger.info('Gridded information has been extracted') + + self.clean_df() # Clean the DataFrame from no value rows + if not self.from_scratch and not self.delete: + self.handle_categorical_features(var=self.data_to_handle) + else: + self.handle_categorical_features() + self.ensure_same_ratio() + self.save_training_data() + + def save_training_data(self): + + """ + Save dataframe as csv. If necessary folder is created. + """ + + self.df_train = self.df_train.rename(columns={settings.ID: 'ID', settings.landslide_database_x: 'xcoord', settings.landslide_database_y: 'ycoord'}) + + if 'training.csv' not in settings.path_train: + outfile = settings.path_train + 'training.csv' + else: + outfile = settings.path_train + + isExist = os.path.exists(outfile.split('/')[-1]) + if not isExist: + os.makedirs(outfile.split('/')[-1]) + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + # Save dataframe as csv + self.df_train.to_csv(outfile, sep=',', index=False) + self.logger.info('Training dataset has been saved to ' + + outfile) diff --git a/src/plain_scripts/data_preprocessing.py b/src/plain_scripts/data_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..5435a98e3897560a5bdd69f5fcb6e70f942ed01d --- /dev/null +++ b/src/plain_scripts/data_preprocessing.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +from tqdm import tqdm +import os +import netCDF4 as nc +import settings +import pandas as pd + +import settings +from utilities.ncfile_generation import generate_3dncfile +from utilities.cut_and_interpolate import cut_and_interpolate +from utilities.strings_for_ncfile import features_to_char, char_to_string + +class generate_data_matrix: + + """ + This class generates a nc-file containing all datasets, + a list of all contained features and their respective longitude and + latitude vectors. Provided are interpolated excerpts of the datasets + determined by the provided bounding box. + + Input: + from_scratch: boolean, True if nc-file should be generated from + scratch, otherwise false + delete: True if dataset/feature should be deleted from nc-file + False if dataset should be added to existing nc-file + (careful: from_scratch needs to be False!) 
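main_no_interpolation() skips regridding entirely and simply reads, for every training location, the value of the closest raster cell. A self-contained sketch of that nearest-pixel lookup; the raster, coordinate vectors and the 'slope' column name are stand-ins for what import_raw_dataset() would return:

import numpy as np
import pandas as pd

x = np.linspace(11.0, 12.0, 500)      # raster longitude vector
y = np.linspace(47.0, 46.0, 400)      # raster latitude vector (descending)
data = np.random.rand(len(y), len(x))

df_train = pd.DataFrame({'xcoord': [11.31, 11.77], 'ycoord': [46.42, 46.90]})

def extract_gridded_info(row):
    x_indx = int(np.abs(x - row['xcoord']).argmin())
    y_indx = int(np.abs(y - row['ycoord']).argmin())
    return data[y_indx, x_indx]

df_train['slope'] = [extract_gridded_info(row) for _, row in df_train.iterrows()]
print(df_train)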
+ bb: list, bounding box in the format + [<y_max>, <y_min>, <x_min>, <x_max>] + data_to_handle: list of features that should be added/deleted + datasets need to be listed in list_of_raw_datasets + keys_already_included: list of already included features in + the training/prediction dataset + (from_scratch=False, delete=False) + + Output: + netCDF4 file + """ + + def __init__(self, + from_scratch=True, + delete=False, + dataset='undefined', + bb=None, + data_to_handle=None, + keys_already_included=None): + + self.from_scratch = from_scratch + self.dataset = dataset + self.bb = bb + self.keys_already_included = keys_already_included + + self.import_data_summary() + + if not from_scratch: + self.data_to_handle = self.keys_to_include + self.delete = delete + self.import_cube() + + if delete: + # Delete dataset from cube + self.delete_dataset() + else: + # Add dataset to cube + self.add_dataset() + else: + # Generate cube from scratch + # Parameter stored in list_of_raw_datasets + self.data_to_handle = self.keys_to_include + self.main() + + def import_data_summary(self): + + """ + Import the csv files containing information on the geospatial datasets + and the keys to include + """ + + self.data_properties = pd.read_csv(settings.data_summary_path) + self.keys_to_include = pd.read_csv(settings.key_to_include_path)['keys_to_include'].tolist() + + def find_dataset(self): + + """ + Find the index of the features to handle in the list of features + contained in the nc-file. + Return: + idx: list of indices + """ + + return self.features.index(self.data_to_handle) + + def add_dataset(self): + + # Number of overlapping features between datasets in the cube + # and the datasets to add/delete + print('features') + print(self.features) + + if self.dataset == 'prediction': + for_prediction = True + else: + for_prediction = False + + # Define new cube in the size of existing cube of cut and interpolated + # datasets with the depth equaling the number of existing datasets plus + # the ones to add + ges = list(self.features) + [x for x in self.data_to_handle if x not in self.features] + cube = np.zeros((np.shape(self.cube)[0], + np.shape(self.cube)[1], + len(ges))) + + for feat in self.features: + cube[:, :, ges.index(feat)] = self.cube[:, :, self.features.index(feat)] + + for key in self.data_to_handle: + s = cut_and_interpolate( + key=key, + path=list(self.data_properties['path'])[list(self.data_properties['keys']).index(key)], + no_data_value=list(self.data_properties['no_value'])[list(self.data_properties['keys']).index(key)], + categorical=list(self.data_properties['categorical'])[list(self.data_properties['keys']).index(key)], + several=True, + several_same=False, + first=False, + #bb=self.bb, + for_prediction=for_prediction, + path_properties=self.folder + + '/data_combined_' + + self.dataset + + '_' + + str(settings.resolution) + '.pkl') + array, _, _, cuttable = s.array, s.x, s.y, s.cuttable + + if not cuttable: + print('Error! 
Bounding box larger than dataset!\ + Please adapt bounding_box!') + break + else: + # Store it at respective position in cube + # Add cut and interpolated dataset to cube + cube[:, :, ges.index(key)] = array + + # Save the updated cube to nc file + self.determine_outfile() + self.char_features = features_to_char(ges) + generate_3dncfile(self.outfile, + self.x, + self.y, + cube, + len(ges), + self.char_features, + crs='wgs84', + data_unit=None, + missing_value=settings.no_value) + + def delete_dataset(self): + + """ + Delte datasets from data_to_handle + from nc-file and save new nc-file + """ + + # Determine indices of the datasets that shall be removed + idx = [] + for data in self.data_to_handle: + idx.append(self.find_dataset) + + # Define new cube in the size of existing + # cube of cut and interpolated datasets + cube = np.zeros((np.shape(self.cube)[0], + np.shape(self.cube)[1], + np.shape(self.cube)[2]-len(self.data_to_handle))) + count = 0 + + # Go through the datasets and transfer all + # datasets except for them to be removed + for i in range(np.shape(self.cube)[2]): + if self.features[i] not in self.data_to_handle: + cube[:, :, count] = self.cube[:, :, i] + count = count + 1 + + # Update the feature list + for data in self.data_to_handle: + self.features.remove(data) + print('Dataset now contains the following features: ' + + str(self.features)) + + # Save new data cube + self.determine_outfile() + self.char_features = features_to_char(self.from_scratch, self.features) + generate_3dncfile(self.outfile, + self.x, + self.y, + cube, + len(self.features), + self.char_features, + crs='wgs84', + data_unit=None, + missing_value=settings.no_value) + + def import_cube(self): + + """ + Existing nc-file is imported for adding/deleting another feature. + """ + + self.determine_outfile() # Determine where cube is stored + + # Import cube + self.ds = nc.Dataset(self.outfile) + self.cube = self.ds['Result'][:, :, :].data + self.x = self.ds['Longitude'][:].data + self.y = self.ds['Latitude'][:].data + self.features = self.ds['features'][:].data + + self.features = char_to_string(self.features) + print('Dataset contains the following features: ' + str(self.features)) + + def determine_outfile(self): + + """ + Determine whether folder to store the nc-file already exists. + If not, it is created. Outfile path is determined. + """ + + # Cube is stored in the same folder + # as the final training/prediction dataset + if self.dataset == 'training': + self.folder = settings.path_train.rsplit('/', 1)[0] + self.outfile = self.folder + '/data_combined_training_' + str(settings.resolution) + '.nc' + elif self.dataset == 'prediction': + self.folder = settings.path_pred.rsplit('/', 1)[0] + self.outfile = self.folder + '/data_combined_prediction_' + str(settings.resolution) + '.nc' + + # Create folder if it doesn't yet exist + isExist = os.path.exists(self.folder) + if not isExist: + os.makedirs(self.folder) + + def main(self): + + """ + Routine to pre-process the datasets from scratch + """ + + # Go through all datasets that shall be pre-processed + for i in tqdm(range(len(self.data_to_handle))): + j = list(self.data_properties['keys']).index(self.data_to_handle[i]) + + if i == 0: + if self.dataset == 'prediction': + # Cut and interpolate dataset to desired resolution. + # Check script for information on input parameters. 
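delete_dataset() shrinks the cube by copying every layer except the ones to be removed and trimming the feature list to match. A compact sketch of that operation on a synthetic cube with hypothetical layer names:

import numpy as np

features = ['slope', 'aspect', 'lithology']
data_to_handle = ['aspect']                       # layer(s) to delete
cube = np.random.rand(100, 120, len(features))

keep = [f for f in features if f not in data_to_handle]
new_cube = np.zeros((cube.shape[0], cube.shape[1], len(keep)))
for new_i, feat in enumerate(keep):
    new_cube[:, :, new_i] = cube[:, :, features.index(feat)]

features = keep
print(features, new_cube.shape)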
+ s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.data_properties['path'].tolist()[j], + no_data_value=self.data_properties['no_value'].tolist()[j], + categorical=bool(self.data_properties['categorical'].tolist()[j]), + several=True, + several_same=False, + first=True, + bb=self.bb, + for_prediction=True, + path_properties=settings.path_pred.rsplit('/', 1)[0] + '/data_combined_prediction_' + str(settings.resolution) + '.pkl') + else: + # Cut and interpolate dataset to desired resolution. + # Check script for information on input parameters. + s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.data_properties['path'].tolist()[j], + no_data_value=self.data_properties['no_value'].tolist()[j], + categorical=bool(self.data_properties['categorical'].tolist()[j]), + several=True, + several_same=False, + first=True, + bb=self.bb, + path_properties=settings.path_train.rsplit('/', 1)[0] + '/data_combined_training_' + str(settings.resolution) + '.pkl') + array = s.array + self.x = s.x + self.y = s.y + cuttable = s.cuttable + + if not cuttable: + print('Error! Bounding box larger than dataset!\ + Please adapt bounding_box!') + break + + # Store cut and interpolated dataset in array + cube = np.zeros((np.shape(array)[0], + np.shape(array)[1], + len(self.data_to_handle))) + cube[:, :, 0] = array + else: + if self.dataset == 'prediction': + s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.data_properties['path'].tolist()[j], + no_data_value=self.data_properties['no_value'].tolist()[j], + categorical=bool(self.data_properties['categorical'].tolist()[j]), + several=True, + several_same=False, + first=False, + bb=self.bb, + for_prediction=True, + path_properties=settings.path_pred.rsplit('/', 1)[0] + + '/data_combined_prediction_' + + str(settings.resolution) + + '.pkl') + else: + # Cut and interpolate dataset to desired resolution. + # Check script for information on input parameters. + s = cut_and_interpolate( + key=self.data_to_handle[i], + path=self.data_properties['path'].tolist()[j], + no_data_value=self.data_properties['no_value'].tolist()[j], + categorical=bool(self.data_properties['categorical'].tolist()[j]), + several=True, + several_same=False, + first=False, + bb=self.bb, + path_properties=settings.path_train.rsplit('/', 1)[0] + + '/data_combined_training_' + + str(settings.resolution) + + '.pkl') + array, cuttable = s.array, s.cuttable + + if not cuttable: + print('Error! Bounding box larger than dataset!\ + Please adapt bounding_box!') + break + # Store cut and interpolated dataset in array + cube[:, :, i] = array + + # Store the array in a nc-file and meta data in pickle file + if cuttable: + self.determine_outfile() + self.char_features = features_to_char(self.data_to_handle) + generate_3dncfile(self.outfile, + self.x, + self.y, + cube, + len(self.data_to_handle), + self.char_features, + crs='wgs84', + data_unit=None, + missing_value=settings.no_value) diff --git a/src/plain_scripts/settings_template.py b/src/plain_scripts/settings_template.py new file mode 100644 index 0000000000000000000000000000000000000000..ee0f299ace89d8dd89dd00117d33e8ca4ea10889 --- /dev/null +++ b/src/plain_scripts/settings_template.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" + This is a template file for settings.py + Either duplicate and rename or fill out and rename. 
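main() hands the stacked cube to the in-house generate_3dncfile() utility. That utility is not part of this excerpt; the stand-in below only illustrates the file layout the rest of the code relies on (variables 'Result', 'Longitude', 'Latitude' and a character-encoded 'features' list), written with plain netCDF4, synthetic data and a placeholder file name:

import numpy as np
import netCDF4 as nc

x = np.linspace(11.0, 12.0, 120)
y = np.linspace(47.0, 46.0, 100)
cube = np.random.rand(len(y), len(x), 2)
char_features = np.array(list('slope,aspect'), dtype='S1')   # simplistic encoding

ds = nc.Dataset('data_combined_training_25.nc', 'w', format='NETCDF4')
ds.createDimension('lat', len(y))
ds.createDimension('lon', len(x))
ds.createDimension('depth', cube.shape[2])
ds.createDimension('feat', len(char_features))

ds.createVariable('Longitude', 'f4', 'lon')[:] = x
ds.createVariable('Latitude', 'f4', 'lat')[:] = y
ds.createVariable('Result', 'f4', ('lat', 'lon', 'depth'))[:, :, :] = cube
ds.createVariable('features', 'S1', 'feat')[:] = char_features
ds.close()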
+ More information on the individual meaning and what to consider can be + found in the user manual +""" + +import logging +import json +import types + +def export_variables(logger): + + variables = globals() + # Filter out non-serializable objects + defined_vars = {} + for k, v in variables.items(): + if not k.startswith('__') and not callable(v) and not isinstance(v, types.ModuleType): + try: + # Test if the value is JSON serializable + json.dumps(v) + defined_vars[k] = v + except (TypeError, OverflowError): + # Skip non-serializable values + pass + # Convert the dictionary to a JSON string + vars_json = json.dumps(defined_vars, indent=4) + logger.info("Exported variables: %s", vars_json) + +# Mandatory parameters + +# Steps +training_dataset = # Boolean, if training dataset shall be created +preprocessing = # Defines preprocessing approach: 'cluster', 'interpolation', 'no_interpolation' +train_from_scratch = +train_delete = None + +prediction_dataset = # Boolean, if prediction dataset shall be created +pred_from_scratch = +pred_delete = None + +map_generation = # Boolean, if mapping shall be performed + +# General + +crs = # Coordinate reference system, string +no_value = # No data value, integer, suggestion -999 +random_seed = # Random seed, integer +resolution = # Resolution in m of the final map, integer, all datasets will be interpolated to this resolution +path_ml = # Path to where shire framework related parameters/files will be stored +data_summary_path = # Path to the data summary file, string, relevant only for training/prediction dataset generation +key_to_include_path = # Path to kets_to_include file, string, relevant only for training/prediction dataset generation + +# Training dataset generation + +size = # Size of the validation dataset, float number between 0 and 1 +path_train = # Path to directory where the training dataset is/shall be stored +ohe = # One-hot encoding, bool + +path_landslide_database = # Path to where the landslide database is stored, string +ID = # Name of the column containing landslide ID, string +landslide_database_x = # Name of the column containing longitude values, string +landslide_database_y = # Name of the column containing latitude values, string + +path_nonls_locations = # Path to where the non-landslide database is stored, string +num_nonls = # Number of non-landslide locations to include in the training dataset, integer +nonls_database_x = # Name of the column containing longitude values, string +nonls_database_y = # Name of the column containing longitude values, string + +cluster = # Use clustering for training dataset generation, bool +interpolation = # Use interpolation for training dataset generation, bool + +# Prediction dataset generation + +bounding_box = # Coordinates of the edges of the bounding box of the area of interest, list, [<ymax>, <ymin>, <xmin>, <xmax>] +path_pred = # Path to directory where the prediction dataset is/shall be stored + +# Map generation + +RF_training = # Train the RF, bool +RF_prediction = # Make a prediction using the RF, bool + +not_included_pred_data = ['xcoord', 'ycoord']# List of features in the training dataset not to be considered in prediction +not_included_train_data = [] # List of features in the training dataset not to be considered in model training + +num_trees = # Number of trees in the Random Forest, integer +criterion = # Criterion for the Random Forest, string +depth = # Number of nodes of the RF, integer + +model_to_save = # Folder name for storage of the RF results, string 
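The template above only documents the expected types; a run needs concrete values. A hypothetical example of how the parameters listed so far might be filled in (every path and number here is a placeholder, not a recommendation):

training_dataset = True
preprocessing = 'cluster'        # 'cluster', 'interpolation' or 'no_interpolation'
train_from_scratch = True
train_delete = None

prediction_dataset = True
pred_from_scratch = True
pred_delete = None

map_generation = True

crs = 'wgs84'
no_value = -999
random_seed = 42
resolution = 25
path_ml = '/data/shire/ml/'
data_summary_path = '/data/shire/data_summary.csv'
key_to_include_path = '/data/shire/keys_to_include.csv'

size = 0.2
path_train = '/data/shire/training/'
ohe = True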
+model_to_load = # Folder where RF model is stored, string, identical to model_to_save if training and prediction is done at the same time +model_database_dir = # Directory where models should be stored +parallel = # Boolean, true if prediction data shall be split to predict in parallel + +keep_cat_features = #bool, true if categorical features shall be kept even if some instances in prediction dataset have classes not covered by the prediction dataset +remove_instances = # bool, true of instances in prediction dataset shall be removed if they have different classes than the instances in the training dataset \ No newline at end of file diff --git a/src/plain_scripts/shire.py b/src/plain_scripts/shire.py new file mode 100644 index 0000000000000000000000000000000000000000..d93025d5d0a66a334ef39dbb2b857568ec5075b5 --- /dev/null +++ b/src/plain_scripts/shire.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import settings +import pandas as pd +import os + +from create_training_data import create_training_data +from create_prediction_data import create_prediction_data +from RandomForest import RandomForest +from check_user_input import check_general_settings +from utilities.initialise_log import save_log + +""" + This script controls the hazard mapping framework SHIRE. + Adapt settings.py before running this script. Ensure a data summary csv file + and a csv file containing the keys to include. + For more information please refer to the user manual. +""" + +print('SHIRE - Landslide hazard mapping framework') +print('If you have not prepared settings.py and \ + the necessary csv files, stop the script.') + +# Check user input +s = check_general_settings() +if s.error: + print('Please check settings.py again, there are errors listed in the log.') +else: + if settings.training_dataset or settings.map_generation: + save_path = os.path.dirname(settings.path_train) + '/shire_run.log' + elif settings.prediction_dataset: + save_path = os.path.dirname(settings.path_pred) + '/shire_run.log' + + if os.path.exists(save_path): + os.remove(save_path) + logger = save_log(save_path) + + settings.export_variables(logger) + + if settings.training_dataset: + print('Training dataset will be generated') + logger.info('Training dataset generation started') + if settings.preprocessing is None: + if settings.preprocessing == 'cluster': + cluster = True + interpolation = True + elif settings.preprocessing == 'interpolation': + cluster = False + interpolation = True + elif settings.preprocessing == 'no_interpolation': + cluster = False + interpolation = False + + s = create_training_data( + from_scratch=settings.train_from_scratch, + delete=settings.train_delete, + data_to_handle=list(pd.read_csv(settings.key_to_include_path)['keys_to_include']), + cluster=settings.cluster, + interpolation=settings.interpolation, + preprocessing=settings.preprocessing, + log=logger) + + print('Training dataset successfully created') + logger = s.logger + logger.info('Training dataset successfully created') + + if settings.prediction_dataset: + print('Prediction dataset will be generated') + logger.info('Prediction dataset generation started') + + s = create_prediction_data( + from_scratch=settings.pred_from_scratch, + delete=settings.pred_delete, + log=logger) + + print('Prediction dataset successfully created') + logger = s.logger + logger.info('Prediction dataset successfully created') + if settings.map_generation: + print('Map will be generated') + logger.info('Map generation started') + + if 
settings.parallel: + print('Prediction will run in parallel') + logger.info('Prediction will run in parallel') + if settings.RF_training: + logger.info('Random Forest training is launched') + s = RandomForest('train_test', parallel=settings.parallel, log=logger) + logger = s.logger + if settings.RF_prediction: + logger.info('Random Forest prediction in launched') + s = RandomForest('prediction', parallel=settings.parallel, log=logger) + logger = s.logger + + print('Map successfully created') + logger.info('Map successfully created') + + for handler in logger.handlers: + handler.close() + logger.removeHandler(handler) diff --git a/src/plain_scripts/utilities/cut_and_interpolate.py b/src/plain_scripts/utilities/cut_and_interpolate.py new file mode 100644 index 0000000000000000000000000000000000000000..77549817a676ee3287f0ea0878a51cd22727b893 --- /dev/null +++ b/src/plain_scripts/utilities/cut_and_interpolate.py @@ -0,0 +1,1002 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import numpy as np +import pickle +import os +import netCDF4 as nc +import matplotlib.pyplot as plt + +from LatLon23 import LatLon, Latitude, Longitude +from scipy.interpolate import interp2d, interp1d + +import settings +from utilities.import_raw_dataset import import_raw_dataset +from utilities.ncfile_generation import generate_basic_ncfile +from utilities.import_format import import_tif, import_nc + +class cut_and_interpolate: + + """ + This class imports a dataset, cuts it to the desired extent and + interpolates it to the desired resolution. + The settings for this is stored in settings.py + + """ + + def __init__(self, key=None, path=None, no_data_value=None, + categorical=None, several=None, several_same=None, + first=None, bb=None, cluster=False, for_prediction=False, + path_properties=None): + + """ + Input: + key: key belonging to the dataset in list_of_raw_datasets.py + path: path where the dataset in list_of_raw_datasets.py + no_data_value: value representing no data + categorical: boolean if dataset contains + categorical information + several: boolean if class is called several times + e.g. 
in a for loop + several_same: boolean if several datasets have the same + spatial extent and resolution + first: boolean if several, is this the first dataset + bb: bounding box, format list ymax, ymin, xmin, xmax + information in settings.py + cluster: boolean if training locations are clustered + path_properties: path to seperate file storing information + on applied interpolation and extent + """ + + # Import cutting and interpolation information if this is not the first + # dataset of several to be cut and interpolated + if several and not first: + with open(path_properties, 'rb') as handle: + self.properties = pickle.load(handle) + + self.path = path + self.key = key + self.cluster = cluster + self.for_prediction = for_prediction + + # Define bounding box + if cluster: + self.bb_ges = bb + self.to_cluster = True + elif self.for_prediction: + self.to_cluster = False + + if several and not first: + self.bb = [self.properties['interp_vectors']['y'][0], + self.properties['interp_vectors']['y'][-1], + self.properties['interp_vectors']['x'][0], + self.properties['interp_vectors']['x'][-1]] + else: + self.bb = settings.bounding_box + + else: + self.to_cluster = False + + if several and not first: + self.bb = [self.properties['interp_vectors']['y'][0], + self.properties['interp_vectors']['y'][-1], + self.properties['interp_vectors']['x'][0], + self.properties['interp_vectors']['x'][-1]] + else: + self.bb = bb + + self.path_properties = path_properties + if no_data_value != 'None': + self.no_data = no_data_value.split(',') + self.no_data = [float(val) for val in self.no_data] + else: + self.no_data = no_data_value + + self.categorical = categorical + self.several = several + self.several_same = several_same + self.first = first + + # Define limits to determine interpolation approach for dataset + self.limit_org = 500000 + self.limit_interp = 500000000 + self.size = 200 + self.overlap = 100 + + self.data, self.x_org, self.y_org = import_raw_dataset(self.path, self.no_data, settings.no_value) # Import raw datasets + + # If training locations are clustered + if self.to_cluster: + + self.x_raw = self.x_org + self.y_raw = self.y_org + self.data_raw = self.data + + def parallized_interpolation(num): + + # Interpolate the cut dataset + a = self.interpolate_dataset( + self.subsets[num], + self.y_orgs[num], + self.x_orgs[num], + self.ds['Longitude' + str(num)][:].data, + self.ds['Latitude' + str(num)][:].data) + + # Save the interpolated dataset in the nc file/Update + # the cut and interpolated dataset for the 2nd + # and following datasets + if self.first_dataset: + result = self.ds.createVariable( + 'Result' + str(num), + 'f4', + ('lat' + str(num), 'lon' + str(num))) + result[:, :] = a + else: + self.ds['Result' + str(num)][:, :] = a + + self.subsets = [] + self.x_orgs = [] + self.y_orgs = [] + self.cuttables = [] + + self.first_dataset = False + + # Iterate over all bounding boxes of the + # clustered training locations + for count, self.bb in enumerate(self.bb_ges): + + self.x_org = self.x_raw + self.y_org = self.y_raw + self.data = self.data_raw + + # Check that all bounding boxes are covered + # by the extent of the dataset + self.compare_extends() + self.cuttables.append(self.cuttable) + + # Cut the original dataset to the currently + # considered bounding box + self.cut_to_boundingbox() + # Store cut properties to be used in the interpolation + self.subsets.append(self.data) + self.x_orgs.append(self.x_org) + self.y_orgs.append(self.y_org) + + if not os.path.isfile('tmp.nc') or 
self.first_dataset: + + if count == 0: + # Open temporarty file to store the + # interpolated subsets of the dataset + self.ds = generate_basic_ncfile('tmp.nc') + self.first_dataset = True + + # Determine the x and y vectors for interpolation + self.determine_reference_vectors() + # Saving the interpolation vectors + # to the temporary file + self.ds.createDimension('lat' + str(count), + len(self.y)) + self.ds.createDimension('lon' + str(count), + len(self.x)) + + longitude = self.ds.createVariable( + 'Longitude' + str(count), + 'f4', + 'lon' + str(count)) + latitude = self.ds.createVariable( + 'Latitude' + str(count), + 'f4', + 'lat' + str(count)) + + longitude[:] = self.x + latitude[:] = self.y + + elif (os.path.isfile('tmp.nc') + and not self.first_dataset and count == 0): + # If it's not the first dataset to be cut, + # open the nc file + self.ds = nc.Dataset('tmp.nc', mode='a') + + self.one_go, self.as_chunks, self.as_cols = True, False, False + + # Final decision whether cutting and interpolation is possible + if False in self.cuttables: + self.cuttable = False + else: + self.cuttable = True + + # Interpolate all subsets in parallel + #Parallel(n_jobs=5, backend='threading', timeout=999999) + #(delayed(parallized_interpolation)(num) + # for num in range(len(self.bb_ges))) + + for num in range(len(self.bb_ges)): + parallized_interpolation(num) + + self.ds.close() + + elif self.for_prediction: + def test_parallel_interpolation(i): + + ref = self.interpolate_dataset( + np.array(chunks_old[i]), + np.array(np.linspace( + self.y_org[pixels_old[i][0]], + self.y_org[pixels_old[i][1]], + abs(pixels_old[i][1]-pixels_old[i][0]))), + np.array(np.linspace( + self.x_org[pixels_old[i][2]], + self.x_org[pixels_old[i][3]], + abs(pixels_old[i][3]-pixels_old[i][2]))), + self.x_final[i], + self.y_final[i]) + + return ref + + self.compare_extends() + + # If bounding box is within limits of dataset + if self.cuttable: + # Cut to the bounding box + self.cut_to_boundingbox() + # Determine interpolation vectors + self.determine_reference_vectors() + # Depending on dataset size determine interpolation approach + self.determine_interpolation_approach() + + if self.one_go: + # Interpolate dataset + self.array = self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x, + self.y) + + # If original dataset has to be split into chunks + elif self.as_chunks: + # Split the dataset into chunks + chunks_old, pixels_old = self.split_into_chunks() + # Determine interpolation vectors for each chunk + self.determine_new_vector() + + #ref_tmp = Parallel(n_jobs=5, + # backend='threading', + # timeout=999999) + #(delayed(test_parallel_interpolation)(num) + # for num in range(len(self.x_final))) + ref_tmp = [] + for num in range(len(self.x_final)): + ref_tmp.append(test_parallel_interpolation(num)) + + # Combine the individual interpolated + # chunks into one dataset + self.array = self.reshape_chunks(ref_tmp) + + elif self.as_cols: + + self.split_into_chunks() # Split the dataset into chunks + + ref_tmp = [] + # Go through all chunks and interpolate them individually + for i in range(len(self.x_final)): + ref = self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x_final[i], + self.y_final[i]) + ref_tmp.append(list(ref)) + + self.array = self.reshape_chunks(ref_tmp) + + # If a path is provided, the cutting and interpolation + # information is saved in a pickle file + if self.path_properties is not None: + with open(self.path_properties, 'wb') as handle: + pickle.dump(self.properties, handle) 
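Each branch above ultimately calls interpolate_dataset() with a data block, its original coordinate vectors and the target vectors. That method is not shown in this excerpt; the sketch below only illustrates the kind of regridding such a call implies, using scipy's interp2d (which this module imports) on synthetic data:

import numpy as np
from scipy.interpolate import interp2d

x_org = np.linspace(11.0, 12.0, 50)       # original grid
y_org = np.linspace(46.0, 47.0, 40)
data = np.random.rand(len(y_org), len(x_org))

x_new = np.linspace(11.0, 12.0, 200)      # target-resolution vectors
y_new = np.linspace(46.0, 47.0, 160)

f = interp2d(x_org, y_org, data, kind='linear')
regridded = f(x_new, y_new)               # shape (len(y_new), len(x_new))
print(regridded.shape)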
+ + else: + # Check if bounding box is covered by limits of dataset + self.compare_extends() + + # If bounding box is within limits of dataset + if self.cuttable: + + self.cut_to_boundingbox() # Cut to the bounding box + # Determine interpolation vectors + self.determine_reference_vectors() + # Depending on dataset size determine interpolation approach + self.determine_interpolation_approach() + + # If interpolation can be done in one go + if self.one_go: + + # Interpolate dataset + self.array = self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x, + self.y) + + # If original dataset has to be split into chunks + elif self.as_chunks: + # Split the dataset into chunks + chunks_old, pixels_old = self.split_into_chunks() + # Determine interpolation vectors for each chunk + self.determine_new_vector() + + ref_tmp = [] + # Go through all chunks and interpolate them individually + for i in range(len(chunks_old)): + ref = self.interpolate_dataset( + np.array(chunks_old[i]), + np.array(np.linspace( + self.y_org[pixels_old[i][0]], + self.y_org[pixels_old[i][1]], + abs(pixels_old[i][1]-pixels_old[i][0]))), + np.array(np.linspace( + self.x_org[pixels_old[i][2]], + self.x_org[pixels_old[i][3]], + abs(pixels_old[i][3]-pixels_old[i][2]))), + self.x_final[i], + self.y_final[i]) + ref_tmp.append(list(ref)) + + # Combine the individual interpolated + # chunks into one dataset + self.array = self.reshape_chunks(ref_tmp) + + elif self.as_cols: + + self.split_into_chunks() # Split the dataset into chunks + + ref_tmp = [] + # Go through all chunks and interpolate them individually + for i in range(len(self.x_final)): + ref = self.interpolate_dataset(self.data, + self.y_org, + self.x_org, + self.x_final[i], + self.y_final[i]) + ref_tmp.append(list(ref)) + # Combine the individual interpolated chunks + # into one dataset + self.array = self.reshape_chunks(ref_tmp) + + # If a path is provided, the cutting and interpolation + # information is saved in a pickle file + if self.path_properties is not None: + with open(self.path_properties, 'wb') as handle: + pickle.dump(self.properties, handle) + + def compare_extends(self): + + """ + Determine if the bounding box to which the dataset shall be cut is + completely covered by the dataset. + If not, the execution of the script will be aborted. 
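The path_properties pickle is what keeps several datasets on a common grid: the first dataset stores its cut indices and target interpolation vectors, and later datasets reload them instead of recomputing. A sketch of that bookkeeping with the dictionary keys used above, synthetic values and a placeholder file name:

import pickle
import numpy as np

properties = {
    'interp_vectors': {'x': np.linspace(11.0, 12.0, 200),
                       'y': np.linspace(47.0, 46.0, 160)},
    'boundaries': {'top': 10, 'bottom': 170, 'left': 20, 'right': 220},
}

with open('data_combined_training_25.pkl', 'wb') as handle:
    pickle.dump(properties, handle)

with open('data_combined_training_25.pkl', 'rb') as handle:
    loaded = pickle.load(handle)

# Later datasets rebuild their bounding box from the stored vectors
bb = [loaded['interp_vectors']['y'][0], loaded['interp_vectors']['y'][-1],
      loaded['interp_vectors']['x'][0], loaded['interp_vectors']['x'][-1]]
print(bb)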
+ """ + + self.cuttable = True + self.left_too_short = False + self.right_too_short = False + self.bottom_too_short = False + self.top_too_short = False + y, x = [], [] + + for coord in [self.y_org[0], self.y_org[-1], self.bb[0], self.bb[1]]: + + if coord >= 0: + y.append(90 + coord) + + if coord < 0: + y.append(90 - abs(coord)) + + for coord in [self.x_org[0], self.x_org[-1], self.bb[2], self.bb[3]]: + + if coord >= 0: + x.append(180 + coord) + + if coord < 0: + x.append(180 - abs(coord)) + + if y[2] > y[0]: + self.top_too_short = True + if y[3] < y[1]: + self.bottom_too_short = True + if x[2] < x[0]: + self.left_too_short = True + if x[3] > x[1]: + self.right_too_short = True + + if (self.bottom_too_short or self.top_too_short + or self.left_too_short or self.right_too_short): + self.cuttable = False + self.array = None + self.x = None + self.y = None + + return self.cuttable + + def cut_to_boundingbox(self): + + """ + Cut the dataset to the bounding box + """ + + if self.several_same and not self.first: + + # Load the indices of the bounding box from the properties file + self.top = self.properties['boundaries']['top'] + self.bottom = self.properties['boundaries']['bottom'] + self.left = self.properties['boundaries']['left'] + self.right = self.properties['boundaries']['right'] + + else: + # If several datasets shall be interpolated after another and the + # current run is the first dataset + if (self.several and self.first) or (self.several_same and self.first): + # Open empty dictionary to store the cutting and + # interpolation information in + self.properties = {} + + # Determine if the coordinate vectors + # contain both pos and neg values + if (all(val >= 0 for val in self.x_org) + or all(val <= 0 for val in self.x_org)): + + # Determine pixel index of left and right edge of bounding box + self.left = int((np.abs(self.x_org - self.bb[2])).argmin()) + self.right = int((np.abs(self.x_org - self.bb[3])).argmin()) + + else: + + if self.bb[2] <= 0: + tmp = [x for x in self.x_org if x <= 0] + else: + tmp = [x for x in self.x_org if x >= 0] + + self.left = list(self.x_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[2])).argmin())]) + + if self.bb[3] <= 0: + tmp = [x for x in self.x_org if x <= 0] + else: + tmp = [x for x in self.x_org if x >= 0] + + self.right = list(self.x_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[3])).argmin())]) + + if (all(val >= 0 for val in self.y_org) + or all(val <= 0 for val in self.y_org)): + + # Determine pixel index of top and bottom edge of bounding box + self.top = int((np.abs(self.y_org - self.bb[0])).argmin()) + self.bottom = int((np.abs(self.y_org - self.bb[1])).argmin()) + + else: + + if self.bb[0] <= 0: + tmp = [y for y in self.y_org if y <= 0] + else: + tmp = [y for y in self.y_org if y >= 0] + + self.top = list(self.y_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[0])).argmin())]) + + if self.bb[1] <= 0: + tmp = [y for y in self.y_org if y <= 0] + else: + tmp = [y for y in self.y_org if y >= 0] + + self.bottom = list(self.y_org).index( + tmp[int((np.abs(np.array(tmp) - self.bb[1])).argmin())]) + + # Add pixel in all directions to account for rounding issues + + if not self.for_prediction: + if self.left-100 >= 0: + self.left = self.left - 100 + if self.top-100 >= 0: + self.top = self.top - 100 + if self.bottom+100 <= np.shape(self.data)[0]: + self.bottom = self.bottom + 100 + if self.right+100 <= np.shape(self.data)[1]: + self.right = self.right + 100 + + if self.several_same and self.first: + # Store the indices to be used 
again with the next dataset + self.properties['boundaries'] = {} + self.properties['boundaries']['top'] = self.top + self.properties['boundaries']['bottom'] = self.bottom + self.properties['boundaries']['left'] = self.left + self.properties['boundaries']['right'] = self.right + + # Cut the dataset and x, y vectors to the determined extent + self.data = self.data[self.top:self.bottom, self.left:self.right] + + self.x_org = self.x_org[self.left:self.right] + self.y_org = self.y_org[self.top:self.bottom] + + def determine_reference_vectors(self): + + """ + Determine interpolation vectors x and y. + """ + + # If several datasets shall be interpolated after another and the + # current run is the first dataset + if self.several and self.first: + + # Determine distance in meters in x and y + # direction between bounds of dataset + point1_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[-1])) + distance_x = point1_x.distance(point2_x)*1000 + + point1_y = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_y = LatLon(Latitude(self.y_org[-1]), + Longitude(self.x_org[0])) + distance_y = point1_y.distance(point2_y)*1000 + + # Determine interpolation vector with desired resolution + self.x = np.linspace(self.x_org[0], + self.x_org[-1], + int(distance_x/settings.resolution)) + self.y = np.linspace(self.y_org[0], + self.y_org[-1], + int(distance_y/settings.resolution)) + + # Store interpolation vector in properties file + self.properties['interp_vectors'] = {} + self.properties['interp_vectors']['x'] = self.x + self.properties['interp_vectors']['y'] = self.y + + # If only one dataset shall be interpolated + elif not self.several: + + # Determine distance in meters in x and y + # direction between bounds of dataset + point1_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_x = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[-1])) + distance_x = point1_x.distance(point2_x)*1000 + + point1_y = LatLon(Latitude(self.y_org[0]), + Longitude(self.x_org[0])) + point2_y = LatLon(Latitude(self.y_org[-1]), + Longitude(self.x_org[0])) + distance_y = point1_y.distance(point2_y)*1000 + + # Determine interpolation vector with desired resolution + self.x = np.linspace(self.x_org[0], + self.x_org[-1], + int(distance_x/settings.resolution)) + self.y = np.linspace(self.y_org[0], + self.y_org[-1], + int(distance_y/settings.resolution)) + + # If several datasets shall be interpolated after another and the + # current run is not the first dataset + elif self.several and not self.first: + # Load the interpolation vectors from the properties file + self.x = np.array(self.properties['interp_vectors']['x']) + self.y = np.array(self.properties['interp_vectors']['y']) + + def determine_new_vector(self): + + """ + Determine interpolation vectors for the chunks. 
+ """ + + # For each chunk determine the original x and y vectors + x_ref = [[self.x_org[self.x_limits[i][0]], + self.x_org[self.x_limits[i][1]]] + for i in range(len(self.x_limits))] + y_ref = [[self.y_org[self.y_limits[i][0]], + self.y_org[self.y_limits[i][1]]] + for i in range(len(self.y_limits))] + + self.x_final = [] + self.y_final = [] + + # For each original vector find the corresponding values in the + # interpolation vectors + for j in range(np.shape(x_ref)[0]): + ind_min_x = int((np.abs(self.x - x_ref[j][0])).argmin()) + ind_max_x = int((np.abs(self.x - x_ref[j][1])).argmin()) + + self.x_final.append(self.x[ind_min_x:ind_max_x]) + + for j in range(np.shape(y_ref)[0]): + ind_min_y = int((np.abs(self.y - y_ref[j][0])).argmin()) + ind_max_y = int((np.abs(self.y - y_ref[j][1])).argmin()) + + self.y_final.append(self.y[ind_min_y:ind_max_y]) + + def split_into_chunks(self): + + """ + Split the dataset into chunks for interpolation. Make sure that + the chunks overlap. + """ + + # If the dataset needs to be split into chunks + if self.as_chunks: + + y_len, x_len = np.shape(self.data)[0], np.shape(self.data)[1] + + # Split in equal sized chunks and treat the bottom and right + # differently that have different shape than the equal sized chunks + plus_y = self.data.shape[0] % self.size + plus_x = self.data.shape[1] % self.size + + # Number of equal sized chunks in x and y direction + num_y = int(self.data.shape[0] / self.size) + num_x = int(self.data.shape[1] / self.size) + + # If final columns and row too small to be called individual + # chunks, combine with second to last row and column + if plus_y < 2/3*self.size: + num_y = num_y - 1 + + if plus_x < 2/3*self.size: + num_x = num_x - 1 + + self.num_y = num_y + self.num_x = num_x + + chunks = [] # Store the chunks + pixels = [] # Store the pixel limits to acces original coordinates + count = 0 + + # Store the coord limits to acces original coordinates + self.x_limits = [] + self.y_limits = [] + + # Save the chunks in a list + count_ges = 0 + tmpy = 0 + for i in range(num_y): + tmpx = 0 + for j in range(num_x): + # Make sure that with the overlap the boundaries are not exceeded + if ((i+1)*self.size-1+self.overlap <= self.data.shape[0]) and ((j+1)*self.size-1+self.overlap <= self.data.shape[1]): + chunks.append( + list(self.data[i*self.size:(i+1)*self.size-1+self.overlap, + j*self.size:(j+1)*self.size-1+self.overlap])) + pixels.append( + [i*self.size, (i+1)*self.size-1+self.overlap, + j*self.size, (j+1)*self.size-1+self.overlap]) + + self.x_limits.append([j*self.size, (j+1)*self.size-1+self.overlap]) + self.y_limits.append([i*self.size, (i+1)*self.size-1+self.overlap]) + + elif ((i+1)*self.size-1+self.overlap > self.data.shape[0]) and ((j+1)*self.size-1+self.overlap <= self.data.shape[1]): + + chunks.append( + list(self.data[i*self.size:, + j*self.size:(j+1)*self.size-1+self.overlap])) + pixels.append( + [i*self.size, np.shape(self.data)[0]-1, + j*self.size, (j+1)*self.size-1+self.overlap]) + elif ((j+1)*self.size-1+self.overlap > self.data.shape[1]) and ((i+1)*self.size-1+self.overlap <= self.data.shape[0]): + chunks.append( + list(self.data[i*self.size:(i+1)*self.size-1+self.overlap, + j*self.size:])) + pixels.append( + [i*self.size, (i+1)*self.size-1+self.overlap, + j*self.size, np.shape(self.data)[1]-1]) + elif ((j+1)*self.size-1+self.overlap > self.data.shape[1]) and ((i+1)*self.size-1+self.overlap > self.data.shape[0]): + chunks.append( + list(self.data[i*self.size:, + j*self.size:])) + pixels.append( + [i*self.size, 
np.shape(self.data)[0]-1, + j*self.size, np.shape(self.data)[1]-1]) + tmpy = tmpy + 1 + + # Chunks most bottom column + tmpx = 0 + for j in range(num_x): + if ((j+1)*self.size-1+self.overlap <= self.data.shape[1]): + chunks.append( + list(self.data[(num_y)*self.size:-1, + j*self.size:(j+1)*self.size-1+self.overlap])) + pixels.append( + [(num_y)*self.size, np.shape(self.data)[0]-1, + j*self.size, (j+1)*self.size-1+self.overlap]) + self.x_limits.append([j*self.size, (j+1)*self.size-1+self.overlap]) + self.y_limits.append([(num_y)*self.size, np.shape(self.data)[0]-1]) + else: + chunks.append( + list(self.data[(num_y)*self.size:-1, + j*self.size:])) + pixels.append( + [(num_y)*self.size, np.shape(self.data)[0]-1, + j*self.size, np.shape(self.data)[1]-1]) + self.x_limits.append([j*self.size, (j+1)*self.size-1]) + + # Chunks most right column + tmpy = 0 + for j in range(num_y): + if ((j+1)*self.size-1+self.overlap <= self.data.shape[0]): + chunks.append( + list(self.data[j*self.size:(j+1)*self.size-1+self.overlap, + (num_x)*self.size:-1])) + pixels.append( + [j*self.size, (j+1)*self.size-1+self.overlap, + (num_x)*self.size, x_len-1]) + self.y_limits.append([j*self.size, (j+1)*self.size-1+self.overlap]) + self.x_limits.append([(num_x)*self.size, x_len-1]) + else: + chunks.append( + list(self.data[j*self.size:-1, + (num_x)*self.size:-1])) + pixels.append( + [j*self.size, np.shape(self.data)[0]-1, + (num_x)*self.size, x_len-1]) + self.y_limits.append([j*self.size, (j+1)*self.size-1]) + + # Chunk bottom right + chunks.append( + list(self.data[num_y*self.size:-1, + num_x*self.size:-1])) + pixels.append( + [num_y*self.size, y_len-1, + num_x*self.size, x_len-1]) + + # Save corner indices for the chunks + self.x_limits.append([num_x*self.size, x_len-1]) + self.y_limits.append([num_y*self.size, y_len-1]) + + return chunks, pixels + + # If dataset is interpolated columns-wise + elif self.as_cols: + + chunks, pixels = None, None + self.x_limits = [[], [], [], [], [], [], [], []] + + # Determine columns to be interpolated in each chunk + i = 0 + while i <= len(self.x): + for j in range(len(self.x_limits)): + if i+j <= len(self.x)-1: + self.x_limits[j].append(i + j) + i = i + j + 1 + + # Determine the coordinates in the interpolation vector + self.x_final = [[], [], [], [], [], [], [], []] + self.y_final = [] + + for i in range(len(self.x_limits)): + for j in self.x_limits[i]: + self.x_final[i].append(self.x[j]) + self.y_final.append(list(self.y)) + + def determine_interpolation_approach(self): + + """ + Depending on the siz of the original dataset and the size of the + dataset after the interpolation, the computational power + might be exceeded and the dataset needs to be + split up to be interpolated. + + Different cases are covered in this function and depending + on the sizes, the approach is determined. 
+
+        Approaches:
+            one_go: dataset before and after interpolation small enough
+                    to be interpolated in one go
+            as_chunks: dataset before interpolation already so large
+                       that it needs to be split into chunks which
+                       are then interpolated independently
+            as_cols: dataset after interpolation so large that the
+                     interpolation is done column-wise
+        """
+
+        # If only one dataset is interpolated or the current run is the
+        # first of several datasets
+
+        if not self.several or (self.several and self.first):
+            if len(self.x_org) < 2*self.size and len(self.y_org) < 2*self.size:
+                self.one_go, self.as_chunks, self.as_cols = True, False, False
+            else:
+                # Compare the dataset sizes before and after interpolation
+                # with the limits to decide on the interpolation approach
+                if ((len(self.x) * len(self.y) < self.limit_interp)
+                        and (len(self.x_org) * len(self.y_org) < self.limit_org)):
+                    self.one_go, self.as_chunks, self.as_cols = True, False, False
+
+                elif len(self.x_org) * len(self.y_org) >= self.limit_org:
+                    self.one_go, self.as_chunks, self.as_cols = False, True, False
+
+                elif (len(self.x) * len(self.y) > self.limit_interp):
+                    self.one_go, self.as_chunks, self.as_cols = False, False, True
+
+            if self.several and self.first:
+
+                self.properties['interp_approach'] = {}
+                self.properties['interp_approach']['one_go'] = self.one_go
+                self.properties['interp_approach']['as_chunks'] = self.as_chunks
+                self.properties['interp_approach']['as_cols'] = self.as_cols
+
+        # If several datasets shall be interpolated after another and the
+        # current run is not the first dataset
+        elif self.several and not self.first:
+
+            # Load the interpolation approach from the properties file
+            self.one_go = self.properties['interp_approach']['one_go']
+            self.as_chunks = self.properties['interp_approach']['as_chunks']
+            self.as_cols = self.properties['interp_approach']['as_cols']
+
+    def interpolate_dataset(self, data, y, x, x_new, y_new):
+
+        """
+        Interpolate dataset. 
Categorical data is interpolated using + nearest neighbor first into x direction then into y direction + + Input: + data: data to interpolate, depending on the interpolation + appraoch the whole dataset or a chunk + y: original y vector + x: original x vector + x_new: interpolation vector x + y_new: interpolation vector y + + Return: + data_interp: interpolated data + """ + + # Interpolation vectors + x_new = np.array(x_new) + y_new = np.array(y_new) + + # Make sure that no data values do not corrupt the interpolation + data = data.astype(float) + exists = np.any(data == settings.no_value) + data[data == settings.no_value] = np.nan + + if self.categorical==False: + data = np.flipud(data) + if exists: + nan_map = np.zeros_like(data) + nan_map[np.isnan(data)] = 1 + filled_z = data.copy() + filled_z[np.isnan(data)] = 0 + # Interpolation + f = interp2d(x, np.flip(y), filled_z, kind='linear') + data_interp = f(x_new, y_new) + if exists: + f_nan = interp2d(x, np.flip(y), nan_map, kind='linear') + nan_new = f_nan(x_new, y_new) + data_interp[nan_new > 0] = settings.no_value + + return np.flipud(data_interp) + + # If data is categorical + elif self.categorical==True: + + if exists: + nan_map = np.zeros_like(data) + nan_map[np.isnan(data)] = 1 + filled_z = data.copy() + filled_z[np.isnan(data)] = 0 + + data_interp_x = np.zeros((len(y), len(x_new))) + if exists: + nan_interp_x = np.zeros((len(y), len(x_new))) + + # Interpolate first in x direction + for i in range(len(y)): + + tmp = filled_z[i, :] + f = interp1d(x, tmp, kind='nearest', fill_value="extrapolate") + data_interp_x[i, :] = f(x_new) + + if exists: + tmp = nan_map[i, :] + f = interp1d(x, tmp, kind='nearest', fill_value="extrapolate") + nan_interp_x[i, :] = f(x_new) + + # Define empty arrays to be filled + data_interp = np.zeros((len(y_new), len(x_new))) + if exists: + nan_interp = np.zeros((len(y_new), len(x_new))) + + # Then interpolate in y direction + for i in range(len(x_new)): + + tmp = data_interp_x[:, i] + f = interp1d(y, tmp, kind='nearest', fill_value="extrapolate") + data_interp[:, i] = f(y_new) + + if exists: + tmp = nan_interp_x[:, i] + f = interp1d(y, tmp, kind='nearest', fill_value="extrapolate") + nan_interp[:, i] = f(y_new) + + # Set all by nan values affected pixels to no data value + data_interp[nan_interp > 0] = settings.no_value + + return data_interp + + def reshape_chunks(self, chunks): + + """ + Interpolated chunks are attached to form the interpolated dataset. + The chunks overlap and for categorical features, only one version + is used. For continuous features, the overlapping parts are averaged. 
+ + Input: + chunks: interpolated chunks, list of lists + """ + + if self.as_chunks: + array = np.zeros((len(self.y), len(self.x))) + aa = np.zeros((len(self.y), len(self.x))) + test = np.zeros((len(self.y), len(self.x))) + + shape_x, shape_y = [], [] + for chunk in chunks: + shape_x.append(np.shape(np.array(chunk))[1]) + shape_y.append(np.shape(np.array(chunk))[0]) + + count = 0 + for count, chunk in enumerate(chunks): + xt = int((np.abs(self.x - self.x_final[count][0])).argmin()) + yt = int((np.abs(self.y - self.y_final[count][0])).argmin()) + + tmp = np.array(chunks[count]) + tmp1 = array[yt:yt+shape_y[count], xt:xt+shape_x[count]] + aa[yt:yt+shape_y[count], xt:xt+shape_x[count]] = tmp + + mask = (tmp1 == 0) | (tmp1 == -999) | (tmp == -999) + + if not self.categorical: + # Calculate the element-wise average only where mask is False + average_array = np.zeros_like(tmp, dtype=float) # Initialize array for the result + average_array[~mask] = (tmp[~mask] + tmp1[~mask]) / 2 + + # Assign elements from arr2 where arr1 is equal to zero + average_array[mask] = tmp[mask] + + array[yt:yt+shape_y[count], xt:xt+shape_x[count]] = average_array + + tmp = np.ones_like(tmp, dtype=float)*count + 1 + tmp1 = test[yt:yt+shape_y[count], xt:xt+shape_x[count]] + + mask = (tmp1 == 0) + + # Calculate the element-wise average only where mask is False + average_array = np.zeros_like(tmp, dtype=float) # Initialize array for the result + average_array[~mask] = (tmp[~mask] + tmp1[~mask]) / 2 + + # Assign elements from arr2 where arr1 is equal to zero + average_array[mask] = tmp[mask] + + test[yt:yt+shape_y[count], xt:xt+shape_x[count]] = average_array + + elif self.categorical: + + average_array = np.zeros_like(tmp, dtype=float) # Initialize array for the result + average_array[~mask] = (tmp[~mask] + tmp1[~mask]) / 2 + + # Assign elements from arr2 where arr1 is equal to zero + average_array[mask] = tmp[mask] + + array[yt:yt+shape_y[count], xt:xt+shape_x[count]] = tmp + test[yt:yt+shape_y[count], xt:xt+shape_x[count]] = average_array + + self.test = test.copy() + elif self.as_cols: + # Final array to be filled + array = np.zeros((len(self.y), len(self.x))) + + for i in range(len(chunks)): + array[:, self.x_limits[i]] = np.array(chunks[i]) + + return array + diff --git a/src/plain_scripts/utilities/handle_categorical_values.py b/src/plain_scripts/utilities/handle_categorical_values.py new file mode 100644 index 0000000000000000000000000000000000000000..3324229d781bc5a6534553802ae4db8614b353c5 --- /dev/null +++ b/src/plain_scripts/utilities/handle_categorical_values.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pandas as pd +import numpy as np + +from sklearn.preprocessing import OneHotEncoder + +def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): + + """ + Categorical features in the training dataset are either one hot + encoded or ordinal encoded + + Input: + df: DataFrame containing continuous and categorical features, Pandas DataFrame + datasets_summary: Information on the datasets from which the values in df have been extracted, Pandas DataFrame + ohe: True for One-hot encoding, False for ordinal encoding, Boolean + basic: columns in df not to be considered such as coordinates, ID and label, list + var: specific features to consider only, list + + """ + + if var == None: + cat = [] + for feat in df.columns.tolist(): + if feat not in basic: + index = datasets_summary['keys'].tolist().index(feat) + if bool(datasets_summary['categorical'].tolist()[index]) == 
True:
+                    cat.append(feat)
+    else:
+        cat = []
+        for feat in var:
+            index = datasets_summary['keys'].tolist().index(feat)
+            if bool(datasets_summary['categorical'].tolist()[index]):
+                cat.append(feat)
+
+    if len(cat) > 0:
+        if ohe:
+            encoder = OneHotEncoder(sparse=False)
+            encoded_data = encoder.fit_transform(df[cat])
+
+            # Build the column names from encoder.categories_ so that they
+            # line up with the column order of the encoded output
+            custom_column_names = []
+            for col, col_categories in zip(cat, encoder.categories_):
+                for unique_value in col_categories:
+                    if isinstance(unique_value, (float, np.float32)):
+                        unique_value = int(unique_value)
+                    custom_column_names.append(f'{col}_{str(unique_value)}_encode')
+            encoded_df = pd.DataFrame(encoded_data, columns=custom_column_names)
+            df = pd.concat([df.drop(columns=cat), encoded_df], axis=1)
+        else:
+            # OrdinalEncoder is imported here since only OneHotEncoder is
+            # imported at module level
+            from sklearn.preprocessing import OrdinalEncoder
+
+            columns_to_encode = df.select_dtypes(include=['object', 'category']).columns.tolist()
+            encoder = OrdinalEncoder()
+            encoded_data = encoder.fit_transform(df[columns_to_encode])
+            encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encoded" for col in columns_to_encode])
+            df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1)
+
+    return df
+
+    
\ No newline at end of file
diff --git a/src/plain_scripts/utilities/import_format.py b/src/plain_scripts/utilities/import_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b31672d7403b9bb33878dfb3aa946f158882b61
--- /dev/null
+++ b/src/plain_scripts/utilities/import_format.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import rasterio
+import numpy as np
+import netCDF4 as nc
+import pandas as pd
+
+def import_tif(path):
+
+    """
+    Import a geotiff file
+
+    Input:
+        path: Path to the tif file to open, string
+    """
+
+    raster = rasterio.open(path, 'r')
+    data = raster.read()[0, :, :]
+
+    if np.dtype(data[0, 0]) == 'uint8':
+        data = np.int32(data)
+
+    bounds = raster.bounds
+    x = np.linspace(bounds[0], bounds[2], np.shape(data)[1])
+    y = np.linspace(bounds[1], bounds[3], np.shape(data)[0])
+    crs = raster.crs
+
+    if y[0] < y[-1]:
+        y = np.flip(y)
+
+    return data, x, y, crs
+
+
+def import_nc(path):
+
+    """
+    Import a netCDF4 file and the contained metadata
+
+    Input:
+        path: Path to the netCDF4 file to open, string
+    """
+
+    ds = nc.Dataset(path)
+    x = ds['Longitude'][:]
+    y = ds['Latitude'][:]
+
+    if 'Result' in ds.variables.keys():
+        data = ds['Result'][:][:]
+        data = np.float64(data)
+        data = data.data
+    else:
+        data = None
+
+    if 'Time' in ds.variables.keys():
+        data = ds['Result'][:][:]
+        data = data.data
+
+    crs = ds['Longitude'].units
+
+    x = x.data
+    y = y.data
+
+    if y[0] < y[-1]:
+        y = np.flip(y)
+
+    return data, x, y, crs
+
+
+def import_csv(path):
+
+    """
+    Import a csv file
+
+    Input:
+        path: Path to the csv file to open, string
+    """
+
+    df = pd.read_csv(path)
+
+    return df
diff --git a/src/plain_scripts/utilities/import_raw_dataset.py b/src/plain_scripts/utilities/import_raw_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdbbe4b592dc9de43331613b4cef3eb39e38b043
--- /dev/null
+++ b/src/plain_scripts/utilities/import_raw_dataset.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+from utilities.import_format import import_tif, import_nc
+
+def import_raw_dataset(path, no_data, no_value):
+
+    """
+    Import Tif or netCDF4 file
+
+    Input:
+        path: path to the dataset, string
+        no_data: no data values, list
+        no_value: general no data value, int or float
+
+    Output:
+        data: dataset, numpy array
+        x_org: 
longitude coordinates, list + y_org: latitude coordinates, list + + """ + + warning = False + if path.split('.')[-1] == 'tif': + data, x_org, y_org, _ = import_tif(path) + elif path.split('.')[-1] == 'nc': + data, x_org, y_org, _ = import_nc(path) + else: + warning = True + + if y_org[0] < y_org[-1]: + y_org = np.flip(y_org) + + if no_data != 'None': + for val in no_data: + data[data == val] = no_value + + if warning: + return None, None, None + else: + return data, x_org, y_org + diff --git a/src/plain_scripts/utilities/initialise_log.py b/src/plain_scripts/utilities/initialise_log.py new file mode 100644 index 0000000000000000000000000000000000000000..55aec1f7e43fdeade58937538e17d1f2c1035275 --- /dev/null +++ b/src/plain_scripts/utilities/initialise_log.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import logging + +def save_log(path): + + """ + Initialisation of a log file using the python package logging to store + information, warnings and errors + + Input: + path: Path where to store the log file + Output: + logger: Logger + + """ + + path_log = os.path.dirname(path) + logger = logging.getLogger() + logger.setLevel(logging.INFO) + formatter = logging.Formatter( + '%(asctime)s | %(levelname)s | %(message)s') + + file_handler = logging.FileHandler(path) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(formatter) + + logger.addHandler(file_handler) + + return logger \ No newline at end of file diff --git a/src/plain_scripts/utilities/ncfile_generation.py b/src/plain_scripts/utilities/ncfile_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..f6879c17863fb442958e2a426abd7e73cf7366ca --- /dev/null +++ b/src/plain_scripts/utilities/ncfile_generation.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import netCDF4 as nc +import settings + + +def generate_basic_ncfile(outfile, crs=None): + + """ + Initialise basic netCDF4 file + + Input: + Outfile: path to store the netcdf file, string + crs: coordinate reference system, string + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + + return ds + + +def generate_ncfile(outfile, x, y, data, crs=None, + data_unit=None, missing_value=settings.no_value): + + """ + Save 2D dataset as netCDF4 file + + Input: + Outfile: path to store the netcdf file, string + x: longitude vector, list + y: latitude vector, list + data: 2D data array + crs: coordinate reference system, string + data_unit: data unit, string + missing_value: no data value, integer or float + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + ds.createDimension('lon', len(x)) + ds.createDimension('lat', len(y)) + longitude = ds.createVariable('Longitude', 'f4', 'lon') + latitude = ds.createVariable('Latitude', 'f4', 'lat') + result = ds.createVariable('Result', 'f4', ('lat', 'lon')) + + longitude[:] = x + latitude[:] = y + result[:, :] = data + + # Provide global information in output-file + if crs is not None: + longitude.units = crs + latitude.units = crs + if data_unit is not None: + result.units = data_unit + ds.missing_value = missing_value + ds.close() + + +def generate_3dncfile(outfile, x, y, data, dim, features, crs='wgs84', + data_unit=None, missing_value=settings.no_value): + + """ + Save 3D dataset as netCDF4 file, e.g. 
data cube + + Input: + Outfile: path to store the netcdf file, string + x: longitude vector, list + y: latitude vector, list + dim: number of 2D datasets, integer + data: 2D data array + features: contained features in prediction dataset, list of chars + crs: coordinate reference system, string + data_unit: data unit, string + missing_value: no data value, integer or float + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + ds.createDimension('lon', len(x)) + ds.createDimension('lat', len(y)) + ds.createDimension('dim', dim) + ds.createDimension('feat', len(features)) + longitude = ds.createVariable('Longitude', 'f4', 'lon') + latitude = ds.createVariable('Latitude', 'f4', 'lat') + result = ds.createVariable('Result', 'f4', ('lat', 'lon', 'dim')) + Features = ds.createVariable('features', 'S1', 'feat') + + longitude[:] = x + latitude[:] = y + result[:, :, :] = data + Features[:] = features + + # Provide global information in output-file + if crs is not None: + longitude.units = crs + latitude.units = crs + if data_unit is not None: + result.units = data_unit + ds.missing_value = missing_value + ds.close() + + +def generate_2dncfile(outfile, x, y, data, features, crs='wgs84', + data_unit=None, missing_value=settings.no_value): + + """ + Save 2D dataset as netCDF4 file, e.g. Prediction dataset + + Input: + Outfile: path to store the netcdf file, string + x: longitude vector, list + y: latitude vector, list + data: 2D data array + features: contained features in prediction dataset, list of chars + crs: coordinate reference system, string + data_unit: data unit, string + missing_value: no data value, integer or float + """ + + # If outfile exists already, delete + if os.path.exists(outfile): + os.remove(outfile) + + ds = nc.Dataset(outfile, 'w', format='NETCDF4') + ds.createDimension('lon', len(x)) + ds.createDimension('lat', len(y)) + ds.createDimension('feat', len(features)) + longitude = ds.createVariable('Longitude', 'f4', 'lon') + latitude = ds.createVariable('Latitude', 'f4', 'lat') + result = ds.createVariable('Result', 'f4', ('lat', 'lon')) + Features = ds.createVariable('features', 'S1', 'feat') + + longitude[:] = x + latitude[:] = y + result[:, :] = data + Features[:] = features + + # Provide global information in output-file + if crs is not None: + longitude.units = crs + latitude.units = crs + if data_unit is not None: + result.units = data_unit + ds.missing_value = missing_value + ds.close() diff --git a/src/plain_scripts/utilities/properties_user_input.csv b/src/plain_scripts/utilities/properties_user_input.csv new file mode 100644 index 0000000000000000000000000000000000000000..95a1a6cc7299f2a9984c0ab2a1eb3a3f9c0e525d --- /dev/null +++ b/src/plain_scripts/utilities/properties_user_input.csv @@ -0,0 +1,42 @@ +key,type,range,extension,path +ls_path,str,None,csv,1 +nonls_path,str,None,nc,1 +train_path,str,None,csv,1 +geo_path,str,None,csv,1 +feat_path,str,None,csv,1 +x,str,None,None,0 +y,str,None,None,0 +id,str,None,None,0 +x_nonls,str,None,None,0 +y_nonls,str,None,None,0 +num_nonls,"int,float",None,None,0 +from_scratch,"int,bool",None,"0,1",0 +delete,"int,bool",None,"0,1",0 +add,"int,bool",None,"0,1",0 +cluster,"int,bool",None,"0,1",0 +data_to_handle,str,None,None,0 +preprocess,str,None,None,0 +no_interpolation,"int,bool",None,"0,1",0 +interpolation,"int,bool",None,"0,1",0 +resolution,int,"1,inf",None,0 +random_seed,int,"1,inf",None,0 +crs,str,None,None,0 +no_value,int,None,None,0 
+train,bool,None,None,0
+pred,bool,None,None,0
+map,bool,None,None,0
+pred_path,str,None,nc,1
+east,"int,float","-180,180",None,0
+west,"int,float","-180,180",None,0
+north,"int,float","-90,90",None,0
+south,"int,float","-90,90",None,0
+model_path,str,None,None,1
+drop_train,str,None,None,0
+drop_pred,str,None,None,0
+model_to_load,str,None,None,0
+model_to_save,str,None,None,0
+num_trees,int,"1,inf",None,0
+size_val,"int,float","0,1",None,0
+depth_trees,int,"1,inf",None,0
+name_label,str,None,None,0
+criterion,str,None,None,0
\ No newline at end of file
diff --git a/src/plain_scripts/utilities/strings_for_ncfile.py b/src/plain_scripts/utilities/strings_for_ncfile.py
new file mode 100644
index 0000000000000000000000000000000000000000..26c74fb11a3542550f51d117ba16b3739f1bae47
--- /dev/null
+++ b/src/plain_scripts/utilities/strings_for_ncfile.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+def features_to_char(feat):
+
+    """
+    Turn a list of feature names into a list of single characters,
+    separated by '/', so it can be stored in the nc-file.
+    """
+
+    char_features = []
+    for feature in feat:
+        for letter in feature:
+            char_features.append(letter)
+        char_features.append('/')
+    char_features = char_features[0:-1]
+
+    return char_features
+
+
+def char_to_string(features):
+
+    """
+    Input:
+        features: list of features as chars
+
+    Return:
+        features as strings
+
+    Turns a list of chars into strings providing information on the
+    features contained in the nc-file. Feature names have to be
+    separated by '/'.
+    """
+
+    features_decode = []
+    for feature in features:
+
+        features_decode.append(feature.decode('UTF-8'))
+
+    tmp = ''.join(features_decode)
+
+    return tmp.split('/')
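As a quick illustration of the two helpers above, the following sketch (not part of the diff) round-trips a feature list through the character representation stored in the nc-file. The byte-encoding step only mimics what a netCDF4 'S1' variable hands back on read, and the feature names are made up.

# Assumes features_to_char and char_to_string from strings_for_ncfile.py
# are importable in the current namespace.
feature_names = ['slope', 'aspect', 'lithology_encode']

# Store: one character per list entry, feature names separated by '/'
chars = features_to_char(feature_names)
# -> ['s', 'l', 'o', 'p', 'e', '/', 'a', ...]

# Read back: netCDF4 returns the characters as bytes, so encode them here
# before decoding them again with char_to_string
stored = [c.encode('UTF-8') for c in chars]
assert char_to_string(stored) == feature_names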