create_prediction_data_gui.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
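"""
GUI step that creates or updates the prediction dataset
(netCDF file) used by the Random Forest classifier.
"""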
import numpy as np
import pandas as pd
import netCDF4 as nc
import os
import pickle
import logging
from tqdm import tqdm
from tkinter import Label
from data_preprocessing_gui import generate_data_matrix
from utilities.ncfile_generation import generate_basic_ncfile
from utilities.strings_for_ncfile import features_to_char, char_to_string
from utilities.handle_categorical_values import handle_categorical_values
class create_prediction_data:
"""
This class creates the prediction data
for the Random Forest classifier.
Input:
from_scratch: boolean,
True if the prediction dataset should be generated
from scratch, otherwise False
delete: boolean,
True if a dataset/feature should be deleted from the
prediction dataset,
False if a dataset should be added to the existing
prediction dataset
(careful: from_scratch needs to be False!)
data_to_handle: list of features that should be added/deleted;
datasets need to be listed in list_of_raw_datasets
Output:
netCDF4 file
"""
def __init__(self, master, log=None):
self.logger = log
self.import_parameters()
self.row = 0
self.master = master
self.master.geometry()
self.master.winfo_toplevel().title("Prediction dataset generation")
self.logger.info('Prediction dataset generation started')
Label(self.master, text="Log:").grid(row=self.row, column=0)
self.row = self.row + 1
Label(self.master,
text="Prediction dataset generation started").grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
if self.from_scratch:
self.logger.info('Prediction dataset is generated from scratch')
Label(self.master,
text='Cube of interpolated datasets is generated').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.s = generate_data_matrix(
from_scratch=True,
delete=False,
dataset='prediction',
bb=self.bb,
data_to_handle=self.data_to_handle,
geo_overview=self.datasets_summary,
settings=self.properties_settings,
settings_train_pred=self.properties_pred)
self.logger.info('Cube of interpolated datasets has been generated')
Label(self.master,
text='Cube of interpolated datasets has been generated').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.import_cube() # Import data cube
if not self.no_dataset_found:
Label(self.master,
text='Cube of interpolated datasets has been imported').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Add coordinate information to
# prediction dataset for map generation
self.add_coordinates()
Label(self.master,
text='Coordinates have been added').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Flatten data cube for efficient information extraction
self.flatten_cube()
Label(self.master,
text='Cube has been flattened').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.clean_df() # Determine no value instances in DataFrame
Label(self.master,
text='Dataset has been cleaned').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Save prediction dataset
self.handle_categorical_features()
Label(self.master,
text='Categorical features have been handled').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.char_features = features_to_char(self.df_pred.columns)
self.save_as_nc()
Label(self.master,
text='Prediction dataset generation successful').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
else:
Label(self.master,
text='Error: Cube of interpolated datasets has not been found!').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
elif not self.from_scratch and not self.delete:
self.logger.info(
'A feature will be added to existing prediction dataset')
self.logger.info('Feature to add: ' + str(self.data_to_handle))
Label(self.master,
text='Feature(s) will be added to an existing prediction dataset').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Import existing prediction dataset
self.import_prediction_dataset()
if self.pred_exist:
Label(self.master,
text='Prediction dataset has been imported').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Import data cube that contains
# the cut and interpolated dataset to be added
self.import_cube()
if not self.no_dataset_found:
Label(self.master,
text='Cube of interpolated datasets has been imported').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Check if datasets to be added are contained in the data cube
feature_to_add = []
for feature in self.data_to_handle:
not_included = False
if feature not in self.features:
self.logger.info(
str(feature)
+ ' not included in data cube,\
it has to be added first')
Label(self.master,
text=str(feature) + ' not included in cube').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
not_included = True
feature_to_add.append(feature)
if len(feature_to_add):
self.logger.info(str(feature_to_add)
+ ' will be appended to the data cube')
Label(self.master,
text=str(feature_to_add) + ' is appended to the cube').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.s = generate_data_matrix(
from_scratch=False,
delete=False,
dataset='prediction',
bb=self.bb,
data_to_handle=feature_to_add,
geo_overview=self.datasets_summary,
settings=self.properties_settings,
settings_train_pred=self.properties_pred,
keys_already_included=self.features)
self.logger.info('Data cube has been updated')
Label(self.master,
text='Cube has been updated').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.import_cube()
if not self.no_dataset_found:
self.add_feature() # Add feature
Label(self.master,
text='Feature(s) added to prediction dataset').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Save prediction dataset
self.clean_df()
Label(self.master,
text='Prediction dataset has been cleaned').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.handle_categorical_features(var=self.data_to_handle)
Label(self.master,
text='Categorical features have been handled').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.char_features = features_to_char(self.df_pred.columns)
self.save_as_nc()
Label(self.master,
text='Prediction dataset successfully updated').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
else:
Label(self.master,
text='Cube of interpolated datasets has not been found').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
else:
Label(self.master,
text='Error: Cube of interpolated datasets has not been found').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
elif not self.from_scratch and self.delete:
self.logger.info(
'A feature will be deleted from existing prediction dataset')
self.logger.info('Feature to delete: ' + str(self.data_to_handle))
Label(self.master,
text='Feature(s) will be deleted from existing prediction dataset').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
Label(self.master,
text='Feature(s) to delete: ' + str(self.data_to_handle)).grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Import existing prediction dataset
self.import_prediction_dataset()
if self.pred_exist:
Label(self.master,
text='Existing prediction dataset imported').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.delete_features() # Delete features from prediction dataset
Label(self.master,
text='Feature(s) deleted').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
# Save prediction dataset
self.char_features = features_to_char(self.df_pred.columns)
self.save_as_nc()
Label(self.master,
text='Prediction dataset successfully updated').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
def import_parameters(self):
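"""
Import the user input from the temporary files tmp_pred.pkl and
tmp_settings.pkl and derive the run parameters (from_scratch,
delete, data_to_handle, bounding box).
"""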
if os.path.exists('tmp_pred.pkl') and os.path.exists('tmp_settings.pkl'):
with open('tmp_pred.pkl', 'rb') as handle:
self.properties_pred = pickle.load(handle)
self.datasets_summary = pd.read_csv(
self.properties_pred['geo_path'])
self.data_to_handle = pd.read_csv(
self.properties_pred['feat_path'])
self.keys = pd.read_csv(
self.properties_pred['geo_path'])
self.keys = list(self.keys['keys'])
self.from_scratch = self.properties_pred['from_scratch'] == 1
self.data_to_handle = list(
self.data_to_handle['keys_to_include'])
if self.properties_pred['delete'] == 1:
self.delete = True
if self.properties_pred['add'] == 1:
self.delete = False
self.bb = [self.properties_pred['north'],
self.properties_pred['south'],
self.properties_pred['west'],
self.properties_pred['east']]
with open('tmp_settings.pkl', 'rb') as handle:
self.properties_settings = pickle.load(handle)
self.properties_settings['pred_path'] = \
self.properties_pred['pred_path']
else:
Label(self.master,
text='Error: user input files not found!').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.logger.error('Error: user input files not found!')
def add_feature(self):
"""
Add feature to the prediction dataset
"""
self.logger.info('Feature will be added')
for count, key in enumerate(self.data_to_handle):
# Delete feature if it already exists in the prediction dataset
if key in self.df_pred.columns:
self.logger.info('Feature already exists in dataset.\
Existing feature is deleted')
self.df_pred = self.df_pred.drop(key, axis=1)
self.logger.info('Adding ' + str(key))
Label(self.master,
text='Adding ' + str(key)).grid(row=self.row, column=1)
self.row = self.row + 1
self.master.update()
if count == 0:
# Create empty DataFrame
self.df_features = pd.DataFrame(index=range(len(self.df_pred)),
columns=self.data_to_handle)
data_flat = self.cube[:, :, self.features.index(key)].flatten()
self.df_features[key] = data_flat
# Combine existing prediction dataset with the additional features
self.df_pred = pd.concat([self.df_pred, self.df_features], axis=1)
# Adapt column order
self.logger.info(
'Prediction dataset contains the following features: '
+ str(list(self.df_pred.columns)))
Label(self.master,
text='Prediction dataset contains the following features: '
+ str(list(self.df_pred.columns))).grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
def import_cube(self):
"""
Import cut and interpolated data cube
which was created in data_preprocessing_gui.py
"""
self.logger.info('Data cube is being imported')
# Path to the stored data cube (see data_preprocessing_gui.py)
folder = self.properties_pred['pred_path'].rsplit('/', 1)[0]
path = folder + '/data_combined_prediction_' + str(self.properties_settings['resolution']) + '.nc'
# Check if path exists and import the cube
# as well as list of datasets it contains
if not os.path.exists(path):
self.logger.error('Error: Dataset not found!')
self.no_dataset_found = True
else:
self.no_dataset_found = False
ds = nc.Dataset(path)
self.cube = ds['Result'][:, :, :].data
self.x = ds['Longitude'][:].data
self.y = ds['Latitude'][:].data
self.pred_features = ds['features'][:].data
self.features = char_to_string(self.pred_features)
self.logger.info('Features included in dataset: '
+ str(self.features))
Label(self.master,
text='Features included in prediction dataset: '
+ str(self.features)).grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
def flatten_cube(self):
"""
Flatten the individual datasets of the data cube
"""
self.logger.info('Data cube is flattened')
# Go through all datasets in the data cube
for i in tqdm(range(np.shape(self.cube)[2])):
data = self.cube[:, :, i]
data_flat = data.flatten() # Flatten the dataset
# Save it to the DataFrame
self.df_pred[self.features[i]] = data_flat
def add_coordinates(self):
"""
Add the coordinates for which the model shall
make a prediction to the DataFrame.
"""
self.logger.info('Coordinates are added')
self.df_pred = pd.DataFrame(columns=['xcoord', 'ycoord']
+ self.features)
self.X, self.Y = np.meshgrid(self.x, self.y)
data_flat = self.X.flatten()
self.df_pred['xcoord'] = data_flat
data_flat = self.Y.flatten()
self.df_pred['ycoord'] = data_flat
def clean_df(self):
"""
Identify rows in the DataFrame that contain no-data values
so they can be masked out after prediction
"""
self.logger.info('Prediction dataset is being cleaned')
self.idx = []
data = self.df_pred.to_numpy()
for i in tqdm(range(len(data))):
if (self.properties_settings['no_value'] in
data[i, :]):
self.idx.append(i)
# Save information on invalid locations so that they
# can be masked out during hazard map generation
self.logger.info(str(len(self.idx))
+ ' rows will be saved to be considered after\
RF prediction due to invalid data')
Label(self.master,
text='Rows with missing features identified').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
def delete_features(self):
"""
Delete feature from prediction dataset
"""
self.logger.info('Feature is being deleted')
to_drop = []
for feat in self.data_to_handle:
for col in self.df_pred.columns.tolist():
if feat in col:
to_drop.append(col)
self.df_pred.drop(columns=to_drop, inplace=True)
self.logger.info('Features now included in prediction dataset: '
+ str(list(self.df_pred.columns)))
def import_prediction_dataset(self):
"""
Import existing prediction dataset
"""
if os.path.exists(self.properties_pred['pred_path']):
self.pred_exist = True
self.logger.info('Import existing prediction dataset')
ds = nc.Dataset(self.properties_pred['pred_path'])
pred = ds['Result'][:, :].data
pred_features = ds['features'][:].data
self.features = char_to_string(pred_features)
self.idx = ds['Dropped'][:].data
self.df_pred = pd.DataFrame(pred, columns=self.features)
self.logger.info(
'Features included in the existing prediction dataset: '
+ str(self.features))
else:
Label(self.master,
text='Error: no existing prediction dataset found!').grid(
row=self.row, column=1)
self.row = self.row + 1
self.master.update()
self.pred_exist = False
def handle_categorical_features(self, var=None):
"""
Call the helper that performs one-hot or ordinal encoding
of the categorical features
"""
basic = ['xcoord', 'ycoord']
self.df_pred = handle_categorical_values(self.df_pred,
self.datasets_summary,
self.properties_pred['ohe'],
basic,
self.properties_settings['no_value'],
var)
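# Drop columns whose name contains the no-data value
# (e.g. encoded columns created for the no-data placeholder category)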
to_drop = []
for col in self.df_pred.columns.tolist():
if str(self.properties_settings['no_value']) in col:
to_drop.append(col)
self.df_pred = self.df_pred.drop(to_drop, axis=1)
def save_as_nc(self):
"""
Save prediction dataset and information on dropped rows as nc-file
"""
df_pred = self.df_pred.to_numpy()
outfile = self.properties_pred['pred_path']
self.logger.info('Prediction dataset is saved to ' + outfile)
# Create the output directory if it does not exist yet
os.makedirs(os.path.dirname(outfile), exist_ok=True)
ds = generate_basic_ncfile(outfile, crs=None)
ds.createDimension('lat', (np.shape(df_pred)[0]))
ds.createDimension('lon', (np.shape(df_pred)[1]))
ds.createDimension('ix', (len(self.idx)))
ds.createDimension('feat', len(self.char_features))
result = ds.createVariable('Result', 'f4', ('lat', 'lon'))
dropped = ds.createVariable('Dropped', 'u8', 'ix')
Features = ds.createVariable('features', 'S1', 'feat')
result[:, :] = df_pred
dropped[:] = np.array(self.idx)
Features[:] = self.char_features
ds.close()
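
# Minimal usage sketch (assumption, not part of the original workflow): launch the
# generator in a standalone Tk window with a basic logger. tmp_pred.pkl and
# tmp_settings.pkl must already have been written by the GUI settings step.
if __name__ == '__main__':
    import tkinter as tk
    logging.basicConfig(level=logging.INFO)
    root = tk.Tk()
    create_prediction_data(root, log=logging.getLogger(__name__))
    root.mainloop()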