From 510b6234e074d9a63bba44de205042b8586a721f Mon Sep 17 00:00:00 2001 From: Ann-Kathrin Edrich <edrich@mbd.rwth-aachen.de> Date: Wed, 19 Feb 2025 14:43:54 +0100 Subject: [PATCH] Fix handling of categorical values to keep original ordinal numbers --- src/gui_version/create_prediction_data_gui.py | 1 + src/gui_version/create_training_data_gui.py | 1 + .../utilities/handle_categorical_values.py | 30 ++++++++++++++----- src/plain_scripts/create_prediction_data.py | 3 +- src/plain_scripts/create_training_data.py | 1 + .../utilities/handle_categorical_values.py | 29 +++++++++++++----- 6 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/gui_version/create_prediction_data_gui.py b/src/gui_version/create_prediction_data_gui.py index 499fa91..2b220e0 100644 --- a/src/gui_version/create_prediction_data_gui.py +++ b/src/gui_version/create_prediction_data_gui.py @@ -543,6 +543,7 @@ class create_prediction_data: self.datasets_summary, self.properties_pred['ohe'], basic, + self.properties_settings['no_value'], var) to_drop = [] diff --git a/src/gui_version/create_training_data_gui.py b/src/gui_version/create_training_data_gui.py index 2c36e2d..feccdf3 100644 --- a/src/gui_version/create_training_data_gui.py +++ b/src/gui_version/create_training_data_gui.py @@ -1137,6 +1137,7 @@ class create_training_data: self.datasets_summary, self.properties_train['ohe'], basic, + self.properties_settings['no_value'], var) def save_training_data(self): diff --git a/src/gui_version/utilities/handle_categorical_values.py b/src/gui_version/utilities/handle_categorical_values.py index 9ceacf7..166c077 100644 --- a/src/gui_version/utilities/handle_categorical_values.py +++ b/src/gui_version/utilities/handle_categorical_values.py @@ -6,7 +6,7 @@ import numpy as np from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): +def handle_categorical_values(df, datasets_summary, ohe, basic, no_data_value, var=None): """ Categorical features in the training dataset are either one hot @@ -46,6 +46,7 @@ def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): if ohe: encoder = OneHotEncoder(sparse=False) encoded_data = encoder.fit_transform(df[cat]) + unique_categories = {col: df[col].unique() for col in cat} custom_column_names = [] for col in cat: @@ -57,14 +58,27 @@ def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): df = pd.concat([df.drop(columns=cat), encoded_df], axis=1) else: - for feat in cat: - df[feat] = df[feat].apply(str) + cat = [feat for feat in cat if df[feat][df[feat] != no_data_value].apply(lambda x: isinstance(x, str)).all()] - columns_to_encode = cat.copy()#df.select_dtypes(include=['object', 'category']).columns.tolist() - encoder = OrdinalEncoder() - encoded_data = encoder.fit_transform(df[columns_to_encode]) - encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encoded" for col in columns_to_encode]) - df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) + if len(cat) > 0: + columns_to_encode = cat.copy() + encoder = OrdinalEncoder() + + # Mask: Identify rows where values are NOT no_data_value + mask = df[columns_to_encode] != no_data_value + + # Apply encoding only to valid values + encoded_data = df[columns_to_encode].copy() + encoded_data[mask] = encoder.fit_transform(df[columns_to_encode][mask]) + + # Explicitly set no_data_value where it was before + encoded_data[~mask] = no_data_value # Keep original no_data_value + + # Convert back to DataFrame + encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encode" for col in columns_to_encode]) + + # Merge encoded columns back into df + df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) return df diff --git a/src/plain_scripts/create_prediction_data.py b/src/plain_scripts/create_prediction_data.py index 820adae..655e531 100644 --- a/src/plain_scripts/create_prediction_data.py +++ b/src/plain_scripts/create_prediction_data.py @@ -131,6 +131,7 @@ class create_prediction_data: self.data_properties, settings.ohe, basic, + settings.no_value, var) to_drop = [] @@ -342,7 +343,7 @@ class create_prediction_data: ds.createDimension('ix', (len(self.idx))) ds.createDimension('feat', len(self.char_features)) result = ds.createVariable('Result', 'f4', ('lat', 'lon')) - dropped = ds.createVariable('Dropped', 'f4', 'ix') + dropped = ds.createVariable('Dropped', 'i4', 'ix') Features = ds.createVariable('features', 'S1', 'feat') result[:, :] = df_pred dropped[:] = self.idx diff --git a/src/plain_scripts/create_training_data.py b/src/plain_scripts/create_training_data.py index 03f251c..b2d7b73 100644 --- a/src/plain_scripts/create_training_data.py +++ b/src/plain_scripts/create_training_data.py @@ -112,6 +112,7 @@ class create_training_data: self.data_properties, settings.ohe, basic, + settings.no_value, var) def delete_feature(self): diff --git a/src/plain_scripts/utilities/handle_categorical_values.py b/src/plain_scripts/utilities/handle_categorical_values.py index 65cc349..166c077 100644 --- a/src/plain_scripts/utilities/handle_categorical_values.py +++ b/src/plain_scripts/utilities/handle_categorical_values.py @@ -6,7 +6,7 @@ import numpy as np from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): +def handle_categorical_values(df, datasets_summary, ohe, basic, no_data_value, var=None): """ Categorical features in the training dataset are either one hot @@ -58,14 +58,27 @@ def handle_categorical_values(df, datasets_summary, ohe, basic, var=None): df = pd.concat([df.drop(columns=cat), encoded_df], axis=1) else: - for feat in cat: - df[feat] = df[feat].apply(str) + cat = [feat for feat in cat if df[feat][df[feat] != no_data_value].apply(lambda x: isinstance(x, str)).all()] - columns_to_encode = cat.copy()#df.select_dtypes(include=['object', 'category']).columns.tolist() - encoder = OrdinalEncoder() - encoded_data = encoder.fit_transform(df[columns_to_encode]) - encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encoded" for col in columns_to_encode]) - df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) + if len(cat) > 0: + columns_to_encode = cat.copy() + encoder = OrdinalEncoder() + + # Mask: Identify rows where values are NOT no_data_value + mask = df[columns_to_encode] != no_data_value + + # Apply encoding only to valid values + encoded_data = df[columns_to_encode].copy() + encoded_data[mask] = encoder.fit_transform(df[columns_to_encode][mask]) + + # Explicitly set no_data_value where it was before + encoded_data[~mask] = no_data_value # Keep original no_data_value + + # Convert back to DataFrame + encoded_df = pd.DataFrame(encoded_data, columns=[f"{col}_encode" for col in columns_to_encode]) + + # Merge encoded columns back into df + df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1) return df -- GitLab