-
Ann-Kathrin Margarete Edrich authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
handle_categorical_values.py 2.70 KiB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
def _is_categorical_flag(flag):
    """Return True when a 'categorical' entry of the summary reads as true."""
    # str() lets real booleans go through the same parsing as text such as
    # 'True', 'true' or 'TRUE (classes)'; only the first space-separated
    # token is compared.
    return str(flag).lower().split(' ')[0] == 'true'


def _select_categorical_features(df, datasets_summary, basic, var):
    """Return the candidate features that datasets_summary flags as categorical."""
    if var is None:
        candidates = [feat for feat in df.columns.tolist() if feat not in basic]
    else:
        candidates = list(var)

    # Build the key -> flag lookup once instead of converting both columns to
    # lists for every feature. setdefault keeps the first entry of a
    # duplicated key, which is the entry list.index() would have found.
    flags = {}
    for key, flag in zip(datasets_summary['keys'].tolist(),
                         datasets_summary['categorical'].tolist()):
        flags.setdefault(key, flag)

    cat = []
    for feat in candidates:
        if feat not in flags:
            raise ValueError(
                f"Feature {feat!r} is not listed in datasets_summary['keys']")
        if _is_categorical_flag(flags[feat]):
            cat.append(feat)
    return cat


def _category_label(value):
    """Format one category value for use inside an encoded column name."""
    # Integer-valued floats (e.g. 3.0) are written as '3'. NaN, infinities
    # and fractional values are left untouched: int() fails on NaN, and
    # truncating could give two different categories the same column name.
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        return str(int(value))
    return str(value)


def handle_categorical_values(df, datasets_summary, ohe, basic, var=None):
    """
    Categorical features in the training dataset are either one hot
    encoded or ordinal encoded

    Input:
        df: DataFrame containing continuous and categorical features, Pandas DataFrame
        datasets_summary: Information on the datasets from which the values in df have been extracted,
                          Pandas DataFrame with the columns 'keys' (feature names) and
                          'categorical' (flag whose first word is 'True' for categorical features)
        ohe: True for One-hot encoding, False for ordinal encoding, Boolean
        basic: columns in df not to be considered such as coordinates, ID and label, list
               (only used when var is None)
        var: specific features to consider only, list

    Output:
        DataFrame in which every categorical feature has been replaced by its
        encoded columns, appended after the remaining columns. One-hot
        columns are named '<feature>_<category>_encode', ordinal columns
        '<feature>_encoded'. If no feature is categorical, df is returned
        as it was passed in. The input DataFrame is not modified.

    Raises:
        ValueError: if a considered feature is missing from datasets_summary['keys']
    """
    cat = _select_categorical_features(df, datasets_summary, basic, var)
    if not cat:
        return df

    if ohe:
        # Densify the result explicitly instead of passing a keyword: the
        # dense-output argument is called `sparse` in older scikit-learn
        # releases and `sparse_output` in newer ones.
        encoder = OneHotEncoder()
        encoded_data = encoder.fit_transform(df[cat])
        if hasattr(encoded_data, 'toarray'):
            encoded_data = encoded_data.toarray()
        # The output columns follow encoder.categories_ (sorted per feature),
        # not the order of first appearance in df, so the names have to be
        # derived from the fitted encoder to label the right columns.
        column_names = [
            f'{col}_{_category_label(value)}_encode'
            for col, categories in zip(cat, encoder.categories_)
            for value in categories
        ]
    else:
        # Encode string representations, built on a separate frame so the
        # caller's DataFrame keeps its original values.
        as_text = df[cat].apply(lambda column: column.apply(str))
        encoder = OrdinalEncoder()
        encoded_data = encoder.fit_transform(as_text)
        column_names = [f"{col}_encoded" for col in cat]

    # Reuse df's index so the column-wise concat lines rows up one-to-one
    # even when df does not carry a default RangeIndex.
    encoded_df = pd.DataFrame(encoded_data, columns=column_names, index=df.index)
    return pd.concat([df.drop(columns=cat), encoded_df], axis=1)