Skip to content
Snippets Groups Projects
create_json_from_excel.py 4.38 KiB
Newer Older
"""Create json metadata files from an excel file."""
import os
import sys
from collections import namedtuple
from typing import Dict, List, NamedTuple

import pandas as pd
# pandas is using another dependency called openpyxl
# both need to be installed


COLUMN_FOR_INDEX_IN_EXCEL = 0
KEYWORD_TO_IGNORE_COLUMN = "ignore"
KEYWORD_TO_IGNORE_SHEET = "ignore"
HELP_TXT = """
    1. The source excel file. (e.g. test.xlsx)
    2. The destination folder for the json file. (e.g. test_folder)
    3. Override the folder if existing. (OPTIONAL, False)

    Providing only '-- help' will show this text and exit.
"""


def read_terminal_arguments() -> NamedTuple:
    """Read terminal arguments.

    1. File to read from.
    2. Name of the folder to store the json files.
    3. Override the destination folder if existing, defaults False.
    """
    terminal_args = namedtuple(
        "terminal_args",
        ["source", "destination", "override"],
        defaults=["test.xlsx", "test", True]
    )
    if len(sys.argv) == 1:  # No arguments passed.
        return terminal_args()  # noqa
    if sys.argv[1] in ["help", "--help", "-h", "?", "h"]:
        print(HELP_TXT)
        raise SystemExit
    try:
        arg_1 = sys.argv[1]
    except IndexError:
        raise SystemExit
    try:
        arg_2 = sys.argv[2]
    except IndexError:
        raise SystemExit
    try:
        arg_3 = sys.argv[3]
    except IndexError:
        arg_3 = False

    terminal_args = terminal_args(
        source=arg_1,
        destination=arg_2,
        override=bool(arg_3)
    )
    return terminal_args


def read_excel_file(path: str) -> Dict[str, pd.DataFrame]:
    """Read the sheets of the Excel file into multiple dataframes."""
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"No file found at '{path}'"
        )
    sheets_as_dataframes = {}
    with pd.ExcelFile(path) as xls:
        for sheet_name in xls.sheet_names:
            if sheet_name.startswith(KEYWORD_TO_IGNORE_SHEET):
                continue
            excel_file = pd.read_excel(
                xls,
                sheet_name,
                index_col=COLUMN_FOR_INDEX_IN_EXCEL
            )
            sheets_as_dataframes.update({sheet_name: excel_file})
        return sheets_as_dataframes


def make_json(dataframes: Dict[str, pd.DataFrame]) -> Dict[str, str]:
    """Create the json strings from the excel data."""
    json_files = {}
    for sheet_name, dataframe in dataframes.items():
        dataframe.dropna(inplace=True, axis=1)
        dropped_columns = [column for column in dataframe.columns
                           if column.startswith(KEYWORD_TO_IGNORE_COLUMN)]
        dataframe.drop(dropped_columns, axis=1, inplace=True,
                       errors="ignore")
        print(sheet_name)
Hock, Benedikt's avatar
Hock, Benedikt committed
        json_string = dataframe.to_json(orient="index", indent=4, force_ascii=False)
        json_files.update({sheet_name: json_string})
    return json_files


def save_json_files(folder_name: str, jsons_files: Dict[str, str],
                    override_files: bool) -> None:
    """Save the json strings to a '.json' file format."""
    path_to_json_folder = os.path.join(
        os.path.abspath(__file__),
        os.pardir,
        folder_name
    )
    path_to_json_folder = os.path.abspath(path_to_json_folder)
    path_exists = os.path.exists(path_to_json_folder)
    if path_exists and not override_files:
        raise FileExistsError(
            f"Folder: {path_to_json_folder} exists. Pass override argument "
            f"to override the folder. See --help for more."
        )
    if not path_exists:
        os.mkdir(path_to_json_folder)
    for json_name, json_sting in jsons_files.items():
        file_name = f"{json_name}.json"
        file_path = os.path.join(path_to_json_folder, file_name)
        with open(file_path, "w") as f:
            f.write(json_sting)


def main():
    """The main function."""
    # Get terminal arguments.
    terminal_arguments = read_terminal_arguments()
    # Read the Excel file.
    path_to_file = os.path.abspath(os.path.join(
        os.path.abspath(__file__),
        os.pardir,
        terminal_arguments.source)
    )
    excel_sheets_as_dataframes = read_excel_file(path_to_file)
    excel_sheets_as_json = make_json(excel_sheets_as_dataframes)
    save_json_files(terminal_arguments.destination, excel_sheets_as_json,
                    terminal_arguments.override)


if __name__ == "__main__":
    t = main()