# coding=utf-8

##
# RapidMiner Python Scripting Extension
# Copyright (C) 2015-2025 RapidMiner GmbH
##
"""
Main Script for Executing User-Defined Python Scripts in RapidMiner

This script acts as the execution engine for user-defined Python scripts within the RapidMiner environment. It is initiated by the Python Scripting Extension's Java code and performs the following key tasks:

- **Input Deserialization**: Reads and deserializes input files matching `rapidminer_input%03d.*` into Python objects such as pandas DataFrames, file objects, or dictionaries.
- **User Script Execution**: Dynamically imports the user's `userscript.py` from the specified working directory and invokes the required `rm_main` function with the deserialized inputs.
- **Output Serialization**: Serializes the results returned by `rm_main` into output files named `rapidminer_output%03d.*`, supporting formats like Apache Arrow for DataFrames and pickle for generic objects.
- **Error Handling**: Captures and sanitizes exceptions during the process, adjusting stack traces to reference the user's script and logging errors for debugging.
- **Environment Setup**: Validates the Python environment, ensuring required packages are installed and configuring paths for module imports.

**Usage Notes**:

- The script is not intended to be run directly by users but is called by RapidMiner as part of the Python scripting extension.
- Users should define their custom logic in `userscript.py`, ensuring it contains the `rm_main` function.

**Author**: Tamás Járvás

"""

# Constants
# Minimum library versions; they are passed to check_pandas_version() and
# check_pyarrow_version() when this file is run as a script.
__required_pandas_version__ = "1000.0"  # To be overwritten in Java code
# NOTE(review): presumably also overwritten by the Java code - confirm.
__required_pyarrow_version__ = "1000.0"

# Standard Library Imports
import sys
import os
import importlib.util
import traceback
from glob import glob
import json
import logging
from typing import List, Tuple, Any, Optional, Dict

##############################################################################
# Constants and Configurations
##############################################################################

# Constants
# File extensions used to pick a deserializer for each 'rapidminer_input*.*' file.
ARROW_EXT = '.arrow'
BIN_EXT = '.bin'
FOI_EXT = '.foi'
CONNECTION_EXT = '.connection'
JSON_MACROS_EXT = '.json-macros'
# Base name (without extension) of every output file; formatted with the output index.
OUTPUT_FILENAME_PATTERN = "rapidminer_output{:03d}"
# Upper bound on the number of rows per Arrow record batch when writing a DataFrame.
BATCH_DEFAULT_ROWS = 1 << 8  # 256
# Cell budget per record batch: rows per batch are also capped at BATCH_MAX_CELLS // column count.
BATCH_MAX_CELLS = 1 << 14  # 16384
COLUMN_RENAME_PREFIX = "att"  # Prefix used when renaming invalid/missing column names

# Exit codes
# Passed to sys.exit() so the calling process can tell the failure categories apart.
EXIT_UNKNOWN_ERROR = 1
EXIT_PANDAS_NOT_FOUND = 50
EXIT_INVALID_PANDAS_VERSION = 51
EXIT_FAILED_TO_PARSE_USERSCRIPT = 55
EXIT_USER_SCRIPT_EXECUTION_ERROR = 60
EXIT_RM_MAIN_NOT_FOUND = 61
EXIT_DESERIALIZATION_ERROR = 65
EXIT_SERIALIZATION_ERROR = 66
EXIT_CUSTOM_TYPE_CONVERSION_ERROR = 68
EXIT_FILE_NOT_FOUND = 70
EXIT_ARROW_MODULE_NOT_FOUND = 72
EXIT_ARROW_MODULE_INVALID_VERSION = 73


# Third-Party Imports
try:
    import cPickle as pickle  # For Python 2 compatibility
except ImportError:
    import pickle
try:
    import pandas as pd
except ImportError:
    print("Error: pandas module not found.")
    sys.exit(EXIT_PANDAS_NOT_FOUND)
try:
    import pyarrow as pa
    import pyarrow.ipc as ipc
except ImportError:
    logging.error("PyArrow module not found.")
    sys.exit(EXIT_ARROW_MODULE_NOT_FOUND)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Local Imports
import serdeutils as utils

##############################################################################
# Utility Functions
##############################################################################


def check_pandas_version(actual_version: str, required_version: str) -> None:
    """
    Validates that the installed pandas version meets the required version.

    Only the major and minor components are compared. A non-numeric suffix on a
    component (for example the ``rc1`` in ``"2.1rc1"``) is ignored, so
    pre-release builds are compared by their numeric part instead of being
    rejected as malformed.

    Args:
        actual_version (str): The currently installed pandas version.
        required_version (str): The minimum required pandas version.

    Raises:
        SystemExit: With EXIT_INVALID_PANDAS_VERSION if either version string
            cannot be parsed or the installed version is below the required one.
    """
    def leading_int(component: str) -> int:
        # Keep only the leading digits; int('') raises ValueError when there are none.
        digits = ''
        for char in component.strip():
            if not char.isdigit():
                break
            digits += char
        return int(digits)

    def major_minor(version: str) -> Tuple[int, int]:
        # Unpacking raises ValueError when fewer than two components are present.
        major, minor = version.split('.')[:2]
        return leading_int(major), leading_int(minor)

    try:
        actual = major_minor(actual_version)
        required = major_minor(required_version)
    except ValueError:
        error_msg = f"Invalid version format. Actual: {actual_version}, Required: {required_version}"
        # This path terminates the process, so it is logged as an error.
        logging.error(f"Invalid pandas version: {error_msg}")
        utils.write_to_error_log(f"{actual_version}\n{required_version}")
        sys.exit(EXIT_INVALID_PANDAS_VERSION)

    if actual < required:
        error_message = (
            f"pandas version {actual_version} not supported. "
            f"At least version {required_version} is required."
        )
        logging.error(error_message)
        utils.write_to_error_log(f"{actual_version}\n{required_version}")
        sys.exit(EXIT_INVALID_PANDAS_VERSION)

def check_pyarrow_version(required_version_str):
    """
    Verifies that the imported PyArrow is at least the required major version.

    Only the first dot-separated component of each version string is compared.

    Args:
        required_version_str (str): The minimum required version of PyArrow (e.g., "18.0.0").

    Raises:
        SystemExit: Exits with EXIT_ARROW_MODULE_INVALID_VERSION if a version string has a
            non-numeric major component or the installed major version is too low.
    """
    installed_version = pa.__version__

    try:
        installed_major = int(installed_version.split('.')[0])
        required_major = int(required_version_str.split('.')[0])
    except ValueError:
        logging.error(f"Invalid version format. Actual: {installed_version}, Required: {required_version_str}")
        utils.write_to_error_log(installed_version+"\n"+required_version_str)
        sys.exit(EXIT_ARROW_MODULE_INVALID_VERSION)

    # Nothing to report when the installed major version is recent enough.
    if installed_major >= required_major:
        return

    too_old_message = (
        f"PyArrow version {installed_version} is too old. "
        f"Minimum required: {required_major}.0.0 or higher"
    )
    logging.error(too_old_message)
    utils.write_to_error_log(installed_version+"\n"+f"Minimum: {required_major}.0.0")
    sys.exit(EXIT_ARROW_MODULE_INVALID_VERSION)

def handle_exception(exception: Exception) -> None:
    """
    Prints a sanitized traceback for the exception and records a one-line summary.

    References to "userscript.py" in the message are replaced by "script". Line
    numbers belonging to the user's script are reduced by one (presumably because
    the extension prepends one line to the user's code - confirm against the Java
    side). The summary line is printed and written to the error log file.

    Args:
        exception (Exception): The exception to handle.
    """
    sanitized_message = str(exception).replace("userscript.py", "script")
    line_info = ''

    if isinstance(exception, SyntaxError):
        # Handle SyntaxError separately
        print(''.join(traceback.format_exception_only(type(exception), exception)))
        # lineno is None when the SyntaxError carries no location details;
        # subtracting from None would raise a TypeError inside the error handler.
        if exception.lineno is not None:
            sanitized_message = sanitized_message.replace(
                f"line {exception.lineno}", f"line {exception.lineno - 1}"
            )
    else:
        # Extract and sanitize traceback information
        tb = traceback.extract_tb(exception.__traceback__)
        sanitized_traceback = []
        script_filename = ''
        found_rm_main = False
        last_script_line = -1

        for filename, line_number, function, text in tb:
            if function == 'rm_main':
                # The file that defines rm_main is treated as the user's script.
                script_filename = filename
                found_rm_main = True
            if found_rm_main and filename == script_filename:
                filename = 'script'
                line_number -= 1
                last_script_line = line_number
            sanitized_traceback.append((filename, line_number, function, text))

        # Print sanitized traceback
        print("Traceback (most recent call last):")
        for filename, line_number, function, text in sanitized_traceback:
            print(f'  File "{filename}", line {line_number}, in {function}')
            # The source text is unavailable for some frames; skip it rather
            # than printing "None" or an empty line.
            if text:
                print(f'    {text}')

        if last_script_line >= 0:
            line_info = f" (script, line {last_script_line})"

    full_message = f"{type(exception).__name__}: {sanitized_message}{line_info}"
    print(full_message)
    utils.write_to_error_log(full_message)


def import_userscript(user_folder: str) -> None:
    """
    Imports the userscript module from the specified user folder.

    The folder is appended to ``sys.path`` and is the working directory while the
    script's top-level code runs. The loaded module is registered in
    ``sys.modules`` and stored in this module's globals as ``userscript``.
    Registration matters because pickle stores classes by reference: without it,
    pickling an instance of a class defined in the user script would re-import
    the script and fail the identity check.

    Args:
        user_folder (str): The directory where the userscript is located.

    Raises:
        SystemExit: With EXIT_FAILED_TO_PARSE_USERSCRIPT if the module cannot be
            found or fails to import.
    """
    original_cwd = os.getcwd()

    # Append user_folder to sys.path if not already present
    if user_folder not in sys.path:
        sys.path.append(user_folder)

    try:
        os.chdir(user_folder)

        # Dynamically import the userscript module
        module_name = 'userscript'
        spec = importlib.util.find_spec(module_name)
        if spec is None or spec.loader is None:
            logging.error(f"Module '{module_name}' not found in '{user_folder}'.")
            sys.exit(EXIT_FAILED_TO_PARSE_USERSCRIPT)

        userscript = importlib.util.module_from_spec(spec)
        # Register before executing, as the importlib recipe recommends, so the
        # module can be resolved by name while and after its code runs.
        sys.modules[module_name] = userscript
        try:
            spec.loader.exec_module(userscript)
        except BaseException:
            # Do not leave a half-initialised module behind.
            sys.modules.pop(module_name, None)
            raise
        globals()['userscript'] = userscript  # Make the module available globally

    except Exception as e:
        logging.error("Failed to import the userscript module.", exc_info=True)
        handle_exception(e)
        sys.exit(EXIT_FAILED_TO_PARSE_USERSCRIPT)
    finally:
        # Runs for SystemExit as well, so the working directory is always restored.
        os.chdir(original_cwd)

##############################################################################
# Deserialization Functions
##############################################################################


def deserialize() -> Tuple[List[Any], Optional[Any]]:
    """
    Deserializes input files matching the pattern 'rapidminer_input*.*' into Python objects.

    Files are looked up in the current working directory and processed in sorted
    order. Each file is handled according to its extension:
        - '.arrow': Reads '.arrow' format tabular data files into pandas DataFrames.
        - '.bin': Loads pickled Python objects.
        - '.foi': Reads file paths and opens the corresponding files.
        - '.connection': Reads connection data using a serialization key.
        - '.json-macros': Loads macros from a JSON file.

    Files with any other extension are logged and skipped.

    Returns:
        Tuple[List[Any], Optional[Any]]: A tuple containing a list of input objects and macros
        (None when no macros file is present).

    Raises:
        SystemExit: Exits with EXIT_DESERIALIZATION_ERROR if a file cannot be deserialized
            or more than one macros file is found.
    """
    files = sorted(glob('rapidminer_input*.*'))
    inputs: List[Any] = []
    macros: Optional[Any] = None

    # Built at call time because the handlers are defined further down the module.
    deserializers = {
        ARROW_EXT: _deserialize_arrow,
        BIN_EXT: _deserialize_bin,
        FOI_EXT: _deserialize_foi,
        CONNECTION_EXT: _deserialize_connection_file,
        JSON_MACROS_EXT: _deserialize_json_macros,
    }

    for file_path in files:
        _, extension = os.path.splitext(file_path)
        deserializer_function = deserializers.get(extension)
        if deserializer_function:
            try:
                deserialized_input = deserializer_function(file_path)
                if extension == JSON_MACROS_EXT:
                    # Macros are passed separately from the positional inputs; only one set is allowed.
                    if macros is not None:
                        logging.error("Multiple macros provided.")
                        utils.write_to_error_log("Multiple macros provided.")
                        sys.exit(EXIT_DESERIALIZATION_ERROR)
                    macros = deserialized_input
                else:
                    inputs.append(deserialized_input)
            except Exception as e:
                logging.error(f"Error while deserializing file '{file_path}': {e}")
                handle_exception(e)
                sys.exit(EXIT_DESERIALIZATION_ERROR)
        else:
            logging.error(f"Unknown file extension: {extension}")

    return inputs, macros


def _deserialize_arrow(file_path: str) -> pd.DataFrame:
    """
    Reads an Arrow IPC file and returns its content as a pandas DataFrame.

    The Arrow primitive types listed below are mapped to pandas nullable extension
    dtypes; every other Arrow type uses PyArrow's default conversion. Field
    metadata found in the Arrow schema is attached to the DataFrame afterwards.

    Args:
        file_path (str): The path to the '.arrow' input file.

    Returns:
        pd.DataFrame: The deserialized pandas DataFrame.

    Raises:
        SystemExit: Exits with EXIT_DESERIALIZATION_ERROR if reading or converting fails.
    """
    try:
        with ipc.open_file(file_path) as arrow_reader:
            logging.info(f"Opened the Arrow file: {file_path}")
            arrow_table = arrow_reader.read_all()

            # (Arrow type factory, pandas nullable dtype class) pairs.
            type_pairs = (
                (pa.int8, pd.Int8Dtype),
                (pa.int16, pd.Int16Dtype),
                (pa.int32, pd.Int32Dtype),
                (pa.int64, pd.Int64Dtype),
                (pa.uint8, pd.UInt8Dtype),
                (pa.uint16, pd.UInt16Dtype),
                (pa.uint32, pd.UInt32Dtype),
                (pa.uint64, pd.UInt64Dtype),
                (pa.float32, pd.Float32Dtype),
                (pa.float64, pd.Float64Dtype),
                (pa.bool_, pd.BooleanDtype),
                (pa.string, pd.StringDtype),
            )
            nullable_dtypes = {
                make_arrow_type(): make_pandas_dtype()
                for make_arrow_type, make_pandas_dtype in type_pairs
            }

            # The mapper returns None for unmapped types, which keeps the default conversion.
            frame = arrow_table.to_pandas(types_mapper=nullable_dtypes.get)
            logging.info("PyArrow Table successfully converted to a pandas DataFrame.")

            # Post-processing: carry the per-field metadata over to the DataFrame.
            attach_metadata(arrow_table, frame)
            return frame
    except Exception as error:
        logging.error(f"Failed to read Arrow file '{file_path}': {error}")
        handle_exception(error)
        sys.exit(EXIT_DESERIALIZATION_ERROR)


def attach_metadata(table: pa.Table, df: pd.DataFrame) -> None:
    """
    Copies per-field metadata from a PyArrow Table onto a pandas DataFrame.

    The metadata is stored in ``df.rm_metadata`` (created as an empty dict when the
    attribute is missing), keyed by column name. Byte keys and values are decoded
    as UTF-8. Fields without metadata get no entry.

    Args:
        table (pa.Table): The PyArrow Table containing the data and metadata.
        df (pd.DataFrame): The pandas DataFrame to which metadata will be attached.
    """
    if not hasattr(df, 'rm_metadata'):
        df.rm_metadata = {}
    metadata_store = df.rm_metadata

    def as_text(item):
        # Arrow hands metadata back as bytes; anything else is passed through untouched.
        return item.decode('utf-8') if isinstance(item, bytes) else item

    for field in table.schema:
        field_metadata = field.metadata
        if not field_metadata:
            continue
        metadata_store[field.name] = {
            as_text(key): as_text(value) for key, value in field_metadata.items()
        }


def _deserialize_bin(file_path: str) -> Any:
    """
    Handles '.bin' files by loading pickled Python objects.

    Args:
        file_path (str): The path to the '.bin' file.

    Returns:
        Any: The deserialized Python object.

    Raises:
        SystemExit: If unpickling fails.
    """
    try:
        with open(file_path, 'rb') as load_file:
            return pickle.load(load_file)
    except Exception as e:
        logging.error(f"Failed to load pickled file '{file_path}': {e}")
        handle_exception(e)
        sys.exit(EXIT_DESERIALIZATION_ERROR)


def _deserialize_foi(file_path: str) -> Any:
    """
    Opens the file whose path is stored inside a '.foi' file.

    According to the original documentation, when the incoming input is a buffered
    file object the Java application writes its content to a temporary file and
    creates a '.foi' file containing the path of that temporary file.

    Args:
        file_path (str): The path to the '.foi' file.

    Returns:
        Any: The referenced file, opened in read mode. It is returned open.

    Raises:
        SystemExit: Exits with EXIT_FILE_NOT_FOUND if the referenced file does not exist,
            or with EXIT_DESERIALIZATION_ERROR if reading fails.
    """
    try:
        with utils.open_file(file_path, 'r') as pointer_file:
            referenced_path = utils.read_file(pointer_file).strip()

        if not os.path.exists(referenced_path):
            logging.error(f"File not found: '{referenced_path}' referenced in '{file_path}'.")
            utils.write_to_error_log("The file {} does not exist.".format(referenced_path))
            # SystemExit is not an Exception subclass, so the handler below does not catch it.
            sys.exit(EXIT_FILE_NOT_FOUND)

        return utils.open_file(referenced_path, 'r')
    except Exception as error:
        logging.error(f"Failed to process FOI file '{file_path}': {error}")
        handle_exception(error)
        sys.exit(EXIT_DESERIALIZATION_ERROR)


def _deserialize_connection_file(file_path: str) -> Any:
    """
    Reads a '.connection' file using the key from the 'SERDE_KEY' environment variable.

    Args:
        file_path (str): The path to the '.connection' file.

    Returns:
        Any: The connection object returned by ``utils.read_connection``.

    Raises:
        SystemExit: Exits with EXIT_DESERIALIZATION_ERROR if 'SERDE_KEY' is unset or empty,
            or if reading the file fails.
    """
    serde_key = os.environ.get('SERDE_KEY')
    if not serde_key:
        missing_key_message = "Environment variable 'SERDE_KEY' is not set."
        logging.error(missing_key_message)
        utils.write_to_error_log(missing_key_message)
        sys.exit(EXIT_DESERIALIZATION_ERROR)

    try:
        with open(file_path, 'r') as source:
            connection = utils.read_connection(source, serde_key)
    except Exception as error:
        logging.error(f"Failed to read connection file '{file_path}': {error}")
        handle_exception(error)
        sys.exit(EXIT_DESERIALIZATION_ERROR)
    return connection


def _deserialize_json_macros(file_path: str) -> Any:
    """
    Loads the macros stored as JSON in a '.json-macros' file.

    Args:
        file_path (str): The path to the '.json-macros' file.

    Returns:
        Any: The parsed JSON content.

    Raises:
        SystemExit: Exits with EXIT_DESERIALIZATION_ERROR if the file cannot be read or parsed.
    """
    try:
        with utils.open_file(file_path, 'r') as source:
            raw_json = utils.read_file(source)
        parsed_macros = json.loads(raw_json)
    except Exception as error:
        logging.error(f"Failed to read macros file '{file_path}': {error}")
        handle_exception(error)
        sys.exit(EXIT_DESERIALIZATION_ERROR)
    return parsed_macros

##############################################################################
# Serialization Functions
##############################################################################


def serialize_results(results: Any) -> None:
    """
    Writes each result to an output file, choosing the format by the result's type.

    DataFrames are written as Arrow files, file objects as '.foi' files and
    everything else is pickled. At most ``rapidminer_numberOfOutputs`` results are
    written; any further ones are ignored.

    Args:
        results (Any): The result(s) to serialize. Can be a single object or a tuple of objects.
    """
    # A single return value is handled like a one-element tuple.
    outputs = (results,) if not isinstance(results, tuple) else results

    for position, item in enumerate(outputs):
        if position >= rapidminer_numberOfOutputs:
            logging.info(f"Reached maximum number of outputs ({rapidminer_numberOfOutputs}). Stopping serialization.")
            break

        logging.info(f"Serializing entry {position}: {type(item).__name__}")

        if isinstance(item, pd.DataFrame):
            writer = serialize_dataframe_arrow
        elif utils.is_file_object(item):
            writer = serialize_file_object
        else:
            writer = serialize_generic_object
        writer(item, position)


def serialize_dataframe_arrow(df: pd.DataFrame, index: int) -> None:
    """
    Serializes a pandas DataFrame to an Apache Arrow file, written in record batches.

    The DataFrame is modified in place before writing: invalid column names are
    renamed and column dtypes are converted according to the attached metadata.
    Data is written with the Arrow IPC stream writer into
    ``<temporary_directory>/rapidminer_outputNNN.arrow``.

    Args:
        df (pd.DataFrame): The DataFrame to serialize.
        index (int): The index of the current output for filename generation.

    Raises:
        SystemExit: Exits with EXIT_SERIALIZATION_ERROR if converting or writing the table fails.
    """
    arrow_filename = f"{OUTPUT_FILENAME_PATTERN.format(index)}.arrow"
    arrow_path = os.path.join(temporary_directory, arrow_filename)
    logging.info(f"Serializing DataFrame to Arrow format at {arrow_path}")
    original_names = rename_columns(df)
    schema = set_schema_from_metadata(df, original_names)

    # Rows per batch: at most BATCH_DEFAULT_ROWS, further limited so one batch holds
    # no more than BATCH_MAX_CELLS cells, and never below one row. A DataFrame
    # without columns uses a batch size of 1.
    column_count = len(df.columns)
    rows_within_cell_budget = BATCH_MAX_CELLS // column_count if column_count > 0 else 1
    batch_size = max(1, min(BATCH_DEFAULT_ROWS, rows_within_cell_budget))
    # This value is a row limit per batch, not a number of chunks.
    logging.info(f"Maximum rows per record batch for writing: {batch_size}")

    try:
        table = pa.Table.from_pandas(df, schema=schema, preserve_index=False, nthreads=4)
        with pa.OSFile(arrow_path, 'wb') as arrow_file:
            with ipc.RecordBatchStreamWriter(arrow_file, schema) as writer:
                for batch in table.to_batches(max_chunksize=batch_size):
                    writer.write_batch(batch)
        # Report success only after both the writer and the file have been closed.
        logging.info(f"Successfully serialized DataFrame to Arrow file: {arrow_path}")

    except Exception as e:
        handle_exception(e)
        sys.exit(EXIT_SERIALIZATION_ERROR)


def set_schema_from_metadata(df: pd.DataFrame, original_names: Dict[str, str]) -> pa.Schema:
    """
    Converts the DataFrame's columns according to its metadata and builds the matching PyArrow schema.

    ``df.rm_metadata`` is read if present. A dict is treated as per-column metadata
    keyed by the original column name; any other value is used as the metadata for
    every column. Columns with an 'rm_type' are converted to that type, and columns
    that are still of dtype 'object' afterwards are converted to an inferred dtype.
    The DataFrame is modified in place.

    Args:
        df (pd.DataFrame): The pandas DataFrame to derive the schema from and to apply type conversions.
        original_names (Dict[str, str]): A mapping from potentially renamed column names to their original names.

    Returns:
        pa.Schema: The PyArrow schema with field metadata attached, ready for serialization.
    """
    attached = getattr(df, 'rm_metadata', {})
    if isinstance(attached, dict):
        per_column_meta, fallback_meta = attached, None
    else:
        # A non-dict value acts as the default metadata for all columns.
        per_column_meta, fallback_meta = {}, attached

    for column in df.columns:
        lookup_name = original_names.get(column, column)
        column_meta = normalize_metadata(per_column_meta.get(lookup_name, fallback_meta))
        requested_type = column_meta.get('rm_type')
        if requested_type:
            convert_column_dtype(df, column, requested_type)

        if df[column].dtype == 'object':
            convert_object_column(df, column)

    base_schema = pa.Schema.from_pandas(df, preserve_index=False)
    return update_schema_metadata(base_schema, df, original_names, per_column_meta, fallback_meta)


def normalize_metadata(meta: Any) -> Dict[str, Any]:
    """
    Normalizes metadata into a standardized dictionary format.

    Accepted inputs:
        - dict: returned unchanged (the same object).
        - tuple: ``(rm_type, rm_role)``; entries that are missing or None are omitted.
        - str: used as 'rm_type'; an empty string yields an empty dict.
        - None: yields an empty dict.

    Any other type yields an empty dict and logs a warning naming the offending type.

    Args:
        meta (Any): The metadata to normalize. Can be a string, tuple, dict, or None.

    Returns:
        Dict[str, Any]: A dictionary containing the normalized metadata. If the input is None or invalid,
        an empty dictionary is returned.
    """
    if meta is None:
        return {}

    if isinstance(meta, dict):
        return meta

    if isinstance(meta, tuple):
        normalized: Dict[str, Any] = {}
        if len(meta) >= 1 and meta[0] is not None:
            normalized['rm_type'] = meta[0]
        if len(meta) >= 2 and meta[1] is not None:
            normalized['rm_role'] = meta[1]
        return normalized

    if isinstance(meta, str):
        # An empty string carries no type information and is a supported type,
        # so it yields no metadata without a warning.
        return {'rm_type': meta} if meta else {}

    # The warning is built from the original value, so it names the real type.
    logging.warning(
        f'Warning: Metadata must be a dict, tuple, or str. '
        f'Current type is {type(meta)}.'
    )
    return {}


def convert_column_dtype(df: pd.DataFrame, col: str, rm_type: str) -> None:
    """
    Casts one DataFrame column, in place, to the dtype that corresponds to a RapidMiner type.

    The type name is matched case-insensitively against the table below. Nothing
    happens if the column already has the target dtype. 'date-time' columns are
    converted with ``pd.to_datetime``; all others with ``Series.astype``.

    Args:
        df (pd.DataFrame): The DataFrame containing the column to convert.
        col (str): The name of the column in the DataFrame to convert.
        rm_type (str): The RapidMiner type to which the column should be converted.
            Supported: 'real', 'integer', 'date-time', 'time', 'nominal', 'text',
            'text-set', 'text-list', 'real-list'.

    Returns:
        None

    Raises:
        SystemExit: Exits with EXIT_CUSTOM_TYPE_CONVERSION_ERROR if the conversion fails
            or 'rm_type' is not recognized.
    """
    conversions = {
        'real': pd.Float64Dtype(),
        'integer': pd.Int64Dtype(),
        'date-time': 'datetime64[ns]',
        'time': 'timedelta64[ns]',
        'nominal': 'category',
        'text': pd.StringDtype(),
        'text-set': 'object',
        'text-list': 'object',
        'real-list': 'object',
    }
    wanted_dtype = conversions.get(rm_type.lower())

    if not wanted_dtype:
        logging.warning(f"Unrecognized 'rm_type' '{rm_type}' for column '{col}'. No conversion can be applied.")
        utils.write_to_error_log(f"Unrecognized 'rm_type' '{rm_type}' for column '{col}'. No conversion can be applied. Valid types: {', '.join(conversions)}")
        sys.exit(EXIT_CUSTOM_TYPE_CONVERSION_ERROR)

    # Already the right dtype: leave the column untouched.
    if pd.api.types.is_dtype_equal(df[col].dtype, wanted_dtype):
        return

    try:
        if wanted_dtype == 'datetime64[ns]':
            converted = pd.to_datetime(df[col], errors='raise')
        else:
            converted = df[col].astype(wanted_dtype)
        df[col] = converted
    except Exception as error:
        logging.warning(f"Failed to convert column '{col}' to dtype '{wanted_dtype}': {error}")
        utils.write_to_error_log(f"Failed to convert column '{col}' to dtype '{wanted_dtype}': {error}")
        sys.exit(EXIT_CUSTOM_TYPE_CONVERSION_ERROR)


def convert_object_column(df: pd.DataFrame, col: str) -> None:
    """
    Replaces a column, in place, by a version with a more specific dtype inferred from its values.

    The kind of data is inferred with ``pd.api.types.infer_dtype`` (missing values ignored):
        - mixed kinds, strings: converted to the pandas "string" dtype.
        - bytes: each bytes value is decoded as UTF-8 (invalid sequences replaced), then "string".
        - integer / floating / boolean: converted to "Int64" / "Float64" / "boolean".
        - datetime, date, timedelta, complex: converted on a best-effort basis; a failure
          is logged as a warning and the column is left as it is.

    Any other inferred kind leaves the column unchanged.

    Args:
        df (pd.DataFrame): The DataFrame containing the column to convert.
        col (str): The name of the column in the DataFrame to convert.

    Returns:
        None
    """
    kind = pd.api.types.infer_dtype(df[col], skipna=True)

    if kind == 'bytes':
        df[col] = df[col].apply(lambda x: x.decode('utf-8', errors='replace') if isinstance(x, bytes) else x)
        df[col] = df[col].astype("string")
        return

    # Conversions whose failure is allowed to propagate to the caller.
    strict_targets = {
        'mixed': "string",
        'mixed-integer': "string",
        'mixed-integer-float': "string",
        'string': "string",
        'unicode': "string",
        'integer': "Int64",
        'floating': "Float64",
        'boolean': "boolean",
    }
    if kind in strict_targets:
        df[col] = df[col].astype(strict_targets[kind])
        return

    # Conversions that are only attempted; a failure is logged and ignored.
    lenient_targets = {
        'datetime': 'datetime64[ns]',
        'date': 'datetime64[ns]',
        'timedelta': 'timedelta64[ns]',
        'complex': 'complex128'
    }
    target = lenient_targets.get(kind)
    if not target:
        return
    try:
        df[col] = df[col].astype(target)
    except Exception as error:
        logging.warning(f"Failed to convert column '{col}' to type '{target}': {error}")


def update_schema_metadata(schema: pa.Schema, df: pd.DataFrame, original_names: Dict[str, str],
                           rm_metadata: Dict[str, Any], default_meta: Any) -> pa.Schema:
    """
    Returns a copy of the schema in which each field carries its column's metadata.

    For every DataFrame column the metadata is looked up by the column's original
    name, normalized, and stored on the schema field with UTF-8 encoded keys and
    string-converted, UTF-8 encoded values. Columns without metadata are left
    untouched, and columns missing from the schema are skipped with a warning.

    Args:
        schema (pa.Schema): The initial PyArrow schema derived from the DataFrame.
        df (pd.DataFrame): The DataFrame whose columns correspond to the schema fields.
        original_names (Dict[str, str]): A mapping from potentially renamed column names to their original names.
        rm_metadata (Dict[str, Any]): A dictionary containing metadata for columns, keyed by original column names.
        default_meta (Any): The default metadata to apply if a column's specific metadata is not available.

    Returns:
        pa.Schema: The updated PyArrow schema with metadata attached to each field where available.
    """
    updated_schema = schema

    for column in df.columns:
        lookup_name = original_names.get(column, column)
        column_meta = normalize_metadata(rm_metadata.get(lookup_name, default_meta))
        if not column_meta:
            continue

        position = updated_schema.get_field_index(column)
        if position == -1:
            logging.warning(f"Column '{column}' not found in schema. Skipping metadata assignment.")
            continue

        # Field metadata is stored as bytes, so keys and values are encoded first.
        encoded_meta = {
            key.encode('utf-8'): str(value).encode('utf-8')
            for key, value in column_meta.items()
        }
        annotated_field = updated_schema.field(position).with_metadata(encoded_meta)
        updated_schema = updated_schema.set(position, annotated_field)

    return updated_schema


def rename_columns(dataframe: pd.DataFrame) -> Dict[str, str]:
    """
    Renames invalid DataFrame columns, in place, by adding a predefined prefix.

    A column name is invalid when ``is_invalid_name`` says so. The new name is
    COLUMN_RENAME_PREFIX followed by the original name. If that name is already
    used by an existing column or by a previously generated name, a numeric
    suffix (``_1``, ``_2``, ...) is appended until it is unique.

    Args:
        dataframe (pd.DataFrame): The DataFrame whose columns are to be renamed.

    Returns:
        Dict[str, str]: A dictionary mapping new column names to their original names.
        Empty when no column was renamed.
    """
    original_names: Dict[str, str] = {}
    new_columns = []
    # All labels that are, or will be, in use. Generated names are added as they
    # are chosen so that two invalid names (e.g. 1 and '1') cannot collapse into
    # the same new label.
    taken_names = set(dataframe.columns)

    for name in dataframe.columns:
        if not is_invalid_name(name):
            new_columns.append(name)
            continue

        base_name = f"{COLUMN_RENAME_PREFIX}{name}"
        new_name = base_name
        counter = 0
        while new_name in taken_names:
            counter += 1
            new_name = f"{base_name}_{counter}"

        taken_names.add(new_name)
        original_names[new_name] = name
        new_columns.append(new_name)

    if original_names:
        dataframe.columns = new_columns

    return original_names


def is_invalid_name(name: Any) -> bool:
    """
    Tells whether a column name is invalid.

    A name is invalid if it is not a string, or if, ignoring surrounding
    whitespace, it is empty or consists solely of digits.
    """
    if not isinstance(name, str):
        return True
    stripped = name.strip()
    return stripped == '' or stripped.isdigit()


def serialize_file_object(file_obj: Any, index: int) -> None:
    """
    Writes the absolute path of a file object into a '.foi' output file.

    The path is taken from ``file_obj.name``; a relative path is resolved against
    the user directory. The '.foi' file name is relative, so it is created in the
    current working directory.

    Args:
        file_obj (Any): The file object to serialize.
        index (int): The index of the current output for filename generation.

    Raises:
        SystemExit: Exits with EXIT_SERIALIZATION_ERROR if the '.foi' file cannot be written.
    """
    referenced_path = file_obj.name
    if not os.path.isabs(referenced_path):
        # Relative names are interpreted relative to the user's working directory.
        referenced_path = os.path.join(user_directory, referenced_path)

    pointer_filename = f"{OUTPUT_FILENAME_PATTERN.format(index)}.foi"

    try:
        with utils.open_file(pointer_filename, 'w') as pointer_file:
            utils.write_file(pointer_file, referenced_path)
    except Exception as error:
        handle_exception(error)
        sys.exit(EXIT_SERIALIZATION_ERROR)


def serialize_generic_object(obj: Any, index: int) -> None:
    """
    Pickles an arbitrary object into a '.bin' output file.

    The file name is relative, so the file is created in the current working directory.

    Args:
        obj (Any): The object to serialize.
        index (int): The index of the current output for filename generation.

    Raises:
        SystemExit: Exits with EXIT_SERIALIZATION_ERROR if the object cannot be pickled or written.
    """
    output_filename = f"{OUTPUT_FILENAME_PATTERN.format(index)}.bin"

    try:
        with open(output_filename, 'wb') as sink:
            pickle.dump(obj, sink)
    except Exception as error:
        handle_exception(error)
        sys.exit(EXIT_SERIALIZATION_ERROR)


def serialize_macros(macros: Dict[Any, Any], file_index: int) -> None:
    """
    Serializes the macros into a JSON file with the '.json-macros' extension.

    Each macro value is converted to a string. If conversion fails, the macro is skipped and a warning is logged.
    The file name is relative, so the file is created in the current working directory.

    Args:
        macros (Dict[Any, Any]): The macros to serialize.
        file_index (int): The index used in the output filename.

    Raises:
        SystemExit: Exits with EXIT_SERIALIZATION_ERROR if encoding or writing the JSON file fails.
    """
    out_macros: Dict[str, str] = {}

    for key, value in macros.items():
        try:
            out_macros[key] = str(value)
        except Exception as e:
            logging.warning(f"Error while setting value for macro '{key}': {e}. Skipping this macro.")

    # Handle failures the same way as the other serializers, so a write error is
    # reported as a serialization error as the documentation promises.
    try:
        with utils.open_file("rapidminer_output%03d.json-macros" % (file_index), 'w') as f:
            utils.write_file(f, json.dumps(out_macros))
    except Exception as e:
        handle_exception(e)
        sys.exit(EXIT_SERIALIZATION_ERROR)

##############################################################################
# Main Execution
##############################################################################


# Script entry point: validate the environment, load the user's script, run
# rm_main on the deserialized inputs and write the results. The names assigned
# here (temporary_directory, user_directory, rapidminer_numberOfOutputs) are
# module globals that the serialization functions above read.
if __name__ == "__main__":
    check_pandas_version(pd.__version__, __required_pandas_version__)
    check_pyarrow_version(__required_pyarrow_version__)

    # Configure module import paths
    # The working directory at start-up is where the input files are read from
    # and the output files are written to.
    temporary_directory = os.getcwd()
    # Raises KeyError if the WORKING_DIRECTORY environment variable is not set.
    user_directory = os.environ['WORKING_DIRECTORY']
    import_userscript(user_directory)

    # Parse the number of outputs which the operator expects
    # Going through float() also accepts values such as "2.0".
    rapidminer_numberOfOutputs = int(float(sys.argv[1]))

    try:
        inputs, macros = deserialize()
        if not hasattr(userscript, 'rm_main'):
            error_msg = "Error: 'rm_main' function not found in userscript."
            logging.error(error_msg)
            utils.write_to_error_log(error_msg)
            # SystemExit is not an Exception subclass, so the handler at the
            # bottom does not intercept this exit code.
            sys.exit(EXIT_RM_MAIN_NOT_FOUND)

        # Macros are passed as the keyword argument 'macros' only when a macros file was provided.
        named_params = {"macros": macros} if macros is not None else {}

        try:
            try:
                # The user's code runs with the user directory as working directory.
                os.chdir(user_directory)
                result = userscript.rm_main(*inputs, **named_params)
            finally:
                # Always switch back so the outputs land in the temporary directory.
                os.chdir(temporary_directory)
        except Exception as e:
            handle_exception(e)
            sys.exit(EXIT_USER_SCRIPT_EXECUTION_ERROR)

        if result is not None and rapidminer_numberOfOutputs > 0:
            serialize_results(result)

        if macros is not None:
            # The macros file uses the index right after the last possible output index.
            serialize_macros(macros, rapidminer_numberOfOutputs)

    except Exception as e:
        # Unknown/non-user error during the process
        handle_exception(e)
        sys.exit(EXIT_UNKNOWN_ERROR)
