# coding=utf-8

##
# RapidMiner Scripting Language Integration Extension
#
# Copyright (C) 2015-2025 RapidMiner GmbH
##

# description: Reads the input from the input files, calls the rm_main method of the userscript 
# with the deserialized inputs as parameters and stores the result in output files.
# The input files are expected to have a file name of the form rapidminer_input%03d.* and the output files have 
# file names of the form rapidminer_output%03d.* Expects as parameter the number of output ports.

# author: Gisa Schaefer, Sabrina Kirstein
# USED UNDER COMPATIBILITY LEVEL 11.0.0

# module version string
__version__ = "9.9.0"
# forwarded to u.write_example_set when a DataFrame is written in the csv-encoded format
__hdf5_compliant_date_and_time_conversion__ = False
# minimum pandas version ("major.minor") that check_pandas_version enforces
__required_pandas_version__ = "1000.0" # this value should be overwritten in Java code
        
import sys
from glob import glob
import json
import os
import traceback
import datetime

try:
    import cPickle as pickle
except:
    import pickle
try:
    import pandas
except:
    print("pandas module not found")
    sys.exit(50)

import serdeutils as u

def _leading_int(component):
    """Return the integer formed by the leading ASCII digits of *component*.

    Returns 0 when *component* does not start with a digit, so suffixes such
    as "1rc1" or "0b2" do not break the version comparison.
    """
    digits = ''
    for char in component:
        if char not in '0123456789':
            break
        digits += char
    return int(digits) if digits else 0


def _major_minor(version):
    """Return the (major, minor) tuple of a dotted version string.

    A missing minor component is treated as 0.
    """
    parts = str(version).split('.')
    major = _leading_int(parts[0])
    minor = _leading_int(parts[1]) if len(parts) > 1 else 0
    return (major, minor)


def check_pandas_version(actual, required):
    """Exit with code 51 if pandas version *actual* is older than *required*.

    Only the major and minor components are compared. Non-numeric suffixes
    (for example "2.1rc1") and a missing minor component (for example "3")
    are tolerated instead of raising ValueError/IndexError.

    On failure a message is printed, both version strings are written to the
    error log (one per line) and the process exits; otherwise None is returned.
    """
    if _major_minor(actual) < _major_minor(required):
        print("pandas version "+actual+" not supported, at least version "+required+" is needed")
        u.write_to_error_log(actual+"\n"+required)
        sys.exit(51)

# abort early (exit code 51) if the installed pandas is older than the required version
check_pandas_version(pandas.__version__, __required_pandas_version__)


#prints the exception and writes it to the error file
#"userscript.py" is replaced by "script"             
def handleException(e):
    """Print a sanitized report for exception *e* and write it to the error log.

    Every occurrence of "userscript.py" in the message is replaced by
    "script". Line numbers that refer to the user script are reduced by one
    before they are reported.

    * Syntax errors (including the IndentationError and TabError subclasses):
      the offending source line and caret are printed and the line number in
      the message is shifted.
    * All other exceptions: the traceback of the exception currently being
      handled is printed starting at the frame of ``rm_main``; frames located
      in the user script file are renamed to "script" and the last such line
      is appended to the message.

    The final "<ExceptionName>: <message>" line is printed and passed to
    u.write_to_error_log.
    """
    message = str(e).replace("userscript.py", "script")
    lineinfo = ''
    # isinstance instead of an exact type comparison: IndentationError and
    # TabError derive from SyntaxError and need the same treatment
    if isinstance(e, SyntaxError):
        # wrong syntax with caret is contained in middle lines
        print(''.join(traceback.format_exception_only(type(e), e)[1:-1]))
        # line is already contained in message and must be shifted by one;
        # lineno is None when the error carries no location information
        if e.lineno is not None:
            message = message.replace("line " + str(e.lineno), "line " + str(e.lineno - 1))
    else:
        # extract function name, line number and line content
        frames = traceback.extract_tb(sys.exc_info()[2])
        script_filename = ''
        inside_user_code = False
        last_script_line = -1
        sanitized_frames = []
        # keep only the portion of the traceback that starts with rm_main
        for (filename, line_number, function, text) in frames:
            if function == 'rm_main':
                script_filename = filename
                inside_user_code = True
            if not inside_user_code:
                continue
            if filename == script_filename:
                # change name of temp filename
                filename = 'script'
                # line number must be shifted by 1
                line_number -= 1
                last_script_line = line_number
            sanitized_frames.append((filename, line_number, function, text))
        # print the sanitized traceback
        print("Traceback (most recent call last):")
        print(''.join(traceback.format_list(sanitized_frames)))
        if last_script_line >= 0:
            lineinfo = " (script, line {})".format(last_script_line)
    full_message = "{}: {}".format(type(e).__name__, message + lineinfo)
    print(full_message)
    u.write_to_error_log(full_message)


# allow to import modules relative to the script location
# folder the wrapper was started in; the working directory is restored to it after each switch
__tmp_folder = os.getcwd()
# folder taken from the WORKING_DIRECTORY environment variable (KeyError if it is not set)
__user_folder = os.environ['WORKING_DIRECTORY']
sys.path.append(__user_folder)

try:
    try:
        # import the user script while the user folder is the working directory
        os.chdir(__user_folder)
        import userscript
    finally:
        # always switch back, even if the import failed
        os.chdir(__tmp_folder)
except Exception as e: 
    # report the failure via handleException and stop with exit code 55
    print("failed to parse the script")
    handleException(e)
    sys.exit(55)


# the number of outputs which the operator expects    
# first command line argument; int(float(...)) also accepts a value formatted like "2.0"
rapidminer_numberOfOutputs = int(float(sys.argv[1]))

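# checks for files of the form 'rapidminer_input*.*' (processed in sorted order),
# reads those with extension .csv-encoded (or legacy .csv) into pandas.DataFrames,
# reads those with extension .bin into python objects,
# reads those with extension .foi into file objects,
# reads those with extension .connection via u.read_connection and
# reads the (single) file with extension .json-macros into the macros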
def deserialize():
    """Read all 'rapidminer_input*.*' files of the current working directory.

    The files are processed in sorted order and dispatched on their extension:

    * ``.csv-encoded``: read together with the matching ``.pmd-encoded`` file
      via u.read_example_set
    * ``.csv``: legacy format, read via readExampleSet
    * ``.bin``: unpickled into a python object (exit code 65 on failure)
    * ``.foi``: contains a path; the referenced file is opened for reading
      (exit code 70 if it does not exist)
    * ``.connection``: read via u.read_connection using the SERDE_KEY
      environment variable
    * ``.json-macros``: parsed as JSON into the macros (exit code 65 if more
      than one such file exists)

    Files with any other extension are ignored.

    Returns:
        tuple (inputs, macros): the list of deserialized inputs and the parsed
        macros, or None for the macros if no macro file was found.
    """
    inputs = []
    macros = None
    for input_file in sorted(glob('rapidminer_input*.*')):
        (path, extension) = os.path.splitext(input_file)
        if extension == '.csv-encoded':
            with u.open_file(path + ".csv-encoded", 'r') as input_csv:
                with u.open_file(path + ".pmd-encoded", 'r') as input_pmd:
                    inputs.append(u.read_example_set(input_csv, input_pmd))
        elif extension == '.csv':
            # legacy
            inputs.append(readExampleSet(input_file))
        elif extension == '.bin':
            try:
                with open(input_file, 'rb') as load_file:
                    # NOTE(security): pickle executes arbitrary code while loading;
                    # the .bin inputs must only come from a trusted source
                    inputs.append(pickle.load(load_file))
            except Exception as e:
                handleException(e)
                sys.exit(65)
        elif extension == '.foi':
            with u.open_file(input_file, 'r') as f:
                content = u.read_file(f)
            if os.path.exists(content):
                # deliberately left open: the file object itself is the input
                inputs.append(open(content, 'r'))
            else:
                # format the message before printing; print() returns None in
                # python 3, so calling .format on its result raised an
                # AttributeError instead of reporting the missing file
                print("File not found. The file {} does not exist.".format(content))
                u.write_to_error_log("The file {} does not exist.".format(content))
                sys.exit(70)
        elif extension == '.connection':
            key = os.environ['SERDE_KEY']
            with open(input_file) as fp:
                inputs.append(u.read_connection(fp, key))
        elif extension == '.json-macros':
            if macros is not None:
                print("ERROR: multiple macros provided")
                sys.exit(65)
            with u.open_file(input_file, 'r') as f:
                macros = json.loads(u.read_file(f))
    return (inputs, macros)

# writes the result(s) into files of the form rapidminer_output*.*,
# if the result is a tuple, all entries are treated separately
# exports pandas.DataFrames to csv-encoded files or legacy csv-files,
# files to foi-files (containing the file path) and other objects except for tuples
# are serialized and saved in a .bin-file
def serialize(result):
    """Write the result(s) of the user script to 'rapidminer_output%03d.*' files.

    A result that is not exactly a tuple is treated as a single entry. At most
    rapidminer_numberOfOutputs entries are written; each entry is handled
    according to its type:

    * pandas.DataFrame: written in the format named by the EXAMPLESET_FORMAT
      environment variable, either 'csv-encoded' (with a .pmd-encoded file) or
      legacy 'csv' (with a .pmd file); any other value is logged and ends the
      process with exit code 70
    * file object: its path is written to a .foi file; a relative path is
      resolved against the user folder
    * anything else: pickled into a .bin file (exit code 66 on failure)
    """
    entries = result if type(result) is tuple else (result,)
    for index, entry in enumerate(entries):
        # never write more outputs than the operator expects
        if index == rapidminer_numberOfOutputs:
            break
        if isinstance(entry, pandas.DataFrame):
            exampleset_format = os.environ["EXAMPLESET_FORMAT"]
            if exampleset_format == 'csv-encoded':
                output_path = "rapidminer_output%03d" % index
                with u.open_file(output_path + ".csv-encoded", 'w') as output_csv:
                    with u.open_file(output_path + ".pmd-encoded", 'w') as output_pmd:
                        u.write_example_set(
                            entry,
                            output_csv,
                            output_pmd,
                            hdf5_compliant_date_and_time_conversion=__hdf5_compliant_date_and_time_conversion__,
                        )
            elif exampleset_format == 'csv':
                # legacy: meta data file first, then sanitized column names, then the data
                handleMetaData(entry, index)
                checkColumnNames(entry)
                entry.to_csv("rapidminer_output%03d.csv" % index, index=False, encoding='utf-8')
            else:
                u.write_to_error_log("No example set serialization format specified.")
                sys.exit(70)
        elif u.is_file_object(entry):
            # write path in foi-file
            target_path = entry.name
            if target_path != os.path.abspath(target_path):
                # with relative path, the file will be in the working directory
                target_path = os.path.join(__user_folder, target_path)
            with u.open_file("rapidminer_output%03d.foi" % index, 'w') as foi_file:
                u.write_file(foi_file, target_path)
        else:
            try:
                with open("rapidminer_output%03d.bin" % index, 'wb') as dump_file:
                    pickle.dump(entry, dump_file)
            except Exception as e:
                handleException(e)
                sys.exit(66)

# serializes the macros (values converted to strings) as JSON into the output file with the given index;
# a single file is written, its extension is json-macros
def serialize_macros(macros, starting_file_index):
    """Write the macros as a JSON object to 'rapidminer_output%03d.json-macros'.

    Every macro value is converted with str(). A macro whose value cannot be
    converted is left out and a warning is printed instead.

    Args:
        macros: mapping of macro names to values.
        starting_file_index: index used in the name of the output file.
    """
    stringified = {}
    for name in macros:
        try:
            value = str(macros[name])
        except Exception as e:
            # don't pass macro value in case of error
            print("WARNING: Error while setting value for macro: '" + str(e) + "'.")
        else:
            stringified[name] = value
    target_name = "rapidminer_output%03d.json-macros" % (starting_file_index)
    with u.open_file(target_name, 'w') as f:
        u.write_file(f, json.dumps(stringified))

##############################################################################
# Legacy code kept for backward compatibility - do not change/fix it - BEGIN #
##############################################################################

# legacy
# strptime/strftime pattern used by input_date_format and output_date_format (%f = microseconds)
date_format_pattern = "%Y-%m-%d %H:%M:%S.%f"

# legacy
def input_date_format(date_string):
    """Parse *date_string* with date_format_pattern into a datetime.

    Returns None for an empty string or a value that pandas considers null.
    """
    if date_string == "" or pandas.isnull(date_string):
        return None
    return datetime.datetime.strptime(date_string, date_format_pattern)

# legacy
# reads the example set from the file into a pandas.DataFrame.
# checks if there is a metadata file and if this is the case
# attaches the metadata as pandas.DataFrame to the field metadata    
def readExampleSet(file):
    """Read a legacy csv example set and its optional .pmd meta data file.

    The meta data file has the same name as *file* but the extension .pmd and
    contains JSON. Each of its entries is converted to a tuple of its first two
    elements, where a second element equal to "attribute" is replaced by None.
    If reading the meta data fails, the exception is printed and None is
    attached instead.

    The csv is read with pandas, the meta data is attached via
    u.set_metadata_without_warning, and columns whose meta data type is
    "date_time", "date" or "time" are parsed with input_date_format.

    Returns the resulting pandas.DataFrame.
    """
    #check for same file name with meta data extension
    mdfile = os.path.splitext(file)[0]+'.pmd'
    meta_dict={}

    try:
        if sys.version_info >= (3, 0):
            with open(mdfile,'rt',encoding='utf-8') as data_file:    
                metadata = json.load(data_file)
        else:
            with open(mdfile) as data_file:    
                metadata = json.load(data_file, encoding = 'utf-8')
        #different iteration methods for python 2 and 3
        try:
            items = metadata.iteritems()
        except AttributeError:
            items = metadata.items()
        for key, value in items:
            #convert to tuple; the role "attribute" is stored as None
            meta_dict[key]=(value[0],None if value[1]=="attribute" else value[1])
        rm_meta = meta_dict.copy()
    except Exception as e:
        # meta data is optional: print the problem and continue without it
        # (meta_dict keeps the entries converted before the failure)
        rm_meta = None
        print(e)
    data = pandas.read_csv(file,index_col=None,encoding='utf-8')
    data = u.set_metadata_without_warning(data, rm_meta)

    # note: the loop variable "type" shadows the builtin inside this function
    for date_column, [type, _] in meta_dict.items():
        try:
            if type in set(["date_time", "date", "time"]):
                data[date_column] = data[date_column].apply(lambda date: input_date_format(date))
        except (TypeError, ValueError) as e:
            # on a parse failure the whole column is replaced by empty strings
            print("Error: failed to parse date/time in column '" + date_column + "' (" + str(e) + ").")
            data[date_column] = ""
    return data

# legacy
def output_date_format(dt):
    """Format *dt* with date_format_pattern; return "" for a null value.

    The formatted string is cut to its first 23 characters.
    """
    if pandas.isnull(dt):
        return ""
    # for a four-digit year, 23 characters keep three of the six %f digits (milliseconds)
    return dt.strftime(date_format_pattern)[:23]

# legacy
#writes the meta data to a file
#uses the meta data from rm_metadata attribute if present
#otherwise deduces the type from the data and sets no special role
#side effect: converts date columns
def handleMetaData(data,index):
    """Write the meta data of *data* to 'rapidminer_output%03d.pmd' as JSON.

    For every column a (type, role) pair is collected. The pair is taken from
    data.rm_metadata[column] if rm_metadata is a dict and the entry is a tuple
    of length 2. A missing role becomes 'attribute' and a missing type is
    deduced from the dtype kind of the column.

    Side effect: columns of type "date_time", "date" or "time" are converted
    in place to strings via output_date_format (or to "" if that fails).

    A failure to write the file is reported with a printed message only.
    """
    metadata_list = []
    
    #check if rm_metadata attribute is present and a dictionary
    try:
        if isinstance(data.rm_metadata,dict):
            meta_isdict=True
        else:
            meta_isdict = False
            if data.rm_metadata is not None:
                print("Warning: rm_metadata must be a dictionary")
    except:
        # accessing rm_metadata failed (e.g. the attribute is missing)
        meta_isdict=False
    
    for name in data.columns.values:
        try:
            meta = data.rm_metadata[name]
            #take entries only if tuple of length 2
            if isinstance(meta,tuple) and len(meta)==2 and meta_isdict:
                meta_type, meta_role = meta
            else:
                if meta_isdict and meta is not None:
                    print("Warning: rm_metadata["+name+"] must be a tuple of length 2, e.g. data.rm_metadata['column1']=('binominal','label')")
                meta_type = None
                meta_role= None
        except:
            # no usable meta data entry for this column
            meta_type = None
            meta_role= None
        
        if meta_role is None:
            meta_role = 'attribute'
        #choose type by dtype of the column
        if meta_type is None:
            kind_char = data.dtypes[name].kind
            if kind_char in ('i','u'):
                meta_type = 'integer'
            # note: ('f'), ('M') and ('b') are plain strings, not tuples, so
            # "in" is a substring test here
            elif kind_char in ('f'):
                meta_type = 'real'
            elif kind_char in ('M'):
                meta_type = 'date_time'
            elif kind_char in ('b'):
                meta_type = 'binominal'
            else:
                meta_type = 'polynomial'
        metadata_list.append((meta_type,meta_role))
        if meta_type in set(["date_time", "date", "time"]):
            try:
                # the lambda parameter named "datetime" shadows the module only inside the lambda
                data[name] = data[name].apply(lambda datetime: output_date_format(datetime))
            except Exception as e:
                print("Error: failed to format date/time (" + str(e) + ").")
                data[name] = ""
    #store as json
    try:
        with open("rapidminer_output%03d.pmd" % index, 'w') as dump_file:
            json.dump(metadata_list,dump_file)
    except Exception as e:
        print("Failed to send meta data from Python script to RapidMiner")

# legacy
# if value has a string representation containing non-ascii symbols in python 2, 
# for example if value is a python 2 unicode with umlauts, then str(value) results in exception;
# in this case it is in particular not empty and contains more than only digits 
def isstringable(value):
    """Return True if str(value) succeeds and False if it raises."""
    try: 
        str(value)
        return True
    except:
        return False

# legacy
def checkColumnNames(dataFrame):
    """Rename columns of *dataFrame* whose name is empty or consists only of digits.

    If at least one such column exists, all column names are replaced in
    place: offending names get the prefix 'att', all other names are
    converted with str().
    """
    # column name must not be empty or a number
    if any(isstringable(value) and ((not str(value)) or str(value).isdigit()) for value in dataFrame.columns.values):
        # NOTE(review): a name for which isstringable is False reaches the
        # str(value) in the else part here - confirm this cannot occur together
        # with an offending name
        new_columns = ['att'+str(value) if (isstringable(value) and ((not str(value)) or str(value).isdigit())) else str(value) for value in dataFrame.columns.values]
        dataFrame.columns = new_columns

############################################################################
# Legacy code kept for backward compatibility - do not change/fix it - END #
############################################################################

def _run_user_main(inputs, named_params):
    """Call rm_main of the user script with the user folder as working directory.

    The working directory is switched back to the temporary folder before the
    result is returned or an exception propagates.
    """
    try:
        os.chdir(__user_folder)
        return userscript.rm_main(*inputs, **named_params)
    finally:
        os.chdir(__tmp_folder)


# Entry point: deserialize the inputs, run rm_main and serialize the results.
# Exit codes: 61 if the script has no rm_main, 60 if rm_main raised,
# 1 for any other (non-user) error.
if __name__ == "__main__":
    try:
        inputs, macros = deserialize()
        if not hasattr(userscript, 'rm_main'):
            sys.exit(61)
        # the macros are only passed if a macro file was provided
        named_params = {} if macros is None else {"macros": macros}
        try:
            result = _run_user_main(inputs, named_params)
        except Exception as e:
            handleException(e)
            sys.exit(60)
        if result is not None and rapidminer_numberOfOutputs > 0:
            serialize(result)
        if macros is not None:
            # the macro file uses the index following the regular outputs
            serialize_macros(macros, rapidminer_numberOfOutputs)
    except Exception as e:
        # unknown/non-user error
        handleException(e)
        sys.exit(1)


