# Source code for aido.interface

import os
from abc import ABC, abstractmethod
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
import torch

from aido.monitoring.logger import WandbLogger
from aido.simulation_helpers import SimulationParameterDictionary


class _UserInterfaceBase(ABC):
    """Private base class declaring shared attribute types and a no-op dataset hook."""

    def __init__(self) -> None:
        """Declare the types of the instance attributes without assigning them."""
        # Bare annotations: these statements only document the expected types;
        # they do not create the attributes on the instance.
        # NOTE(review): presumably both are assigned elsewhere before use — confirm.
        self.wandb_logger: WandbLogger | None
        self.results_dir: str | os.PathLike

    @staticmethod
    def create_surrogate_dataset(
        parameter_dict: SimulationParameterDictionary,
        user_reco_loss: pd.Series | pd.DataFrame | np.ndarray,
        user_context: pd.Series | pd.DataFrame | None = None,
    ):
        """Placeholder hook: ignores all of its arguments and returns ``None``."""
        return None



class UserInterfaceBase(_UserInterfaceBase):
    """Abstract Base Class for the interface between AIDO and user-defined code.

    These methods must be implemented:
     - simulate
     - merge
     - reconstruct

    Optional methods to implement:
     - constraints
     - plot
     - loss
    """

    @abstractmethod
    def simulate(self, parameter_dict_path: str, sim_output_path: str) -> None:
        """Start the simulation process (this method must be implemented).

        We recommend starting a container and passing the arguments from the command line.

        Args:
            parameter_dict_path (str): The path to the parameter dictionary file.
            sim_output_path (str): The path to save the simulation output.

        Raises:
            NotImplementedError: Always, in this base implementation.

        Examples:
            To open the parameter dict in your python script:

            >>> with open(parameter_dict_path) as file:
            ...     parameter_dict = json.load(file)

            Access its items by name and the key 'current_value':

            >>> foo_value = parameter_dict["foo"]["current_value"]

            Use equivalent methods to open JSON files if using C++ or other languages.

        Important:
            The simulation should output exactly one file, which must be saved at 'sim_output_path'. You
            are free to choose the output format of the simulation (e.g. root file)
        """
        raise NotImplementedError

    @abstractmethod
    def merge(
            self,
            parameter_dict_file_paths: List[str],
            simulation_file_paths: List[str],
            reco_input_path: str
            ) -> None:
        """Merge parameter dicts and simulation outputs into one file (this method must be implemented).

        This method must merge the parameter dicts and the simulation outputs into a single file.
        Its file path will be passed by the scheduler to the 'reconstruct' method as the first
        argument ('reco_input_path').

        Args:
            parameter_dict_file_paths (List[str]): List of the simulation parameter dictionary paths
            simulation_file_paths (List[str]): List of the simulation output paths
            reco_input_path (str): Path for the merged file created by this method.

        Raises:
            NotImplementedError: Always, in this base implementation.

        Important:
            The output file generated by this method must be a parquet file of a pandas.DataFrame. The
            format of this DataFrame has to be such that these columns exist.

            - df["Parameters"]: The parameters given in the format of
                :meth:`SimulationParameterDictionary.to_df(<length>, display_discrete="as_one_hot")`,
                where `length` is the number of total events.
            - df["Inputs"]: Relevant input information fed to the reconstruction algorithm
            - df["Targets"]: The ground truth fed to the reconstruction algorithm
            - df["Context"]: Additional information (such as Particle ID)

        Example:
            You can easily construct such a DataFrame by using a dict and passing it to pandas. Here
            we will first build the DataFrame for each simulation result and then concatenate them together.

            For a single simulation result:

            Load the SimulationParameterDictionary with

            >>> parameter_dict = aido.SimulationParameterDictionary.from_json(parameter_dict_path)
            >>> parameter_df = parameter_dict.to_df(<length>, display_discrete="as_one_hot")

            Where <length> is the total number of events. Now build the dict

            >>> df_combined_dict = {
            ...     "Parameters": parameter_df,
            ...     "Inputs": input_df[input_keys],
            ...     "Targets": input_df[target_keys],
            ...     "Context": input_df[context_keys],
            ... }

            Where input_df is a :class:`pandas.DataFrame` of arbitrary columns and <length> rows (e.g. the
            number of events). This way we can concatenate them in the following step:

            >>> df: pd.DataFrame = pd.concat(
            ...     df_combined_dict.values(),
            ...     keys=df_combined_dict.keys(),
            ...     axis=1,
            ... )

            Finally, we have to concatenate the different simulations together along axis=0 (the event axis).
            Here is one way to do it:

            >>> df_list: list[pd.DataFrame] = []
            >>> for simulation_result in simulation_file_paths:
            ...     # Some code that returns the DataFrame for this simulation
            ...     # For example a function f(sim_param_path, sim_input_path) -> df_i
            ...     df_list.append(<df_i>)

            Next, concatenate everything. Passing 'ignore_index=True' renumbers the rows from 0 to n - 1;
            without it the index of each individual simulation would be duplicated.

            >>> df: pd.DataFrame = pd.concat(df_list, axis=0, ignore_index=True)

            Finished! Now simply save this DataFrame to a parquet file with:

            >>> df.to_parquet(reco_input_path)

        """
        raise NotImplementedError

    @abstractmethod
    def reconstruct(self, reco_input_path: str, reco_output_path: str, is_validation: bool = False) -> None:
        """Start the reconstruction algorithm (this method must be implemented).

        We recommend using a container and starting the reconstruction from the command line.

        Args:
            reco_input_path (str): Path of the input file for your reconstruction process. It is the same
                path as the output of the 'merge' method.
            reco_output_path (str): Path of the output file generated by your reconstruction process. Since
                this file interfaces with the AIDO Optimizer, it must have a specific format.
            is_validation (bool): Useful to define a distinct behavior for regular reconstruction and for
                evaluation.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        # NOTE(review): the required format of 'reco_output_path' is not described
        # anywhere in this docstring — TODO document it here.
        raise NotImplementedError

    def constraints(
            self,
            parameter_dict: SimulationParameterDictionary,
            parameter_dict_as_tensor: Dict[str, torch.Tensor]
            ) -> None | torch.Tensor:
        """Compute additional constraints (this method is optional).

        Use this method to compute additional constraints such as cost or dimensions using pytorch. The resulting
        Tensor must be one-dimensional and include gradients.

        Args:
            parameter_dict (SimulationParameterDictionary): The current parameter dictionary.
            parameter_dict_as_tensor (Dict[str, torch.Tensor]): Tensors keyed by string (presumably the
                parameter names; confirm against the caller).

        Returns:
            None | torch.Tensor: The constraint tensor. The default implementation returns None.
        """
        return None

    def plot(self, parameter_dict: SimulationParameterDictionary) -> None:
        """Run user code after each iteration (this method is optional).

        Use this method to execute code after each iteration. This can be anything used to track the
        progress of the Optimization process. Errors that occur during the execution of this function
        will be automatically caught and displayed as warnings.

        Args:
            parameter_dict (SimulationParameterDictionary): The current parameter dictionary.

        Returns:
            None: The default implementation does nothing.
        """
        return None

    def loss(self, y: torch.Tensor, y_pred: torch.Tensor) -> torch.Tensor:
        """Compute the loss of the internal Optimizer (this method is optional).

        This must be an equivalent implementation to your reconstruction loss.

        Args:
            y (torch.Tensor): Presumably the ground truth values (confirm against the caller).
            y_pred (torch.Tensor): Presumably the predicted values (confirm against the caller).

        Returns:
            torch.Tensor: The loss computed by an overriding implementation.

        Raises:
            NotImplementedError: Always, in this default implementation.
        """
        raise NotImplementedError