Model

Bases: StaticCollection

A class describing a model that was trained on a particular dataset.

Attributes:

Name	Type	Description
`name`	`String`	The name of the model.
`metadata`	`Dictionary`	A dictionary of metadata that describes the model.

Examples:

>>> Model.create(name="model1")
>>> Model.create(name="model1", metadata={})
>>> Model.create(name="model1", metadata={"foo": "bar", "pi": 3.14})

Source code in valor/coretypes.py

class Model(StaticCollection):
    """
    A class describing a model that was trained on a particular dataset.

    Attributes
    ----------
    name : String
        The name of the model.
    metadata : Dictionary
        A dictionary of metadata that describes the model.

    Examples
    --------
    >>> Model.create(name="model1")
    >>> Model.create(name="model1", metadata={})
    >>> Model.create(name="model1", metadata={"foo": "bar", "pi": 3.14})
    """

    name: String = String.symbolic(owner="model", name="name")
    metadata: Dictionary = Dictionary.symbolic(owner="model", name="metadata")

    def __init__(
        self,
        *,
        name: str,
        metadata: Optional[dict] = None,
        connection: Optional[ClientConnection] = None,
    ):
        """
        Creates a local instance of a model.

        Use 'Model.create' classmethod to create a model with persistence.

        Parameters
        ----------
        name : str
            The name of the model.
        metadata : dict, optional
            A dictionary of metadata that describes the model. An omitted or
            empty value is replaced by a new empty dictionary.
        connection : ClientConnection, optional
            An initialized client connection.
        """
        self.conn = connection
        super().__init__(name=name, metadata=metadata if metadata else dict())

    @classmethod
    def create(
        cls,
        name: str,
        metadata: Optional[Dict[str, Any]] = None,
        connection: Optional[ClientConnection] = None,
        **_,
    ) -> Model:
        """
        Creates a model that persists in the back end.

        Parameters
        ----------
        name : str
            The name of the model.
        metadata : dict, optional
            A dictionary of metadata that describes the model.
        connection : ClientConnection, optional
            An initialized client connection.

        Returns
        -------
        Model
            The newly created model.
        """
        model = cls(name=name, metadata=metadata, connection=connection)
        Client(connection).create_model(model)
        return model

    @classmethod
    def get(
        cls,
        name: str,
        connection: Optional[ClientConnection] = None,
    ) -> Union[Model, None]:
        """
        Retrieves a model from the back end database.

        Parameters
        ----------
        name : str
            The name of the model.
        connection : ClientConnection, optional
            An optional Valor client object for interacting with the API.

        Returns
        -------
        Union[valor.Model, None]
            The model or 'None' if it doesn't exist.
        """
        return Client(connection).get_model(name)

    def add_prediction(
        self,
        dataset: Dataset,
        prediction: Prediction,
    ) -> None:
        """
        Add a prediction to the model.

        Parameters
        ----------
        dataset : valor.Dataset
            The dataset that is being operated over.
        prediction : valor.Prediction
            The prediction to create.
        """
        Client(self.conn).create_predictions(
            dataset=dataset,
            model=self,
            predictions=[prediction],
        )

    def add_predictions(
        self,
        dataset: Dataset,
        predictions: List[Prediction],
        timeout: Optional[float] = 10.0,
    ) -> None:
        """
        Add multiple predictions to the model.

        Parameters
        ----------
        dataset : valor.Dataset
            The dataset that is being operated over.
        predictions : List[valor.Prediction]
            The predictions to create.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.
        """
        Client(self.conn).create_predictions(
            dataset=dataset,
            model=self,
            predictions=predictions,
            timeout=timeout,
        )

    def get_prediction(
        self, dataset: Union[Dataset, str], datum: Union[Datum, str]
    ) -> Union[Prediction, None]:
        """
        Get a particular prediction.

        Parameters
        ----------
        dataset : Union[Dataset, str]
            The dataset the datum belongs to.
        datum : Union[Datum, str]
            The desired datum.

        Returns
        -------
        Union[Prediction, None]
            The matching prediction or 'None' if it doesn't exist.
        """
        return Client(self.conn).get_prediction(
            dataset=dataset, model=self, datum=datum
        )

    def finalize_inferences(self, dataset: Union[Dataset, str]) -> None:
        """
        Finalizes the model over a dataset such that new predictions cannot be added to it.

        Parameters
        ----------
        dataset : Union[Dataset, str]
            The dataset to finalize the model over.
        """
        return Client(self.conn).finalize_inferences(
            dataset=dataset, model=self
        )

    def _create_label_map(
        self,
        label_map: Optional[Dict[Label, Label]],
    ) -> Union[List[List[List[str]]], None]:
        """
        Convert a dictionary of label maps to a serializable list format.

        Parameters
        ----------
        label_map : Dict[Label, Label], optional
            Mapping from a label to the label it should be grouped under.

        Returns
        -------
        Union[List[List[List[str]]], None]
            One `[[key, value], [key, value]]` entry per mapping (source label
            first, target label second), or 'None' when `label_map` is empty
            or 'None'.

        Raises
        ------
        TypeError
            If `label_map` is not a dictionary of `Label` to `Label`, or if any
            label has a key or value that is not a `str`.
        """
        if not label_map:
            return None

        if not isinstance(label_map, dict) or not all(
            isinstance(key, Label) and isinstance(value, Label)
            for key, value in label_map.items()
        ):
            raise TypeError(
                "label_map should be a dictionary with valid Labels for both the key and value."
            )

        serialized = []
        for source, target in label_map.items():
            for label in (source, target):
                if not (
                    isinstance(label.key, str)
                    and isinstance(label.value, str)
                ):
                    raise TypeError(
                        "Every label in label_map must have a string key and a string value."
                    )
            serialized.append(
                [
                    [source.key, source.value],
                    [target.key, target.value],
                ]
            )
        return serialized

    def _build_evaluation_request(
        self,
        datasets: Union[Dataset, List[Dataset]],
        filters: Optional[Filter],
        parameters: EvaluationParameters,
    ) -> EvaluationRequest:
        """Assemble an evaluation request for this model over one or more datasets."""
        dataset_list = datasets if isinstance(datasets, list) else [datasets]
        return EvaluationRequest(
            dataset_names=[dataset.name for dataset in dataset_list],  # type: ignore - issue #604
            model_names=[self.name],  # type: ignore - issue #604
            # A missing (or falsy) filter falls back to a default Filter.
            filters=filters if filters else Filter(),
            parameters=parameters,
        )

    def _submit_evaluation(
        self, request: EvaluationRequest, **kwargs: Any
    ) -> Evaluation:
        """Send a request to the client and return the single evaluation it yields."""
        evaluations = Client(self.conn).evaluate(request, **kwargs)
        # Each request names exactly one model, so one evaluation is expected.
        if len(evaluations) != 1:
            raise RuntimeError(
                f"Expected exactly one evaluation for the request but received {len(evaluations)}."
            )
        return evaluations[0]

    def _resolve_llm_api_params(
        self,
        llm_api_params: Optional[Dict[str, Union[str, dict]]],
    ) -> Optional[Dict[str, Union[str, dict]]]:
        """Return the LLM API parameters, adding an API key from the environment when none was given."""
        if llm_api_params is None or "api_key" in llm_api_params:
            return llm_api_params

        if "client" not in llm_api_params:
            raise ValueError(
                "The client must be specified in the llm_api_params."
            )

        client_name = llm_api_params["client"]
        if client_name == "openai":
            api_key = os.getenv("OPENAI_API_KEY", None)
        elif client_name == "mistral":
            api_key = os.getenv("MISTRAL_API_KEY", None)
        elif client_name == "mock":
            api_key = ""
        else:
            raise ValueError(
                "The client specified in llm_api_params is not supported."
            )

        if api_key is None:
            return llm_api_params
        # Write the key into a copy so the caller's dictionary is left untouched.
        return {**llm_api_params, "api_key": api_key}

    def evaluate_classification(
        self,
        datasets: Union[Dataset, List[Dataset]],
        filters: Optional[Filter] = None,
        label_map: Optional[Dict[Label, Label]] = None,
        pr_curve_max_examples: int = 1,
        metrics_to_return: Optional[List[MetricType]] = None,
        *_,
        allow_retries: bool = False,
        timeout: Optional[float] = None,
    ) -> Evaluation:
        """
        Start a classification evaluation job.

        Parameters
        ----------
        datasets : Union[Dataset, List[Dataset]]
            The dataset or list of datasets to evaluate against.
        filters : Filter, optional
            Optional set of constraints to filter evaluation by.
        label_map : Dict[Label, Label], optional
            Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
        pr_curve_max_examples : int, default=1
            The maximum number of datum examples to store when calculating PR curves.
        metrics_to_return : List[MetricType], optional
            The list of metrics to compute, store, and return to the user.
        allow_retries : bool, default = False
            Option to retry previously failed evaluations.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.

        Returns
        -------
        Evaluation
            A job object that can be used to track the status of the job and get the metrics of it upon completion.

        Raises
        ------
        ValueError
            If `metrics_to_return` contains a metric that is not a classification metric.
        TypeError
            If `label_map` is malformed.
        RuntimeError
            If the client does not return exactly one evaluation.
        """
        if metrics_to_return and not set(metrics_to_return).issubset(
            MetricType.classification()
        ):
            raise ValueError(
                f"The following metrics are not supported for classification: '{set(metrics_to_return) - MetricType.classification()}'"
            )

        request = self._build_evaluation_request(
            datasets,
            filters,
            EvaluationParameters(
                task_type=TaskType.CLASSIFICATION,
                label_map=self._create_label_map(label_map=label_map),
                pr_curve_max_examples=pr_curve_max_examples,
                metrics_to_return=metrics_to_return,
            ),
        )
        return self._submit_evaluation(
            request, allow_retries=allow_retries, timeout=timeout
        )

    def evaluate_detection(
        self,
        datasets: Union[Dataset, List[Dataset]],
        filters: Optional[Filter] = None,
        convert_annotations_to_type: Optional[AnnotationType] = None,
        iou_thresholds_to_compute: Optional[List[float]] = None,
        iou_thresholds_to_return: Optional[List[float]] = None,
        label_map: Optional[Dict[Label, Label]] = None,
        recall_score_threshold: float = 0,
        metrics_to_return: Optional[List[MetricType]] = None,
        pr_curve_iou_threshold: float = 0.5,
        pr_curve_max_examples: int = 1,
        *_,
        allow_retries: bool = False,
        timeout: Optional[float] = None,
    ) -> Evaluation:
        """
        Start an object-detection evaluation job.

        Parameters
        ----------
        datasets : Union[Dataset, List[Dataset]]
            The dataset or list of datasets to evaluate against.
        filters : Filter, optional
            Optional set of constraints to filter evaluation by.
        convert_annotations_to_type : enums.AnnotationType, optional
            Forces the object detection evaluation to compute over this type.
        iou_thresholds_to_compute : List[float], optional
            Thresholds to compute mAP against. Defaults to 0.5 through 0.95 in steps of 0.05.
        iou_thresholds_to_return : List[float], optional
            Thresholds to return AP for. Must be subset of `iou_thresholds_to_compute`. Defaults to [0.5, 0.75].
        label_map : Dict[Label, Label], optional
            Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
        recall_score_threshold : float, default=0
            The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
        metrics_to_return : List[MetricType], optional
            The list of metrics to compute, store, and return to the user.
        pr_curve_iou_threshold : float, optional
            The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5.
        pr_curve_max_examples : int, optional
            The maximum number of datum examples to store when calculating PR curves.
        allow_retries : bool, default = False
            Option to retry previously failed evaluations.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.

        Returns
        -------
        Evaluation
            A job object that can be used to track the status of the job and get the metrics of it upon completion.

        Raises
        ------
        ValueError
            If `metrics_to_return` contains a metric that is not an object-detection metric.
        TypeError
            If `label_map` is malformed.
        RuntimeError
            If the client does not return exactly one evaluation.
        """
        if metrics_to_return and not set(metrics_to_return).issubset(
            MetricType.object_detection()
        ):
            raise ValueError(
                f"The following metrics are not supported for object detection: '{set(metrics_to_return) - MetricType.object_detection()}'"
            )

        if iou_thresholds_to_compute is None:
            # Rounding keeps the generated thresholds free of float noise.
            iou_thresholds_to_compute = [
                round(0.5 + 0.05 * i, 2) for i in range(10)
            ]
        if iou_thresholds_to_return is None:
            iou_thresholds_to_return = [0.5, 0.75]

        request = self._build_evaluation_request(
            datasets,
            filters,
            EvaluationParameters(
                task_type=TaskType.OBJECT_DETECTION,
                convert_annotations_to_type=convert_annotations_to_type,
                iou_thresholds_to_compute=iou_thresholds_to_compute,
                iou_thresholds_to_return=iou_thresholds_to_return,
                label_map=self._create_label_map(label_map=label_map),
                recall_score_threshold=recall_score_threshold,
                metrics_to_return=metrics_to_return,
                pr_curve_iou_threshold=pr_curve_iou_threshold,
                pr_curve_max_examples=pr_curve_max_examples,
            ),
        )
        return self._submit_evaluation(
            request, allow_retries=allow_retries, timeout=timeout
        )

    def evaluate_segmentation(
        self,
        datasets: Union[Dataset, List[Dataset]],
        filters: Optional[Filter] = None,
        label_map: Optional[Dict[Label, Label]] = None,
        metrics_to_return: Optional[List[MetricType]] = None,
        *_,
        allow_retries: bool = False,
        timeout: Optional[float] = None,
    ) -> Evaluation:
        """
        Start a semantic-segmentation evaluation job.

        Parameters
        ----------
        datasets : Union[Dataset, List[Dataset]]
            The dataset or list of datasets to evaluate against.
        filters : Filter, optional
            Optional set of constraints to filter evaluation by.
        label_map : Dict[Label, Label], optional
            Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
        metrics_to_return : List[MetricType], optional
            The list of metrics to compute, store, and return to the user.
        allow_retries : bool, default = False
            Option to retry previously failed evaluations.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.

        Returns
        -------
        Evaluation
            A job object that can be used to track the status of the job and get the metrics of it upon completion.

        Raises
        ------
        ValueError
            If `metrics_to_return` contains a metric that is not a semantic-segmentation metric.
        TypeError
            If `label_map` is malformed.
        RuntimeError
            If the client does not return exactly one evaluation.
        """
        if metrics_to_return and not set(metrics_to_return).issubset(
            MetricType.semantic_segmentation()
        ):
            raise ValueError(
                f"The following metrics are not supported for semantic segmentation: '{set(metrics_to_return) - MetricType.semantic_segmentation()}'"
            )

        request = self._build_evaluation_request(
            datasets,
            filters,
            EvaluationParameters(
                task_type=TaskType.SEMANTIC_SEGMENTATION,
                label_map=self._create_label_map(label_map=label_map),
                metrics_to_return=metrics_to_return,
            ),
        )
        return self._submit_evaluation(
            request, allow_retries=allow_retries, timeout=timeout
        )

    def evaluate_text_generation(
        self,
        datasets: Union[Dataset, List[Dataset]],
        metrics_to_return: List[MetricType],
        filters: Optional[Filter] = None,
        llm_api_params: Optional[Dict[str, Union[str, dict]]] = None,
        metric_params: Optional[Dict[MetricType, Dict[str, Any]]] = None,
    ) -> Evaluation:
        """
        Start a text generation evaluation job.

        Parameters
        ----------
        datasets : Union[Dataset, List[Dataset]]
            The dataset or list of datasets to evaluate against.
        metrics_to_return : List[MetricType]
            The list of metrics to compute, store, and return to the user. This is not optional for text generation evaluations.
        filters : Filter, optional
            Optional set of constraints to filter evaluation by.
        llm_api_params : Dict[str, Union[str,dict]], optional
            A dictionary of parameters for the LLM API. When it has no 'api_key' entry, it must name a 'client' ('openai', 'mistral' or 'mock'); the key is then read from the OPENAI_API_KEY or MISTRAL_API_KEY environment variable. The supplied dictionary is not modified.
        metric_params : Dict[MetricType, Dict[str,Any]], optional
            A dictionary of parameters for the metrics used in the evaluation. The keys should be the metrics and the values should be dictionaries of parameters for those metrics.

        Returns
        -------
        Evaluation
            A job object that can be used to track the status of the job and get the metrics of it upon completion.

        Raises
        ------
        ValueError
            If `metrics_to_return` contains a metric that is not a text generation metric, if `llm_api_params` lacks both 'api_key' and 'client' or names an unsupported client, or if `metric_params` refers to a metric missing from `metrics_to_return`.
        RuntimeError
            If the client does not return exactly one evaluation.
        """
        if not set(metrics_to_return).issubset(MetricType.text_generation()):
            raise ValueError(
                f"The following metrics are not supported for text generation: '{set(metrics_to_return) - MetricType.text_generation()}'"
            )

        # If no api_key is provided, check the environment variables for an api key.
        llm_api_params = self._resolve_llm_api_params(llm_api_params)

        bleu_weights = None
        rouge_types = None
        rouge_use_stemmer = None
        if metric_params is not None:
            if not all(
                metric in metrics_to_return for metric in metric_params
            ):
                raise ValueError(
                    "All metrics in metric_params must be in metrics_to_return."
                )

            if MetricType.BLEU in metric_params:
                bleu_weights = metric_params[MetricType.BLEU].get("weights")

            if MetricType.ROUGE in metric_params:
                rouge_params = metric_params[MetricType.ROUGE]
                rouge_types = rouge_params.get("rouge_types")
                rouge_use_stemmer = rouge_params.get("use_stemmer")

        request = self._build_evaluation_request(
            datasets,
            filters,
            EvaluationParameters(
                task_type=TaskType.TEXT_GENERATION,
                metrics_to_return=metrics_to_return,
                llm_api_params=llm_api_params,
                bleu_weights=bleu_weights,
                rouge_types=rouge_types,
                rouge_use_stemmer=rouge_use_stemmer,
            ),
        )
        return self._submit_evaluation(request)

    def delete(self, timeout: int = 0):
        """
        Delete the `Model` object from the back end.

        Parameters
        ----------
        timeout : int, default=0
            Sets a timeout in seconds.
        """
        Client(self.conn).delete_model(self.name, timeout)  # type: ignore

    def get_labels(
        self,
    ) -> List[Label]:
        """
        Get all labels associated with a given model.

        Returns
        -------
        List[Label]
            A list of `Labels` associated with the model.
        """
        return Client(self.conn).get_labels_from_model(self)

    def get_evaluations(
        self,
        metrics_to_sort_by: Optional[
            Dict[str, Union[Dict[str, str], str]]
        ] = None,
        *_,
        timeout: Optional[float] = None,
    ) -> List[Evaluation]:
        """
        Get all evaluations associated with a given model.

        Parameters
        ----------
        metrics_to_sort_by : dict[str, str | dict[str, str]], optional
            An optional dict of metric types to sort the evaluations by.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.

        Returns
        -------
        List[Evaluation]
            A list of `Evaluations` associated with the model.
        """
        return Client(self.conn).get_evaluations(
            models=[self],
            metrics_to_sort_by=metrics_to_sort_by,
            timeout=timeout,
        )

Functions

`valor.Model.__init__(*, name, metadata=None, connection=None)`

Creates a local instance of a model.

Use 'Model.create' classmethod to create a model with persistence.

Parameters:

Name	Type	Description	Default
`name`	`String`	The name of the model.	required
`metadata`	`Dictionary`	A dictionary of metadata that describes the model.	`None`
`connection`	`ClientConnection`	An initialized client connection.	`None`

Source code in valor/coretypes.py

def __init__(
    self,
    *,
    name: str,
    metadata: Optional[dict] = None,
    connection: Optional[ClientConnection] = None,
):
    """
    Instantiate a model locally.

    Persisting the model requires the 'Model.create' classmethod instead.

    Parameters
    ----------
    name : str
        The name of the model.
    metadata : dict, optional
        A dictionary of metadata that describes the model.
    connection : ClientConnection, optional
        An initialized client connection.
    """
    self.conn = connection
    # An omitted or empty metadata argument becomes a fresh dictionary.
    resolved_metadata = metadata or dict()
    super().__init__(name=name, metadata=resolved_metadata)

`valor.Model.add_prediction(dataset, prediction)`

Add a prediction to the model.

Parameters:

Name	Type	Description	Default
`dataset`	`Dataset`	The dataset that is being operated over.	required
`prediction`	`Prediction`	The prediction to create.	required

Source code in valor/coretypes.py

def add_prediction(
    self,
    dataset: Dataset,
    prediction: Prediction,
) -> None:
    """
    Attach a single prediction to this model.

    Parameters
    ----------
    dataset : valor.Dataset
        The dataset that is being operated over.
    prediction : valor.Prediction
        The prediction to create.
    """
    client = Client(self.conn)
    # The client call takes a list, so the lone prediction is wrapped in one.
    wrapped = [prediction]
    client.create_predictions(dataset=dataset, model=self, predictions=wrapped)

`valor.Model.add_predictions(dataset, predictions, timeout=10.0)`

Add multiple predictions to the model.

Parameters:

Name	Type	Description	Default
`dataset`	`Dataset`	The dataset that is being operated over.	required
`predictions`	`List[Prediction]`	The predictions to create.	required
`timeout`	`float`	The number of seconds the client should wait until raising a timeout.	`10.0`

Source code in valor/coretypes.py

def add_predictions(
    self,
    dataset: Dataset,
    predictions: List[Prediction],
    timeout: Optional[float] = 10.0,
) -> None:
    """
    Attach several predictions to this model in one call.

    Parameters
    ----------
    dataset : valor.Dataset
        The dataset that is being operated over.
    predictions : List[valor.Prediction]
        The predictions to create.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.
    """
    client = Client(self.conn)
    client.create_predictions(
        dataset=dataset, model=self, predictions=predictions, timeout=timeout
    )

`valor.Model.create(name, metadata=None, connection=None, **_)` `classmethod`

Creates a model that persists in the back end.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the model.	required
`metadata`	`dict`	A dictionary of metadata that describes the model.	`None`
`connection`	`ClientConnection`	An initialized client connection.	`None`

Source code in valor/coretypes.py

@classmethod
def create(
    cls,
    name: str,
    metadata: Optional[Dict[str, Any]] = None,
    connection: Optional[ClientConnection] = None,
    **_,
) -> Model:
    """
    Build a model and register it with the back end so that it persists.

    Parameters
    ----------
    name : str
        The name of the model.
    metadata : dict, optional
        A dictionary of metadata that describes the model.
    connection : ClientConnection, optional
        An initialized client connection.

    Returns
    -------
    Model
        The newly created model.
    """
    # Build the local instance first, then hand it to the client.
    instance = cls(name=name, metadata=metadata, connection=connection)
    client = Client(connection)
    client.create_model(instance)
    return instance

`valor.Model.delete(timeout=0)`

Delete the Model object from the back end.

Parameters:

Name	Type	Description	Default
`timeout`	`int`	Sets a timeout in seconds.	`0`

Source code in valor/coretypes.py

def delete(self, timeout: int = 0):
    """
    Remove this `Model` from the back end.

    Parameters
    ----------
    timeout : int, default=0
        Sets a timeout in seconds.
    """
    client = Client(self.conn)
    client.delete_model(self.name, timeout)  # type: ignore

`valor.Model.evaluate_classification(datasets, filters=None, label_map=None, pr_curve_max_examples=1, metrics_to_return=None, *_, allow_retries=False, timeout=None)`

Start a classification evaluation job.

Parameters:

Name	Type	Description	Default
`datasets`	`Union[Dataset, List[Dataset]]`	The dataset or list of datasets to evaluate against.	required
`filters`	`Filter`	Optional set of constraints to filter evaluation by.	`None`
`label_map`	`Dict[Label, Label]`	Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.	`None`
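`pr_curve_max_examples`	`int`	The maximum number of datum examples to store when calculating PR curves.	`1`
`metrics_to_return`	`List[MetricType]`	The list of metrics to compute, store, and return to the user.	`None`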
`allow_retries`	`bool`	Option to retry previously failed evaluations.	`False`
`timeout`	`float`	The number of seconds the client should wait until raising a timeout.	`None`

Returns:

Type	Description
`Evaluation`	A job object that can be used to track the status of the job and get the metrics of it upon completion.

Source code in valor/coretypes.py

def evaluate_classification(
    self,
    datasets: Union[Dataset, List[Dataset]],
    filters: Optional[Filter] = None,
    label_map: Optional[Dict[Label, Label]] = None,
    pr_curve_max_examples: int = 1,
    metrics_to_return: Optional[List[MetricType]] = None,
    *_,
    allow_retries: bool = False,
    timeout: Optional[float] = None,
) -> Evaluation:
    """
    Start a classification evaluation job.

    Parameters
    ----------
    datasets : Union[Dataset, List[Dataset]]
        The dataset or list of datasets to evaluate against.
    filters : Filter, optional
        Optional set of constraints to filter evaluation by.
    label_map : Dict[Label, Label], optional
        Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
    pr_curve_max_examples : int, default=1
        The maximum number of datum examples to store when calculating PR curves.
    metrics_to_return : List[MetricType], optional
        The list of metrics to compute, store, and return to the user.
    allow_retries : bool, default = False
        Option to retry previously failed evaluations.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.

    Returns
    -------
    Evaluation
        A job object that can be used to track the status of the job and get the metrics of it upon completion.

    Raises
    ------
    ValueError
        If `metrics_to_return` contains a metric that is not a classification metric.
    RuntimeError
        If the client does not return exactly one evaluation.
    """
    if metrics_to_return and not set(metrics_to_return).issubset(
        MetricType.classification()
    ):
        raise ValueError(
            f"The following metrics are not supported for classification: '{set(metrics_to_return) - MetricType.classification()}'"
        )

    # format request
    datasets = datasets if isinstance(datasets, list) else [datasets]
    filters = filters if filters else Filter()
    request = EvaluationRequest(
        dataset_names=[dataset.name for dataset in datasets],  # type: ignore - issue #604
        model_names=[self.name],  # type: ignore - issue #604
        filters=filters,
        parameters=EvaluationParameters(
            task_type=TaskType.CLASSIFICATION,
            label_map=self._create_label_map(label_map=label_map),
            pr_curve_max_examples=pr_curve_max_examples,
            metrics_to_return=metrics_to_return,
        ),
    )

    # create evaluation
    evaluation = Client(self.conn).evaluate(
        request, allow_retries=allow_retries, timeout=timeout
    )
    # The request names exactly one model, so one evaluation is expected.
    if len(evaluation) != 1:
        raise RuntimeError(
            f"Expected exactly one evaluation for the request but received {len(evaluation)}."
        )
    return evaluation[0]

`valor.Model.evaluate_detection(datasets, filters=None, convert_annotations_to_type=None, iou_thresholds_to_compute=None, iou_thresholds_to_return=None, label_map=None, recall_score_threshold=0, metrics_to_return=None, pr_curve_iou_threshold=0.5, pr_curve_max_examples=1, *_, allow_retries=False, timeout=None)`

Start an object-detection evaluation job.

Parameters:

Name	Type	Description	Default
`datasets`	`Union[Dataset, List[Dataset]]`	The dataset or list of datasets to evaluate against.	required
`filters`	`Filter`	Optional set of constraints to filter evaluation by.	`None`
`convert_annotations_to_type`	`AnnotationType`	Forces the object detection evaluation to compute over this type.	`None`
`iou_thresholds_to_compute`	`List[float]`	Thresholds to compute mAP against.	`None`
`iou_thresholds_to_return`	`List[float]`	Thresholds to return AP for. Must be subset of `iou_thresholds_to_compute`.	`None`
`label_map`	`Dict[Label, Label]`	Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.	`None`
`recall_score_threshold`	`float`	The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.	`0`
`metrics_to_return`	`List[MetricType]`	The list of metrics to compute, store, and return to the user.	`None`
`pr_curve_iou_threshold`	`float`	The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5.	`0.5`
`pr_curve_max_examples`	`int`	The maximum number of datum examples to store when calculating PR curves.	`1`
`allow_retries`	`bool`	Option to retry previously failed evaluations.	`False`
`timeout`	`float`	The number of seconds the client should wait until raising a timeout.	`None`

Returns:

Type	Description
`Evaluation`	A job object that can be used to track the status of the job and get the metrics of it upon completion.

Source code in valor/coretypes.py

def evaluate_detection(
    self,
    datasets: Union[Dataset, List[Dataset]],
    filters: Optional[Filter] = None,
    convert_annotations_to_type: Optional[AnnotationType] = None,
    iou_thresholds_to_compute: Optional[List[float]] = None,
    iou_thresholds_to_return: Optional[List[float]] = None,
    label_map: Optional[Dict[Label, Label]] = None,
    recall_score_threshold: float = 0,
    metrics_to_return: Optional[List[MetricType]] = None,
    pr_curve_iou_threshold: float = 0.5,
    pr_curve_max_examples: int = 1,
    *_,
    allow_retries: bool = False,
    timeout: Optional[float] = None,
) -> Evaluation:
    """
    Launch an evaluation job for the object detection task.

    Parameters
    ----------
    datasets : Union[Dataset, List[Dataset]]
        A single dataset, or a list of datasets, to evaluate the model against.
    filters : Filter, optional
        Constraints that narrow down what is evaluated. An empty `Filter` is used when omitted.
    convert_annotations_to_type : enums.AnnotationType, optional
        Forces the object detection evaluation to compute over this annotation type.
    iou_thresholds_to_compute : List[float], optional
        IOU thresholds to compute mAP against. When omitted, 0.5 through 0.95 in steps of 0.05 are used.
    iou_thresholds_to_return : List[float], optional
        IOU thresholds to return AP for. Must be a subset of `iou_thresholds_to_compute`. When omitted, `[0.5, 0.75]` is used.
    label_map : Dict[Label, Label], optional
        Mapping of individual labels to a grouper label. Useful when labels differ across datasets and models.
    recall_score_threshold : float, default=0
        Confidence score threshold used to decide whether a prediction counts as a true positive while calculating Average Recall.
    metrics_to_return : List[MetricType], optional
        The metrics to compute, store, and return to the user.
    pr_curve_iou_threshold : float, default=0.5
        The IOU threshold to use when calculating precision-recall curves.
    pr_curve_max_examples : int, default=1
        The maximum number of datum examples to store when calculating PR curves.
    allow_retries : bool, default=False
        Option to retry previously failed evaluations.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.

    Returns
    -------
    Evaluation
        A job object that can be used to track the status of the job and get its metrics upon completion.

    Raises
    ------
    ValueError
        If `metrics_to_return` contains a metric outside `MetricType.object_detection()`.
    RuntimeError
        If the client does not return exactly one evaluation.
    """
    # Validate the requested metrics before building the request.
    if metrics_to_return:
        supported_metrics = MetricType.object_detection()
        if not set(metrics_to_return).issubset(supported_metrics):
            raise ValueError(
                f"The following metrics are not supported for object detection: '{set(metrics_to_return) - supported_metrics}'"
            )

    # Fill in the default IOU thresholds when the caller supplied none.
    if iou_thresholds_to_compute is None:
        iou_thresholds_to_compute = [
            round(0.5 + 0.05 * step, 2) for step in range(10)
        ]
    if iou_thresholds_to_return is None:
        iou_thresholds_to_return = [0.5, 0.75]

    evaluation_parameters = EvaluationParameters(
        task_type=TaskType.OBJECT_DETECTION,
        convert_annotations_to_type=convert_annotations_to_type,
        iou_thresholds_to_compute=iou_thresholds_to_compute,
        iou_thresholds_to_return=iou_thresholds_to_return,
        label_map=self._create_label_map(label_map=label_map),
        recall_score_threshold=recall_score_threshold,
        metrics_to_return=metrics_to_return,
        pr_curve_iou_threshold=pr_curve_iou_threshold,
        pr_curve_max_examples=pr_curve_max_examples,
    )

    # A lone dataset is wrapped so the request always carries a list of names.
    dataset_list = datasets if isinstance(datasets, list) else [datasets]
    active_filters = filters or Filter()
    request = EvaluationRequest(
        dataset_names=[ds.name for ds in dataset_list],  # type: ignore - issue #604
        model_names=[self.name],  # type: ignore - issue #604
        filters=active_filters,
        parameters=evaluation_parameters,
    )

    # Submit the request; exactly one evaluation is expected back.
    results = Client(self.conn).evaluate(
        request, allow_retries=allow_retries, timeout=timeout
    )
    if len(results) == 1:
        return results[0]
    raise RuntimeError

`valor.Model.evaluate_segmentation(datasets, filters=None, label_map=None, metrics_to_return=None, *_, allow_retries=False, timeout=None)`

Start a semantic-segmentation evaluation job.

Parameters:

Name	Type	Description	Default
`datasets`	`Union[Dataset, List[Dataset]]`	The dataset or list of datasets to evaluate against.	required
`filters`	`Filter`	Optional set of constraints to filter evaluation by.	`None`
`label_map`	`Dict[Label, Label]`	Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.	`None`
`metrics_to_return`	`List[MetricType]`	The list of metrics to compute, store, and return to the user.	`None`
`allow_retries`	`bool`	Option to retry previously failed evaluations.	`False`
`timeout`	`float`	The number of seconds the client should wait until raising a timeout.	`None`

Returns:

Type	Description
`Evaluation`	A job object that can be used to track the status of the job and get the metrics of it upon completion

Source code in valor/coretypes.py

def evaluate_segmentation(
    self,
    datasets: Union[Dataset, List[Dataset]],
    filters: Optional[Filter] = None,
    label_map: Optional[Dict[Label, Label]] = None,
    metrics_to_return: Optional[List[MetricType]] = None,
    *_,
    allow_retries: bool = False,
    timeout: Optional[float] = None,
) -> Evaluation:
    """
    Launch an evaluation job for the semantic segmentation task.

    Parameters
    ----------
    datasets : Union[Dataset, List[Dataset]]
        A single dataset, or a list of datasets, to evaluate the model against.
    filters : Filter, optional
        Constraints that narrow down what is evaluated. An empty `Filter` is used when omitted.
    label_map : Dict[Label, Label], optional
        Mapping of individual labels to a grouper label. Useful when labels differ across datasets and models.
    metrics_to_return : List[MetricType], optional
        The metrics to compute, store, and return to the user.
    allow_retries : bool, default=False
        Option to retry previously failed evaluations.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.

    Returns
    -------
    Evaluation
        A job object that can be used to track the status of the job and get its metrics upon completion.

    Raises
    ------
    ValueError
        If `metrics_to_return` contains a metric outside `MetricType.semantic_segmentation()`.
    RuntimeError
        If the client does not return exactly one evaluation.
    """
    # Validate the requested metrics before building the request.
    if metrics_to_return:
        supported_metrics = MetricType.semantic_segmentation()
        if not set(metrics_to_return).issubset(supported_metrics):
            raise ValueError(
                f"The following metrics are not supported for semantic segmentation: '{set(metrics_to_return) - supported_metrics}'"
            )

    # A lone dataset is wrapped so the request always carries a list of names.
    dataset_list = datasets if isinstance(datasets, list) else [datasets]
    active_filters = filters or Filter()
    request = EvaluationRequest(
        dataset_names=[ds.name for ds in dataset_list],  # type: ignore - issue #604
        model_names=[self.name],  # type: ignore - issue #604
        filters=active_filters,
        parameters=EvaluationParameters(
            task_type=TaskType.SEMANTIC_SEGMENTATION,
            label_map=self._create_label_map(label_map=label_map),
            metrics_to_return=metrics_to_return,
        ),
    )

    # Submit the request; exactly one evaluation is expected back.
    results = Client(self.conn).evaluate(
        request, allow_retries=allow_retries, timeout=timeout
    )
    if len(results) == 1:
        return results[0]
    raise RuntimeError

`valor.Model.evaluate_text_generation(datasets, metrics_to_return, filters=None, llm_api_params=None, metric_params=None)`

Start a text generation evaluation job.

Parameters:

Name	Type	Description	Default
`datasets`	`Union[Dataset, List[Dataset]]`	The dataset or list of datasets to evaluate against.	required
`metrics_to_return`	`List[MetricType]`	The list of metrics to compute, store, and return to the user. This is not optional for text generation evaluations.	required
`filters`	`Filter`	Optional set of constraints to filter evaluation by.	`None`
`llm_api_params`	`Dict[str, Union[str, dict]]`	A dictionary of parameters for the LLM API.	`None`
`metric_params`	`Dict[MetricType, Dict[str, Any]]`	A dictionary of parameters for the metrics used in the evaluation. The keys should be the metrics and the values should be dictionaries of parameters for those metrics.	`None`

Returns:

Type	Description
`Evaluation`	A job object that can be used to track the status of the job and get the metrics of it upon completion.

Source code in valor/coretypes.py

def evaluate_text_generation(
    self,
    datasets: Union[Dataset, List[Dataset]],
    metrics_to_return: List[MetricType],
    filters: Optional[Filter] = None,
    llm_api_params: Optional[Dict[str, Union[str, dict]]] = None,
    metric_params: Optional[Dict[MetricType, Dict[str, Any]]] = None,
) -> Evaluation:
    """
    Start a text generation evaluation job.

    Parameters
    ----------
    datasets : Union[Dataset, List[Dataset]]
        The dataset or list of datasets to evaluate against.
    metrics_to_return : List[MetricType]
        The list of metrics to compute, store, and return to the user. This is not optional for text generation evaluations.
    filters : Filter, optional
        Optional set of constraints to filter evaluation by. An empty `Filter` is used when omitted.
    llm_api_params : Dict[str, Union[str,dict]], optional
        A dictionary of parameters for the LLM API. If it has no "api_key" entry, it must have a "client" entry
        ("openai", "mistral" or "mock"); the key is then read from the `OPENAI_API_KEY` or `MISTRAL_API_KEY`
        environment variable (an empty key is used for "mock"). The dictionary passed in is never modified.
    metric_params : Dict[MetricType, Dict[str,Any]], optional
        A dictionary of parameters for the metrics used in the evaluation. The keys should be the metrics and the values should be dictionaries of parameters for those metrics.
        Every key must also appear in `metrics_to_return`. The "weights" entry is read for BLEU, and the
        "rouge_types" and "use_stemmer" entries are read for ROUGE.

    Returns
    -------
    Evaluation
        A job object that can be used to track the status of the job and get the metrics of it upon completion.

    Raises
    ------
    ValueError
        If a requested metric is not supported for text generation, if `llm_api_params` has neither an
        "api_key" nor a "client" entry, if the client is not supported, or if `metric_params` names a
        metric that is not in `metrics_to_return`.
    RuntimeError
        If the client does not return exactly one evaluation.
    """
    supported_metrics = MetricType.text_generation()
    if not set(metrics_to_return).issubset(supported_metrics):
        raise ValueError(
            f"The following metrics are not supported for text generation: '{set(metrics_to_return) - supported_metrics}'"
        )

    # If no api_key is provided, check the environment variables for an api key.
    if llm_api_params is not None and "api_key" not in llm_api_params:
        if "client" not in llm_api_params:
            raise ValueError(
                "The client must be specified in the llm_api_params."
            )

        llm_client = llm_api_params["client"]
        if llm_client == "openai":
            api_key = os.getenv("OPENAI_API_KEY", None)
        elif llm_client == "mistral":
            api_key = os.getenv("MISTRAL_API_KEY", None)
        elif llm_client == "mock":
            api_key = ""
        else:
            raise ValueError(
                "The client specified in llm_api_params is not supported."
            )

        if api_key is not None:
            # Build a new dict rather than inserting into the caller's one, so
            # the secret does not leak back into caller-owned state.
            llm_api_params = {**llm_api_params, "api_key": api_key}

    bleu_weights = None
    rouge_types = None
    rouge_use_stemmer = None
    if metric_params is not None:
        if not all(metric in metrics_to_return for metric in metric_params):
            raise ValueError(
                "All metrics in metric_params must be in metrics_to_return."
            )

        if MetricType.BLEU in metric_params:
            bleu_weights = metric_params[MetricType.BLEU].get("weights")

        if MetricType.ROUGE in metric_params:
            rouge_params = metric_params[MetricType.ROUGE]
            rouge_types = rouge_params.get("rouge_types")
            rouge_use_stemmer = rouge_params.get("use_stemmer")

    # format request
    datasets = datasets if isinstance(datasets, list) else [datasets]
    filters = filters if filters else Filter()
    request = EvaluationRequest(
        dataset_names=[dataset.name for dataset in datasets],  # type: ignore - issue #604
        model_names=[self.name],  # type: ignore - issue #604
        filters=filters,
        parameters=EvaluationParameters(
            task_type=TaskType.TEXT_GENERATION,
            metrics_to_return=metrics_to_return,
            llm_api_params=llm_api_params,
            bleu_weights=bleu_weights,
            rouge_types=rouge_types,
            rouge_use_stemmer=rouge_use_stemmer,
        ),
    )

    # create evaluation
    evaluation = Client(self.conn).evaluate(request)
    if len(evaluation) != 1:
        raise RuntimeError(
            f"Expected exactly one evaluation to be returned, but received {len(evaluation)}."
        )
    return evaluation[0]

`valor.Model.finalize_inferences(dataset)`

Finalizes the model over a dataset such that new predictions cannot be added to it.

Source code in valor/coretypes.py

def finalize_inferences(self, dataset: Union[Dataset, str]) -> None:
    """
    Finalizes the model over a dataset such that new predictions cannot be added to it.

    Parameters
    ----------
    dataset : Union[Dataset, str]
        The dataset to finalize this model's inferences over.
    """
    # The call is delegated to a client built on this model's connection.
    client = Client(self.conn)
    return client.finalize_inferences(model=self, dataset=dataset)

`valor.Model.get(name, connection=None)` `classmethod`

Retrieves a model from the back end database.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the model.	required
`connection`	`ClientConnection`	An optional Valor client object for interacting with the API.	`None`

Returns:

Type	Description
`Union[Model, None]`	The model or 'None' if it doesn't exist.

Source code in valor/coretypes.py

@classmethod
def get(
    cls,
    name: str,
    connection: Optional[ClientConnection] = None,
) -> Union[Model, None]:
    """
    Fetch a model by name from the back end database.

    Parameters
    ----------
    name : str
        The name of the model.
    connection : ClientConnection, optional
        An optional Valor client connection for interacting with the API.

    Returns
    -------
    Union[valor.Model, None]
        The model or 'None' if it doesn't exist.
    """
    # The lookup is delegated to a client built on the given connection.
    client = Client(connection)
    return client.get_model(name)

`valor.Model.get_evaluations(metrics_to_sort_by=None, *_, timeout=None)`

Get all evaluations associated with a given model.

Parameters:

Name	Type	Description	Default
`metrics_to_sort_by`	`dict[str, str \| dict[str, str]]`	An optional dict of metric types to sort the evaluations by.	`None`
`timeout`	`float`	The number of seconds the client should wait until raising a timeout.	`None`

Returns:

Type	Description
`List[Evaluation]`	A list of `Evaluations` associated with the model.

Source code in valor/coretypes.py

def get_evaluations(
    self,
    metrics_to_sort_by: Optional[
        Dict[str, Union[Dict[str, str], str]]
    ] = None,
    *_,
    timeout: Optional[float] = None,
) -> List[Evaluation]:
    """
    Retrieve every evaluation associated with this model.

    Parameters
    ----------
    metrics_to_sort_by : dict[str, str | dict[str, str]], optional
        An optional dict of metric types to sort the evaluations by.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.

    Returns
    -------
    List[Evaluation]
        A list of `Evaluations` associated with the model.
    """
    # The query is restricted to this model only.
    client = Client(self.conn)
    return client.get_evaluations(
        models=[self],
        timeout=timeout,
        metrics_to_sort_by=metrics_to_sort_by,
    )

`valor.Model.get_labels()`

Get all labels associated with a given model.

Returns:

Type	Description
`List[Label]`	A list of `Labels` associated with the model.

Source code in valor/coretypes.py

def get_labels(self) -> List[Label]:
    """
    Retrieve every label associated with this model.

    Returns
    -------
    List[Label]
        A list of `Labels` associated with the model.
    """
    # The lookup is delegated to a client built on this model's connection.
    client = Client(self.conn)
    return client.get_labels_from_model(self)

`valor.Model.get_prediction(dataset, datum)`

Get a particular prediction.

Parameters:

Name	Type	Description	Default
`dataset`	`Union[Dataset, str]`	The dataset the datum belongs to.	required
`datum`	`Union[Datum, str]`	The desired datum.	required

Returns:

Type	Description
`Union[Prediction, None]`	The matching prediction or 'None' if it doesn't exist.

Source code in valor/coretypes.py

def get_prediction(
    self,
    dataset: Union[Dataset, str],
    datum: Union[Datum, str],
) -> Union[Prediction, None]:
    """
    Retrieve this model's prediction for a single datum.

    Parameters
    ----------
    dataset : Union[Dataset, str]
        The dataset the datum belongs to.
    datum : Union[Datum, str]
        The desired datum.

    Returns
    -------
    Union[Prediction, None]
        The matching prediction or 'None' if it doesn't exist.
    """
    # The lookup is delegated to a client built on this model's connection.
    client = Client(self.conn)
    return client.get_prediction(model=self, dataset=dataset, datum=datum)

Model

Functions

valor.Model.__init__(*, name, metadata=None, connection=None)

valor.Model.add_prediction(dataset, prediction)

valor.Model.add_predictions(dataset, predictions, timeout=10.0)

valor.Model.create(name, metadata=None, connection=None, **_) classmethod

valor.Model.delete(timeout=0)

valor.Model.evaluate_classification(datasets, filters=None, label_map=None, pr_curve_max_examples=1, metrics_to_return=None, *_, allow_retries=False, timeout=None)

valor.Model.evaluate_segmentation(datasets, filters=None, label_map=None, metrics_to_return=None, *_, allow_retries=False, timeout=None)

valor.Model.evaluate_text_generation(datasets, metrics_to_return, filters=None, llm_api_params=None, metric_params=None)

valor.Model.finalize_inferences(dataset)

valor.Model.get(name, connection=None) classmethod

valor.Model.get_evaluations(metrics_to_sort_by=None, *_, timeout=None)

valor.Model.get_labels()

valor.Model.get_prediction(dataset, datum)

`valor.Model.__init__(*, name, metadata=None, connection=None)`

`valor.Model.add_prediction(dataset, prediction)`

`valor.Model.add_predictions(dataset, predictions, timeout=10.0)`

`valor.Model.create(name, metadata=None, connection=None, **_)` `classmethod`

`valor.Model.delete(timeout=0)`

`valor.Model.evaluate_classification(datasets, filters=None, label_map=None, pr_curve_max_examples=1, metrics_to_return=None, *_, allow_retries=False, timeout=None)`

`valor.Model.evaluate_segmentation(datasets, filters=None, label_map=None, metrics_to_return=None, *_, allow_retries=False, timeout=None)`

`valor.Model.evaluate_text_generation(datasets, metrics_to_return, filters=None, llm_api_params=None, metric_params=None)`

`valor.Model.finalize_inferences(dataset)`

`valor.Model.get(name, connection=None)` `classmethod`

`valor.Model.get_evaluations(metrics_to_sort_by=None, *_, timeout=None)`

`valor.Model.get_labels()`

`valor.Model.get_prediction(dataset, datum)`