
Evaluation

Wraps valor.client.Job to provide evaluation-specific members.

Source code in valor/coretypes.py
class Evaluation:
    """
    Wraps `valor.client.Job` to provide evaluation-specific members.
    """

    def __init__(
        self, connection: Optional[ClientConnection] = None, **kwargs
    ):
        """
        Defines important attributes of the API's `EvaluationResult`.

        Attributes
        ----------
        id : int
            The ID of the evaluation.
        dataset_names : list[str]
            The names of the datasets the model was evaluated over.
        model_name : str
            The name of the evaluated model.
        filters : dict
            The filter used to select data partitions for evaluation.
        status : EvaluationStatus
            The status of the evaluation.
        metrics : List[dict]
            A list of metric dictionaries returned by the job.
        confusion_matrices : List[dict]
            A list of confusion matrix dictionaries returned by the job.
        meta : dict[str, str | float | dict], optional
            A dictionary of metadata describing the evaluation run.
        """
        if not connection:
            connection = get_connection()
        self.conn = connection
        self.update(**kwargs)

    def update(
        self,
        *_,
        id: int,
        dataset_names: list[str],
        model_name: str,
        filters: dict,
        parameters: EvaluationParameters,
        status: EvaluationStatus,
        metrics: List[Dict],
        confusion_matrices: List[Dict],
        created_at: str,
        meta: dict[str, str | float | dict] | None,
        **kwargs,
    ):
        self.id = id
        self.dataset_names = dataset_names
        self.model_name = model_name
        self.filters = filters
        self.parameters = (
            EvaluationParameters(**parameters)
            if isinstance(parameters, dict)
            else parameters
        )
        self.status = EvaluationStatus(status)
        self.metrics = metrics
        self.meta = meta
        self.confusion_matrices = confusion_matrices
        self.kwargs = kwargs
        self.ignored_pred_labels: Optional[List[Label]] = None
        self.missing_pred_labels: Optional[List[Label]] = None
        self.created_at = datetime.datetime.strptime(
            created_at, "%Y-%m-%dT%H:%M:%S.%fZ"
        ).replace(tzinfo=datetime.timezone.utc)

        for k, v in kwargs.items():
            setattr(self, k, v)

    def poll(self) -> EvaluationStatus:
        """
        Poll the back end.

        Updates the evaluation with the latest state from the back end.

        Returns
        -------
        enums.EvaluationStatus
            The status of the evaluation.

        Raises
        ------
        EvaluationDoesNotExist
            If an Evaluation with the given `evaluation_id` is not found.
        """
        response = self.conn.get_evaluations(
            evaluation_ids=[self.id],
            models=None,
            datasets=None,
            metrics_to_sort_by=None,
        )
        if not response:
            raise EvaluationDoesNotExist(self.id)
        self.update(**response[0])
        return self.status

    def wait_for_completion(
        self,
        *,
        timeout: Optional[int] = None,
        interval: float = 1.0,
    ) -> EvaluationStatus:
        """
        Blocking function that waits for evaluation to finish.

        Parameters
        ----------
        timeout : int, optional
            Length of timeout in seconds.
        interval : float, default=1.0
            Polling interval in seconds.
        """
        t_start = time.time()
        while self.poll() not in [
            EvaluationStatus.DONE,
            EvaluationStatus.FAILED,
        ]:
            time.sleep(interval)
            if timeout and time.time() - t_start > timeout:
                raise TimeoutError(
                    f"Evaluation {self.id} did not complete within {timeout} seconds."
                )
        return self.status

    def __str__(self) -> str:
        """Dumps the object into a JSON formatted string."""
        return json.dumps(self.to_dict(), indent=4)

    def to_dict(self) -> dict:
        """
        Defines how a `valor.Evaluation` object is serialized into a dictionary.

        Returns
        -------
        dict
            A dictionary describing an evaluation.
        """
        return {
            "id": self.id,
            "dataset_names": self.dataset_names,
            "model_name": self.model_name,
            "filters": self.filters,
            "parameters": asdict(self.parameters),
            "status": self.status.value,
            "metrics": self.metrics,
            "confusion_matrices": self.confusion_matrices,
            "meta": self.meta,
            **self.kwargs,
        }

    def to_dataframe(
        self,
        stratify_by: Optional[Tuple[str, str]] = None,
    ):
        """
        Get all metrics associated with a Model and return them in a `pd.DataFrame`.

        Returns
        ----------
        pd.DataFrame
            Evaluation metrics being displayed in a `pd.DataFrame`.

        Raises
        ------
        ModuleNotFoundError
            This function requires the use of `pandas.DataFrame`.

        """
        try:
            import pandas as pd
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Must have pandas installed to use `get_metric_dataframes`."
            )

        if not stratify_by:
            column_type = "evaluation"
            column_name = self.id
        else:
            column_type = stratify_by[0]
            column_name = stratify_by[1]

        metrics = [
            {**metric, column_type: column_name} for metric in self.metrics
        ]
        df = pd.DataFrame(metrics)
        for k in ["label", "parameters"]:
            df[k] = df[k].fillna("n/a")
        df["parameters"] = df["parameters"].apply(json.dumps)
        df["label"] = df["label"].apply(
            lambda x: f"{x['key']}: {x['value']}" if x != "n/a" else x
        )
        df = df.pivot(
            index=["type", "parameters", "label"], columns=[column_type]
        )
        return df
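
For orientation, a minimal usage sketch, assuming `evaluation` is an `Evaluation` instance already returned by a valor client call (for example, a model-evaluation method); only members shown in the source above are used:

# `evaluation` is assumed to come from a prior valor client call.
status = evaluation.wait_for_completion(timeout=60)  # block until DONE or FAILED
print(evaluation.status)       # e.g. EvaluationStatus.DONE
print(evaluation.metrics[:3])  # metric dictionaries returned by the job
print(evaluation)              # pretty-printed JSON via __str__ / to_dict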

Functions

valor.Evaluation.__init__(connection=None, **kwargs)

Defines important attributes of the API's EvaluationResult.

Attributes:

id : int
    The ID of the evaluation.
dataset_names : list[str]
    The names of the datasets the model was evaluated over.
model_name : str
    The name of the evaluated model.
filters : dict
    The filter used to select data partitions for evaluation.
status : EvaluationStatus
    The status of the evaluation.
metrics : List[dict]
    A list of metric dictionaries returned by the job.
confusion_matrices : List[dict]
    A list of confusion matrix dictionaries returned by the job.
meta : dict[str, str | float | dict], optional
    A dictionary of metadata describing the evaluation run.

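Because `__init__` forwards its keyword arguments to `update`, an `Evaluation` can be rebuilt from a raw API payload. A minimal sketch, assuming `conn` is an existing `ClientConnection` and `payload` is one element of a `get_evaluations` response (the same shape `poll` consumes):

from valor.coretypes import Evaluation  # module path taken from the source listing above

# `payload` must supply id, dataset_names, model_name, filters, parameters,
# status, metrics, confusion_matrices, created_at, and meta.
evaluation = Evaluation(connection=conn, **payload)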

valor.Evaluation.__str__()

Dumps the object into a JSON-formatted string.

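Since `__str__` delegates to `to_dict`, printing an instance is equivalent to dumping its dictionary form (assuming `evaluation` is an existing instance):

import json

# Both lines print the same indented JSON document.
print(evaluation)
print(json.dumps(evaluation.to_dict(), indent=4))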

valor.Evaluation.poll()

Poll the back end.

Updates the evaluation with the latest state from the back end.

Returns:

EvaluationStatus
    The status of the evaluation.

Raises:

EvaluationDoesNotExist
    If an Evaluation with the given evaluation_id is not found.

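A sketch of a manual polling loop built on `poll`; it assumes `EvaluationStatus` is importable from `valor.enums`, as the docstring's `enums.EvaluationStatus` reference suggests:

import time

from valor.enums import EvaluationStatus  # import path inferred from the docstring

# Re-poll until the back end reports a terminal state.
while evaluation.poll() not in (EvaluationStatus.DONE, EvaluationStatus.FAILED):
    time.sleep(2.0)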

valor.Evaluation.to_dataframe(stratify_by=None)

Return all metrics from this evaluation as a pd.DataFrame.

Parameters:

stratify_by : Tuple[str, str], optional
    An optional (column, value) pair used to label the metric columns; defaults to ("evaluation", self.id).

Returns:

pd.DataFrame
    Evaluation metrics displayed in a pd.DataFrame.

Raises:

ModuleNotFoundError
    If pandas is not installed.

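Two call patterns, assuming `evaluation` holds finished metrics and pandas is installed:

# Default: metric columns are grouped under an "evaluation" column keyed by id.
df = evaluation.to_dataframe()

# Stratified: supply a custom (column, value) pair instead, which is useful
# when concatenating frames from several evaluations side by side.
df = evaluation.to_dataframe(stratify_by=("model", evaluation.model_name))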

valor.Evaluation.to_dict()

Defines how a valor.Evaluation object is serialized into a dictionary.

Returns:

dict
    A dictionary describing an evaluation.

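Every value in the returned dictionary is already JSON-serializable (`status` is stored as its string value and `parameters` is converted via `asdict`), so the result can be persisted directly. A small sketch:

import json

# Write the evaluation results to disk for later inspection.
with open("evaluation.json", "w") as f:
    json.dump(evaluation.to_dict(), f, indent=4)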

valor.Evaluation.wait_for_completion(*, timeout=None, interval=1.0)

Blocking function that waits for the evaluation to finish.

Parameters:

timeout : int, optional
    Length of timeout in seconds. Defaults to None (wait indefinitely).
interval : float, default=1.0
    Polling interval in seconds.

Returns:

EvaluationStatus
    The status of the evaluation at completion.

Raises:

TimeoutError
    If the evaluation does not finish within timeout seconds.
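A usage sketch with timeout handling; `EvaluationStatus` is again assumed importable from `valor.enums`:

from valor.enums import EvaluationStatus  # import path inferred from the docstring

try:
    # Poll every half second, but give up after thirty seconds.
    status = evaluation.wait_for_completion(timeout=30, interval=0.5)
except TimeoutError:
    status = evaluation.status  # last status observed before timing out

if status == EvaluationStatus.FAILED:
    raise RuntimeError(f"Evaluation {evaluation.id} failed.")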