Skip to content

Dataset

Bases: StaticCollection

A class describing a given dataset.

Attributes:

Name Type Description
name String

The name of the dataset.

metadata Dictionary

A dictionary of metadata that describes the dataset.

Examples:

>>> Dataset.create(name="dataset1")
>>> Dataset.create(name="dataset1", metadata={})
>>> Dataset.create(name="dataset1", metadata={"foo": "bar", "pi": 3.14})
Source code in valor/coretypes.py
class Dataset(StaticCollection):
    """
    Describes a dataset and exposes the client operations that act on it.

    Attributes
    ----------
    name : String
        The name of the dataset.
    metadata : Dictionary
        A dictionary of metadata that describes the dataset.

    Examples
    --------
    >>> Dataset.create(name="dataset1")
    >>> Dataset.create(name="dataset1", metadata={})
    >>> Dataset.create(name="dataset1", metadata={"foo": "bar", "pi": 3.14})
    """

    # Class-level symbolic attributes; `get_datums` compares `Dataset.name`
    # against a concrete name to build its dataset constraint.
    name: String = String.symbolic(owner="dataset", name="name")
    metadata: Dictionary = Dictionary.symbolic(
        owner="dataset", name="metadata"
    )

    def __init__(
        self,
        *,
        name: str,
        metadata: Optional[dict] = None,
        connection: Optional[ClientConnection] = None,
    ):
        """
        Build a local dataset object; nothing is sent to the back end.

        Call the 'Dataset.create' classmethod instead when the dataset
        should be persisted.

        Parameters
        ----------
        name : str
            The name of the dataset.
        metadata : dict, optional
            A dictionary of metadata that describes the dataset. A missing
            or empty value is replaced by a new empty dictionary.
        connection : ClientConnection, optional
            An initialized client connection.
        """
        # The connection is stored before the parent initializer runs.
        self.conn = connection
        super().__init__(name=name, metadata=metadata or {})

    @classmethod
    def create(
        cls,
        name: str,
        metadata: Optional[Dict[str, Any]] = None,
        connection: Optional[ClientConnection] = None,
    ) -> Dataset:
        """
        Build a dataset and register it with the back end.

        Parameters
        ----------
        name : str
            The name of the dataset.
        metadata : dict, optional
            A dictionary of metadata that describes the dataset.
        connection : ClientConnection, optional
            An initialized client connection.

        Returns
        -------
        Dataset
            The newly constructed dataset object.
        """
        created = cls(name=name, metadata=metadata, connection=connection)
        client = Client(created.conn)
        client.create_dataset(created)
        return created

    @classmethod
    def get(
        cls,
        name: str,
        connection: Optional[ClientConnection] = None,
    ) -> Union[Dataset, None]:
        """
        Look up a dataset by name in the back end database.

        Parameters
        ----------
        name : str
            The name of the dataset.
        connection : ClientConnection, optional
            The connection handed to the client that performs the lookup.

        Returns
        -------
        Union[valor.Dataset, None]
            The dataset or 'None' if it doesn't exist.
        """
        client = Client(connection)
        return client.get_dataset(name)

    def add_groundtruth(
        self,
        groundtruth: GroundTruth,
    ) -> None:
        """
        Add a single ground truth to the dataset.

        Parameters
        ----------
        groundtruth : GroundTruth
            The ground truth to create.
        """
        client = Client(self.conn)
        # The client call takes a list, so the single item is wrapped.
        client.create_groundtruths(dataset=self, groundtruths=[groundtruth])

    def add_groundtruths(
        self,
        groundtruths: List[GroundTruth],
        ignore_existing_datums: bool = False,
        timeout: Optional[float] = 10.0,
    ) -> None:
        """
        Add several ground truths to the dataset in one call.

        Parameters
        ----------
        groundtruths : List[GroundTruth]
            The ground truths to create.
        ignore_existing_datums : bool, default=False
            If True, will ignore datums that already exist in the backend.
            If False, will raise an error if any datums already exist.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.
        """
        client = Client(self.conn)
        client.create_groundtruths(
            dataset=self,
            groundtruths=groundtruths,
            ignore_existing_datums=ignore_existing_datums,
            timeout=timeout,
        )

    def get_groundtruth(
        self,
        datum: Union[Datum, str],
    ) -> Union[GroundTruth, None]:
        """
        Fetch the ground truth belonging to one datum of this dataset.

        Parameters
        ----------
        datum : Union[Datum, str]
            The desired datum.

        Returns
        -------
        Union[GroundTruth, None]
            The matching ground truth or 'None' if it doesn't exist.
        """
        client = Client(self.conn)
        return client.get_groundtruth(dataset=self, datum=datum)

    def get_labels(
        self,
    ) -> List[Label]:
        """
        Fetch every label associated with this dataset.

        Returns
        -------
        List[Label]
            A list of `Labels` associated with the dataset.
        """
        client = Client(self.conn)
        return client.get_labels_from_dataset(self)

    def get_datums(self, filters: Optional[Filter] = None) -> List[Datum]:
        """
        Fetch the datums associated with this dataset.

        Parameters
        ----------
        filters : Filter, optional
            An optional datum filter. Its `datasets` attribute is overwritten
            on the passed object so that it targets this dataset.

        Returns
        -------
        List[Datum]
            A list of `Datums` associated with the dataset.
        """
        datum_filter = Filter() if filters is None else filters
        # Restrict the query to this dataset, replacing any prior value.
        datum_filter.datasets = Dataset.name == self.name  # type: ignore - #issue 605
        return Client(self.conn).get_datums(filters=datum_filter)

    def get_evaluations(
        self,
        metrics_to_sort_by: Optional[
            Dict[str, Union[Dict[str, str], str]]
        ] = None,
        *_,
        timeout: Optional[float] = None,
    ) -> List[Evaluation]:
        """
        Fetch every evaluation associated with this dataset.

        Parameters
        ----------
        metrics_to_sort_by : dict[str, str | dict[str, str]], optional
            An optional dict of metric types to sort the evaluations by.
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.

        Returns
        -------
        List[Evaluation]
            A list of `Evaluations` associated with the dataset.
        """
        client = Client(self.conn)
        return client.get_evaluations(
            datasets=[self],
            metrics_to_sort_by=metrics_to_sort_by,
            timeout=timeout,
        )

    def get_summary(
        self, *_, timeout: Optional[float] = None
    ) -> DatasetSummary:
        """
        Fetch the summary of this dataset.

        Parameters
        ----------
        timeout : float, optional
            The number of seconds the client should wait until raising a timeout.

        Returns
        -------
        DatasetSummary
            The summary of the dataset. This class has the following fields:

            name : name of the dataset

            num_datums : total number of datums in the dataset

            num_annotations : total number of labeled annotations in the
            dataset; if an object (such as a bounding box) has multiple
            labels, then each label is counted separately

            num_bounding_boxes : total number of bounding boxes in the dataset

            num_polygons : total number of polygons in the dataset

            num_rasters : total number of rasters in the dataset

            labels : list of the unique labels in the dataset

            datum_metadata : list of the unique metadata dictionaries in the
            dataset that are associated with datums

            groundtruth_annotation_metadata : list of the unique metadata
            dictionaries in the dataset that are associated with annotations
        """
        client = Client(self.conn)
        return client.get_dataset_summary(self.name, timeout=timeout)  # type: ignore

    def finalize(
        self,
    ):
        """
        Finalize the dataset so that new ground truths cannot be added to it.

        Returns whatever the client's `finalize_dataset` call returns.
        """
        client = Client(self.conn)
        return client.finalize_dataset(self)

    def delete(
        self,
        timeout: int = 0,
    ):
        """
        Delete the dataset from the back end.

        Parameters
        ----------
        timeout : int, default=0
            Sets a timeout in seconds.
        """
        client = Client(self.conn)
        client.delete_dataset(self.name, timeout)  # type: ignore

Functions

valor.Dataset.__init__(*, name, metadata=None, connection=None)

Creates a local instance of a dataset.

Use 'Dataset.create' classmethod to create a dataset with persistence.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required
metadata dict

A dictionary of metadata that describes the dataset.

None
connection ClientConnection

An initialized client connection.

None
Source code in valor/coretypes.py
def __init__(
    self,
    *,
    name: str,
    metadata: Optional[dict] = None,
    connection: Optional[ClientConnection] = None,
):
    """
    Build a local dataset object; nothing is sent to the back end.

    Call the 'Dataset.create' classmethod instead when the dataset
    should be persisted.

    Parameters
    ----------
    name : str
        The name of the dataset.
    metadata : dict, optional
        A dictionary of metadata that describes the dataset. A missing
        or empty value is replaced by a new empty dictionary.
    connection : ClientConnection, optional
        An initialized client connection.
    """
    # The connection is stored before the parent initializer runs.
    self.conn = connection
    super().__init__(name=name, metadata=metadata or {})

valor.Dataset.add_groundtruth(groundtruth)

Add a ground truth to the dataset.

Parameters:

Name Type Description Default
groundtruth GroundTruth

The ground truth to create.

required
Source code in valor/coretypes.py
def add_groundtruth(
    self,
    groundtruth: GroundTruth,
) -> None:
    """
    Add a single ground truth to the dataset.

    Parameters
    ----------
    groundtruth : GroundTruth
        The ground truth to create.
    """
    client = Client(self.conn)
    # The client call takes a list, so the single item is wrapped.
    client.create_groundtruths(dataset=self, groundtruths=[groundtruth])

valor.Dataset.add_groundtruths(groundtruths, ignore_existing_datums=False, timeout=10.0)

Add multiple ground truths to the dataset.

Parameters:

Name Type Description Default
groundtruths List[GroundTruth]

The ground truths to create.

required
ignore_existing_datums bool

If True, will ignore datums that already exist in the backend. If False, will raise an error if any datums already exist. Default is False.

False
timeout float

The number of seconds the client should wait until raising a timeout.

10.0
Source code in valor/coretypes.py
def add_groundtruths(
    self,
    groundtruths: List[GroundTruth],
    ignore_existing_datums: bool = False,
    timeout: Optional[float] = 10.0,
) -> None:
    """
    Add several ground truths to the dataset in one call.

    Parameters
    ----------
    groundtruths : List[GroundTruth]
        The ground truths to create.
    ignore_existing_datums : bool, default=False
        If True, will ignore datums that already exist in the backend.
        If False, will raise an error if any datums already exist.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.
    """
    client = Client(self.conn)
    client.create_groundtruths(
        dataset=self,
        groundtruths=groundtruths,
        ignore_existing_datums=ignore_existing_datums,
        timeout=timeout,
    )

valor.Dataset.create(name, metadata=None, connection=None) classmethod

Creates a dataset that persists in the back end.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required
metadata dict

A dictionary of metadata that describes the dataset.

None
connection ClientConnection

An initialized client connection.

None
Source code in valor/coretypes.py
@classmethod
def create(
    cls,
    name: str,
    metadata: Optional[Dict[str, Any]] = None,
    connection: Optional[ClientConnection] = None,
) -> Dataset:
    """
    Build a dataset and register it with the back end.

    Parameters
    ----------
    name : str
        The name of the dataset.
    metadata : dict, optional
        A dictionary of metadata that describes the dataset.
    connection : ClientConnection, optional
        An initialized client connection.

    Returns
    -------
    Dataset
        The newly constructed dataset object.
    """
    created = cls(name=name, metadata=metadata, connection=connection)
    client = Client(created.conn)
    client.create_dataset(created)
    return created

valor.Dataset.delete(timeout=0)

Delete the dataset from the back end.

Parameters:

Name Type Description Default
timeout int

Sets a timeout in seconds.

0
Source code in valor/coretypes.py
def delete(
    self,
    timeout: int = 0,
):
    """
    Delete the dataset from the back end.

    Parameters
    ----------
    timeout : int, default=0
        Sets a timeout in seconds.
    """
    client = Client(self.conn)
    client.delete_dataset(self.name, timeout)  # type: ignore

valor.Dataset.finalize()

Finalizes the dataset such that new ground truths cannot be added to it.

Source code in valor/coretypes.py
def finalize(
    self,
):
    """
    Finalize the dataset so that new ground truths cannot be added to it.

    Returns whatever the client's `finalize_dataset` call returns.
    """
    client = Client(self.conn)
    return client.finalize_dataset(self)

valor.Dataset.get(name, connection=None) classmethod

Retrieves a dataset from the back end database.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required
connection ClientConnection

An initialized client connection.

None

Returns:

Type Description
Union[Dataset, None]

The dataset or 'None' if it doesn't exist.

Source code in valor/coretypes.py
@classmethod
def get(
    cls,
    name: str,
    connection: Optional[ClientConnection] = None,
) -> Union[Dataset, None]:
    """
    Look up a dataset by name in the back end database.

    Parameters
    ----------
    name : str
        The name of the dataset.
    connection : ClientConnection, optional
        The connection handed to the client that performs the lookup.

    Returns
    -------
    Union[valor.Dataset, None]
        The dataset or 'None' if it doesn't exist.
    """
    client = Client(connection)
    return client.get_dataset(name)

valor.Dataset.get_datums(filters=None)

Get all datums associated with a given dataset.

Parameters:

Name Type Description Default
filters Filter

An optional datum filter.

None

Returns:

Type Description
List[Datum]

A list of Datums associated with the dataset.

Source code in valor/coretypes.py
def get_datums(self, filters: Optional[Filter] = None) -> List[Datum]:
    """
    Fetch the datums associated with this dataset.

    Parameters
    ----------
    filters : Filter, optional
        An optional datum filter. Its `datasets` attribute is overwritten
        on the passed object so that it targets this dataset.

    Returns
    -------
    List[Datum]
        A list of `Datums` associated with the dataset.
    """
    datum_filter = Filter() if filters is None else filters
    # Restrict the query to this dataset, replacing any prior value.
    datum_filter.datasets = Dataset.name == self.name  # type: ignore - #issue 605
    return Client(self.conn).get_datums(filters=datum_filter)

valor.Dataset.get_evaluations(metrics_to_sort_by=None, *_, timeout=None)

Get all evaluations associated with a given dataset.

Parameters:

Name Type Description Default
metrics_to_sort_by dict[str, str | dict[str, str]]

An optional dict of metric types to sort the evaluations by.

None
timeout float

The number of seconds the client should wait until raising a timeout.

None

Returns:

Type Description
List[Evaluation]

A list of Evaluations associated with the dataset.

Source code in valor/coretypes.py
def get_evaluations(
    self,
    metrics_to_sort_by: Optional[
        Dict[str, Union[Dict[str, str], str]]
    ] = None,
    *_,
    timeout: Optional[float] = None,
) -> List[Evaluation]:
    """
    Fetch every evaluation associated with this dataset.

    Parameters
    ----------
    metrics_to_sort_by : dict[str, str | dict[str, str]], optional
        An optional dict of metric types to sort the evaluations by.
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.

    Returns
    -------
    List[Evaluation]
        A list of `Evaluations` associated with the dataset.
    """
    client = Client(self.conn)
    return client.get_evaluations(
        datasets=[self],
        metrics_to_sort_by=metrics_to_sort_by,
        timeout=timeout,
    )

valor.Dataset.get_groundtruth(datum)

Get a particular ground truth.

Parameters:

Name Type Description Default
datum Union[Datum, str]

The desired datum.

required

Returns:

Type Description
Union[GroundTruth, None]

The matching ground truth or 'None' if it doesn't exist.

Source code in valor/coretypes.py
def get_groundtruth(
    self,
    datum: Union[Datum, str],
) -> Union[GroundTruth, None]:
    """
    Fetch the ground truth belonging to one datum of this dataset.

    Parameters
    ----------
    datum : Union[Datum, str]
        The desired datum.

    Returns
    -------
    Union[GroundTruth, None]
        The matching ground truth or 'None' if it doesn't exist.
    """
    client = Client(self.conn)
    return client.get_groundtruth(dataset=self, datum=datum)

valor.Dataset.get_labels()

Get all labels associated with a given dataset.

Returns:

Type Description
List[Label]

A list of Labels associated with the dataset.

Source code in valor/coretypes.py
def get_labels(
    self,
) -> List[Label]:
    """
    Fetch every label associated with this dataset.

    Returns
    -------
    List[Label]
        A list of `Labels` associated with the dataset.
    """
    client = Client(self.conn)
    return client.get_labels_from_dataset(self)

valor.Dataset.get_summary(*_, timeout=None)

Get the summary of a given dataset.

Parameters:

Name Type Description Default
timeout float

The number of seconds the client should wait until raising a timeout.

None

Returns:

Type Description
DatasetSummary

The summary of the dataset. This class has the following fields:

name : name of the dataset

num_datums : total number of datums in the dataset

num_annotations : total number of labeled annotations in the dataset; if an object (such as a bounding box) has multiple labels, then each label is counted separately

num_bounding_boxes : total number of bounding boxes in the dataset

num_polygons : total number of polygons in the dataset

num_rasters : total number of rasters in the dataset

labels : list of the unique labels in the dataset

datum_metadata : list of the unique metadata dictionaries in the dataset that are associated with datums

groundtruth_annotation_metadata : list of the unique metadata dictionaries in the dataset that are associated with annotations

Source code in valor/coretypes.py
def get_summary(
    self, *_, timeout: Optional[float] = None
) -> DatasetSummary:
    """
    Fetch the summary of this dataset.

    Parameters
    ----------
    timeout : float, optional
        The number of seconds the client should wait until raising a timeout.

    Returns
    -------
    DatasetSummary
        The summary of the dataset. This class has the following fields:

        name : name of the dataset

        num_datums : total number of datums in the dataset

        num_annotations : total number of labeled annotations in the
        dataset; if an object (such as a bounding box) has multiple
        labels, then each label is counted separately

        num_bounding_boxes : total number of bounding boxes in the dataset

        num_polygons : total number of polygons in the dataset

        num_rasters : total number of rasters in the dataset

        labels : list of the unique labels in the dataset

        datum_metadata : list of the unique metadata dictionaries in the
        dataset that are associated with datums

        groundtruth_annotation_metadata : list of the unique metadata
        dictionaries in the dataset that are associated with annotations
    """
    client = Client(self.conn)
    return client.get_dataset_summary(self.name, timeout=timeout)  # type: ignore