from tricorder.schemas.analytics.computer_vision import BoundingBox

# NOTE(review): the class below shadows the name imported just above —
# confirm this import-then-redefine pattern is intentional (it appears
# consistently throughout this file, so it may be generated).


class BoundingBox(BaseModel):
    """Bounding box for an object whether in an image, video, or reference to document.

    Coordinates are fractions in [0, 1] (each field is range-constrained),
    with (xmin, ymin) the top-left corner and (xmax, ymax) the bottom-right.
    """

    xmin: confloat(ge=0, le=1) = Field(
        ..., description="The x-coordinate of the top-left corner of the bounding box."
    )
    ymin: confloat(ge=0, le=1) = Field(
        ..., description="The y-coordinate of the top-left corner of the bounding box."
    )
    xmax: confloat(ge=0, le=1) = Field(
        ...,
        description="The x-coordinate of the bottom-right corner of the bounding box.",
    )
    ymax: confloat(ge=0, le=1) = Field(
        ...,
        description="The y-coordinate of the bottom-right corner of the bounding box.",
    )

    @model_validator(mode="after")
    def validate_bbox(self):
        # Strict inequality: degenerate (zero-width or zero-height) boxes
        # are rejected along with inverted ones.
        if self.xmin >= self.xmax:
            raise ValueError("xmin must be less than xmax")
        if self.ymin >= self.ymax:
            raise ValueError("ymin must be less than ymax")
        return self
from tricorder.schemas.analytics.computer_vision import Detection


class Detection(Classification):
    """A detected object in an image or video.

    Extends Classification (label/confidence, declared elsewhere) with a
    spatial location and optional landmark points.
    """

    # Spatial location of the detection within the image/frame.
    bbox: BoundingBox = Field(..., description="Bounding box of the detection")
    # NOTE(review): landmarks is a flat float list — presumably interleaved
    # coordinate pairs, but the layout is not specified here; confirm with
    # the producing models before relying on any ordering.
    landmarks: Optional[list[float]] = Field(
        None, description="Landmarks of the detection"
    )
from tricorder.schemas.analytics.computer_vision import FaceAttributes


class FaceAttributes(BaseModel):
    """Attributes of a detected face such as age or sex.

    Both fields are optional: a model may report either, both, or neither.
    """

    age: Optional[int] = Field(
        None, description="The estimated age as detected by a model."
    )
    sex: Optional[Sex] = Field(
        None, description="The estimated sex as detected by a model."
    )

    # Store/serialize the Sex enum member as its underlying value rather
    # than the enum object itself.
    model_config = ConfigDict(use_enum_values=True)
from tricorder.schemas.analytics.computer_vision import FaceLandmarks5


class FaceLandmarks5(BaseModel):
    """Five key landmarks for a face, typically used to align a face for downstream embedding and recognition models.

    NOTE(review): unlike BoundingBox, these floats carry no range constraint —
    presumably pixel (or normalized) coordinates; confirm the convention with
    the producing detector before mixing with normalized bbox values.
    """

    lefteye_x: float = Field(..., description="The x-coordinate of the left eye.")
    lefteye_y: float = Field(..., description="The y-coordinate of the left eye.")
    righteye_x: float = Field(..., description="The x-coordinate of the right eye.")
    righteye_y: float = Field(..., description="The y-coordinate of the right eye.")
    nose_x: float = Field(..., description="The x-coordinate of the nose.")
    nose_y: float = Field(..., description="The y-coordinate of the nose.")
    leftmouth_x: float = Field(..., description="The x-coordinate of the left mouth.")
    leftmouth_y: float = Field(..., description="The y-coordinate of the left mouth.")
    rightmouth_x: float = Field(..., description="The x-coordinate of the right mouth.")
    rightmouth_y: float = Field(..., description="The y-coordinate of the right mouth.")
from tricorder.schemas.analytics.computer_vision import FaceEmbedding512


class FaceEmbedding512(RootModel[conlist(float, min_length=512, max_length=512)]):
    """A 512-dimensional face embedding.

    The root value is a list of floats validated to contain exactly 512
    elements, matching recognition models with 512-dimensional output.
    """
from tricorder.schemas.analytics.computer_vision import FaceDetection


class FaceDetection(Detection):
    """A detected face in an image or video."""

    # Pins the inherited label to the constant "face": the default is
    # "face" and any other value fails Literal validation.
    label: Literal["face"] = Field(
        "face", description="The label of the detected object."
    )
from tricorder.schemas.analytics.computer_vision import FaceAnnotation


class FaceAnnotation(FaceDetection):
    """An annotation for a detected face in an image or video.

    Extends FaceDetection with optional attributes, an embedding, and pose —
    any of which may be absent depending on which models were run.
    """

    attributes: Optional[FaceAttributes] = Field(
        None, description="The attributes of the detected face."
    )
    # NOTE(review): typed as FaceEmbedding, but only FaceEmbedding512 is
    # visible in this module — confirm FaceEmbedding exists elsewhere
    # (e.g. as an alias/union) or whether FaceEmbedding512 was intended.
    embedding: Optional[FaceEmbedding] = Field(
        None, description="The embedding of the detected face."
    )
    pose: Optional[Pose] = Field(None, description="The pose of the detected face.")
from tricorder.schemas.analytics.computer_vision import FacesResponse


class FacesResponse(RootModel[list[FaceAnnotation]]):
    """A collection of face annotations, modeled as a bare list at the root."""
from tricorder.schemas.analytics.computer_vision import ImageCaption


class ImageCaption(RootModel[str]):
    """A short free-text summary describing the contents of an image."""
from tricorder.schemas.analytics.computer_vision import ImageClassification


class ImageClassification(Classification):
    """A classification tag associated with an image; adds no fields beyond Classification."""
from tricorder.schemas.analytics.computer_vision import OCRResponse


class OCRResponse(AnalyticAnnotation):
    """A response containing OCR'd text and its detection annotations.

    `text` is the full recognized string; each entry in `annotations`
    locates a recognized span via a bounding box plus an offset/length
    into `text` (see the schema example below).
    """

    # Consistency fix: use Optional[str] rather than `str | None` to match
    # the Optional[...] convention used by every other schema in this module.
    text: Optional[str] = Field(None, description="The OCR'd text")
    script: Optional[ISO_15924] = Field(None, description="The script of the text")
    annotations: Optional[list[OCRDetection]] = Field(
        None, description="A list of OCR annotations"
    )

    # use_enum_values: serialize the ISO_15924 enum member as its value.
    model_config = ConfigDict(
        use_enum_values=True,
        json_schema_extra={
            "examples": [
                {
                    "schema_name": "OCRResponse",
                    "text": "Test",
                    "annotations": [
                        {
                            "label": "text",
                            "confidence": 0.96,
                            "bbox": {
                                "xmin": 0.10224438902743142,
                                "ymin": 0.1984126984126984,
                                "xmax": 0.46633416458852867,
                                "ymax": 0.36507936507936506,
                            },
                            "offset": 0,
                            "length": 4,
                        }
                    ],
                }
            ]
        },
    )
from tricorder.schemas.analytics.computer_vision import Pose


class Pose(BaseModel):
    """The pose of a detected face, typically used for alignment and recognition.

    All three angles are in degrees and constrained to [-180, 180].
    """

    # Rotation about the x-axis (nodding up/down).
    pitch: confloat(ge=-180, le=180) = Field(
        ..., description="Rotation around the x-axis in degrees."
    )
    # Rotation about the y-axis (turning left/right).
    yaw: confloat(ge=-180, le=180) = Field(
        ..., description="Rotation around the y-axis in degrees."
    )
    # Rotation about the z-axis (tilting toward a shoulder).
    roll: confloat(ge=-180, le=180) = Field(
        ..., description="Rotation around the z-axis (or front to back) in degrees."
    )