Audio Schemas

ASRChunk
A transcript chunk from an Automatic Speech Recognition (ASR) system, consisting primarily of:

* VAD (voice activity detection)
* Diarization
* Transcription
* Optional embedding

Each chunk represents a segment of audio with a speaker, text, start time, and end time.
from tricorder.schemas.analytics.audio import ASRChunk class ASRChunk(BaseModel): """ A transcript chunk from an Automatic Speech Recognition (ASR) system consisting primarily of: * VAD * Diarization * Transcription * Optional embedding Each chunk represents a segment of audio with a speaker, text, start time, and end time. """ speaker: str text: str start_time: float end_time: float embedding: Optional[conlist(float, min_length=256, max_length=256)] = None @field_validator("embedding", mode="before") def np_to_list(cls, emb: np.ndarray | list): return emb.tolist() if isinstance(emb, np.ndarray) else emb
STTResponse
A full transcript of conversations with speaker, text, start time, and end time. If the model supports it, it may also include language detection and embedding for the purposes of speaker verification or other downstream tasks.
from tricorder.schemas.analytics.audio import STTResponse class STTResponse(AnalyticAnnotation): """A full transcript of conversations with speaker, text, start time, and end time. If the model supports it, it may also include language detection and embedding for the purposes of speaker verification or other downstream tasks. """ language: Optional[Iso639_3] = Field( None, description="Language of the audio file if detected" ) transcript: list[ASRChunk] = Field( ..., description="List of ASR chunks with speaker, text, start time, and end time", )