Audio Schemas

ASRChunk
A transcript chunk from an Automatic Speech Recognition (ASR) system, consisting primarily of:

* VAD (voice activity detection)
* Diarization
* Transcription
* Optional embedding

Each chunk represents a segment of audio with a speaker, text, start time, and end time.
from tricorder.schemas.analytics.audio import ASRChunk class ASRChunk(BaseModel): """ A transcript chunk from an Automatic Speech Recognition (ASR) system consisting primarily of: * VAD * Diarization * Transcription * Optional embedding Each chunk represents a segment of audio with a speaker, text, start time, and end time. """ speaker: str text: str start_time: float end_time: float embedding: Optional[conlist(float, min_length=256, max_length=256)] = None @field_validator("embedding", mode="before") def np_to_list(cls, emb: np.ndarray | list): return emb.tolist() if isinstance(emb, np.ndarray) else emb
STTResponse
A full transcript of conversations with speaker, text, start time, and end time. If the model supports it, it may also include language detection and embedding for the purposes of speaker verification or other downstream tasks.
from tricorder.schemas.analytics.audio import STTResponse class STTResponse(AnalyticAnnotation): """A full transcript of conversations with speaker, text, start time, and end time. If the model supports it, it may also include language detection and embedding for the purposes of speaker verification or other downstream tasks. """ language: Optional[Iso639_3] = Field( None, description="Language of the audio file if detected" ) transcript: list[ASRChunk] = Field( ..., description="List of ASR chunks with speaker, text, start time, and end time", )