Natural Language Processing Schemas

LanguageDetection

Language annotation object

from tricorder.schemas.analytics.nlp import LanguageDetection

class LanguageDetection(TextClassification):
    """Language annotation object"""

    # NOTE(review): the docstring above is kept verbatim; if this is a pydantic
    # model it presumably doubles as the schema description -- confirm before
    # rewording it.

    # Required field (no default): the language code for the text.
    label: Iso639_3 = Field(
        ...,
        description="Language of the text in ISO 639-3 code",
    )

EntityAnnotation

Entity Annotation typically used for Named Entity Recognition (NER) tasks.

from tricorder.schemas.analytics.nlp import EntityAnnotation

class EntityAnnotation(AnalyticAnnotation):
    """Entity Annotation typically used for Named Entity Recognition (NER) tasks."""

    # Location of the extraction inside the text body. Both values are
    # optional; per the descriptions, the end index is offset + length.
    offset: Optional[int] = Field(
        None, description="Start index of the extraction text for the selector"
    )
    length: Optional[int] = Field(
        None,
        description="Length of the extraction from the text body to find end index",
    )

    # Required: the label of the extraction and the raw text that was extracted.
    label: str = Field(..., description="The unique label for the type of selector")
    text: str = Field(..., description="Raw selector text extracted from the text body")

    # NOTE(review): the description says "list" but the declared type is a
    # string-keyed mapping; the description is left unchanged here because it
    # is emitted in the schema -- confirm the intended wording.
    attributes: Optional[dict[str, Any]] = Field(
        None, description="A list of attributes specific to the label of selector"
    )

    # Optional score constrained to the closed interval [0, 1].
    confidence: Optional[confloat(ge=0, le=1)] = Field(
        None, description="Confidence level attributed to the pattern"
    )

    model_config = ConfigDict(use_enum_values=True)

    @field_validator("confidence")
    @classmethod
    def round_confidence(cls, value: Optional[float]) -> Optional[float]:
        """Round a supplied confidence to two decimal places.

        Args:
            value: The confidence value, or ``None`` when it was passed
                explicitly as ``None``.

        Returns:
            ``value`` rounded to two decimals, or ``None`` unchanged.
        """
        # The field is Optional, so an explicit None must pass through as-is.
        return None if value is None else round(value, 2)

SelectorAnnotation

An entity annotation that represents a selector, typically resolvable to a specific pattern, standardized format, or a known entity type.

from tricorder.schemas.analytics.nlp import SelectorAnnotation

class SelectorAnnotation(EntityAnnotation):
    """An entity annotation that represents a selector, typically resolvable to a 
    specific pattern, standardized format, or a known entity type.
    """

    # Re-declared with the same type and description as on EntityAnnotation.
    text: str = Field(..., description="Raw selector text extracted from the text body")

    # Optional normalized form of ``text``.
    normalized_text: Optional[str] = Field(None, description="Normalized selector text")

    # Falls back to the literal string "unknown" when the caller omits it.
    realm: str = Field(
        "unknown",
        description="The realm of the selector. Default is 'unknown' if not specified",
    )

    # Tri-state flag: True / False, or None when it was not determined.
    potential_usp: Optional[bool] = Field(
        None,
        description="Whether the selector contains potential USP information as determined by an attribute",
    )

    # Same configuration value as declared on EntityAnnotation.
    model_config = ConfigDict(use_enum_values=True)

TextClassification

A tag associated with a body of text.

from tricorder.schemas.analytics.nlp import TextClassification

class TextClassification(Classification):
    """A tag associated with a body of text."""

    # Adds no fields or behavior of its own; everything comes from
    # Classification. The docstring alone is a sufficient class body.

NERResponse

A set of named entities detected in a text.

from tricorder.schemas.analytics.nlp import NERResponse

class NERResponse(RootModel[list[EntityAnnotation | SelectorAnnotation]]):
    """A set of named entities detected in a text."""

    # The root value is a list whose items are EntityAnnotation or
    # SelectorAnnotation. No extra fields or methods are declared here.
    # NOTE(review): SelectorAnnotation subclasses EntityAnnotation, so which
    # union member a given payload resolves to depends on the validation
    # library's union-matching rules -- confirm this is the intended order.

ChunkTextEmbedding384

A 384 dimensional embedding for a chunk of text.

from tricorder.schemas.analytics.nlp import ChunkTextEmbedding384

class ChunkTextEmbedding384(BaseModel):
    """A 384 dimensional embedding for a chunk of text."""

    # Required vector of floats; min_length == max_length == 384, so the
    # list must contain exactly 384 values.
    embedding: conlist(float, min_length=384, max_length=384) = Field(
        ...,
        description="Language embedding",
    )

    # Optional position of the chunk within the original text.
    offset: Optional[int] = Field(
        default=None,
        description="Offset of the embedding in the original text",
    )
    length: Optional[int] = Field(
        default=None,
        description="Length of the embedding in the original text",
    )

EmbeddingsResponse

A set of embeddings for each chunk within a document or body of text.

from tricorder.schemas.analytics.nlp import EmbeddingsResponse

class EmbeddingsResponse(RootModel[list[ChunkTextEmbedding384]]):
    """A set of embeddings for each chunk within a document or body of text."""

    # The root value is a list of ChunkTextEmbedding384 items; no extra
    # fields or methods are declared here.