# Source code for rdfsolve.schema_models.report

"""Mining analytics report models."""

from __future__ import annotations

from typing import Any

from pydantic import BaseModel, ConfigDict, Field



# [docs]
class QueryStats(BaseModel):
    """Running totals for a single category of queries.

    All three counters default to zero and are validated as
    non-negative. Keys that are not declared on the model are rejected.
    """

    # Unknown keys are a validation error rather than being ignored.
    model_config = ConfigDict(extra="forbid")

    sent: int = Field(default=0, ge=0, description="Queries sent")
    failed: int = Field(default=0, ge=0, description="Queries that failed")
    total_time_s: float = Field(
        default=0.0,
        ge=0,
        description="Wall-clock seconds for this category",
    )




# [docs]
class OneShotQueryResult(BaseModel):
    """Record of one unbounded SELECT issued to a SPARQL endpoint.

    Stores the raw performance of an unguarded, single-attempt query
    so that it can be compared with the fallback-chain result.
    ``query_type`` and ``success`` are required; the remaining fields
    default to ``None``.
    """

    # Unknown keys are a validation error rather than being ignored.
    model_config = ConfigDict(extra="forbid")

    query_type: str = Field(
        ...,
        description="Pattern type queried: 'typed-object', 'literal', or 'untyped-uri'",
    )
    success: bool = Field(..., description="True if the endpoint returned a result set")
    duration_s: float | None = Field(
        default=None,
        ge=0,
        description="Wall-clock seconds for the single HTTP call",
    )
    row_count: int | None = Field(
        default=None,
        ge=0,
        description="Number of result rows returned",
    )
    error: str | None = Field(
        default=None,
        description="Exception message if the query failed",
    )




# [docs]
class PhaseReport(BaseModel):
    """Timing and outcome recorded for a single mining phase.

    Only ``name`` is required. Timestamps are carried as strings, and
    the numeric fields are validated as non-negative.
    """

    # Unknown keys are a validation error rather than being ignored.
    model_config = ConfigDict(extra="forbid")

    name: str = Field(..., description="Phase identifier")
    started_at: str | None = Field(default=None, description="ISO-8601 start time")
    finished_at: str | None = Field(default=None, description="ISO-8601 finish time")
    duration_s: float | None = Field(
        default=None,
        ge=0,
        description="Wall-clock seconds",
    )
    items_discovered: int = Field(
        default=0,
        ge=0,
        description="Number of items produced by this phase",
    )
    error: str | None = Field(
        default=None,
        description="Error message if the phase failed",
    )




# [docs]
class MiningReport(BaseModel):
    """Analytical metadata gathered over the course of a mining run.

    The model is meant to be written to disk incrementally (once per
    completed phase), so that whatever was collected survives a crash
    partway through mining.

    Required fields are ``endpoint_url``, ``rdfsolve_version``,
    ``python_version`` and ``started_at``; everything else has a
    default. Keys not declared on the model are accepted.
    """

    # Unlike the nested models, extra keys are allowed here.
    model_config = ConfigDict(extra="allow")

    # --- Identification ---
    dataset_name: str | None = Field(
        default=None,
        description="Human-readable name of the mined dataset",
    )
    endpoint_url: str = Field(..., description="SPARQL endpoint URL")
    graph_uris: list[str] | None = Field(
        default=None,
        description="Named-graph URIs (if any)",
    )
    strategy: str = Field(
        default="unknown",
        description="Mining strategy: 'miner' or 'miner/two-phase'",
    )

    # --- Versions & environment ---
    rdfsolve_version: str = Field(..., description="Package version string")
    python_version: str = Field(..., description="Python interpreter version")
    qlever_version: dict[str, str] | None = Field(
        default=None,
        description=(
            "QLever build info fetched from the endpoint's ?cmd=stats: "
            '{"git_hash_server": str, "git_hash_index": str}'
        ),
    )

    # --- Timing ---
    started_at: str = Field(
        ...,
        description="ISO-8601 timestamp when mining started",
    )
    finished_at: str | None = Field(
        default=None,
        description="ISO-8601 timestamp when mining finished",
    )
    total_duration_s: float | None = Field(
        default=None,
        ge=0,
        description="Total wall-clock seconds",
    )

    # --- Query statistics ---
    query_stats: dict[str, QueryStats] = Field(
        default_factory=dict,
        description="Per-purpose query statistics.",
    )
    total_queries_sent: int = Field(default=0, ge=0)
    total_queries_failed: int = Field(default=0, ge=0)

    # --- Phase breakdown ---
    phases: list[PhaseReport] = Field(default_factory=list)

    # --- Results summary ---
    abort_reason: str | None = None
    pattern_count: int = Field(default=0, ge=0)
    class_count: int = Field(default=0, ge=0)
    property_count: int = Field(default=0, ge=0)
    unique_uris_labelled: int = Field(default=0, ge=0)

    # --- Configuration snapshot ---
    config: dict[str, Any] = Field(default_factory=dict)

    # --- Benchmark / resource usage ---
    machine: dict[str, Any] | None = None
    benchmark: dict[str, Any] | None = None

    # --- One-shot baseline ---
    one_shot_results: list[OneShotQueryResult] | None = None

    # --- Author provenance ---
    authors: list[dict[str, str]] | None = None

    # --- Captured endpoint metadata ---
    dataset_metadata: dict[str, Any] | None = None

    # --- Canonical URI ---
    report_uri: str | None = None