# Source code for rdfsolve.shapes

"""Shapes Creator - subset schemas and convert to SHACL.

This module provides two core operations:

1. **Subset**: given a full JSON-LD schema (as produced by rdfsolve's
   miner or parser) and a list of edges (paths) to *keep*, produce a
   smaller JSON-LD containing only the matching subject/predicate
   entries (the edge's object is not currently used for filtering).

2. **SHACL conversion**: take a (subset) JSON-LD schema and convert it
   to SHACL shapes via the existing VoidParser -> LinkML -> ShaclGenerator
   pipeline.

The shapes feature allows users to define *intended* schemas from
mined schemas, which can then be used for RDF data validation.
"""

from __future__ import annotations

import copy
import logging
from typing import Any

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names exported as the public API of this module.
__all__ = [
    "jsonld_to_shacl",
    "subset_jsonld",
]


# ── Types ────────────────────────────────────────────────────────────

# An "edge spec" describes one edge (triple pattern) to keep.
# It mirrors what the frontend knows about a diagram edge:
#   subject   - class CURIE or URI  (e.g. "core:Protein")
#   predicate - property CURIE or URI
#   object    - class CURIE/URI or "Literal" or "Resource"
# NOTE: ``subset_jsonld`` reads only ``subject`` and ``predicate``;
# ``object`` is part of the spec but is not used for filtering.
EdgeSpec = dict[str, str]  # keys: subject, predicate, object


# ── Subset ───────────────────────────────────────────────────────────



# [docs]
def subset_jsonld(  # noqa C901
    schema_jsonld: dict[str, Any],
    keep_edges: list[EdgeSpec],
) -> dict[str, Any]:
    """Return a copy of *schema_jsonld* containing only *keep_edges*.

    Matching is done on ``(subject, predicate)`` pairs.  Each edge's
    subject and predicate are tried verbatim, expanded to a full URI via
    ``@context``, and compacted with *every* prefix whose namespace
    matches, so the caller and the graph may use different notations
    (or different prefixes for the same namespace).

    The ``object`` of an edge is not used for filtering: when a
    ``(subject, predicate)`` pair matches, the whole predicate entry of
    that node is kept.

    Parameters
    ----------
    schema_jsonld:
        Full JSON-LD schema with ``@context``, ``@graph``, and
        optionally ``@about``.  It is never mutated.
    keep_edges:
        List of edge specs.  Each must have ``subject``,
        ``predicate``, and ``object`` keys (CURIEs or full URIs).
        Edges with a missing or empty subject or predicate are ignored.

    Returns
    -------
    dict
        A new JSON-LD document that keeps only the nodes and
        properties referenced by *keep_edges*.  Nothing in the result
        is shared with the input: ``@context``, ``@about`` and all kept
        node values are deep copies.  Keys starting with ``@`` are
        carried over for every kept node; the ``_counts`` key is always
        dropped.  Nodes left without any kept property are omitted.

        ``@about`` is copied only when present in the input, except
        when *keep_edges* is empty, where it is always present and
        defaults to ``{}``.
    """
    raw_context = schema_jsonld.get("@context", {})

    if not keep_edges:
        # Deep-copy here as well, so that mutating the result can never
        # leak back into the caller's schema.
        return {
            "@context": copy.deepcopy(raw_context),
            "@graph": [],
            "@about": copy.deepcopy(schema_jsonld.get("@about", {})),
        }

    # Tolerate an explicit ``"@graph": None``.
    graph: list[dict[str, Any]] = schema_jsonld.get("@graph") or []

    # prefix -> namespace URI.  Only plain, non-empty string mappings are
    # usable for CURIE handling; JSON-LD keywords (``@vocab``, ...) and
    # dict-valued term definitions are skipped.  A non-dict ``@context``
    # (e.g. a remote context URL) yields no prefixes at all.
    namespaces: dict[str, str] = {}
    if isinstance(raw_context, dict):
        namespaces = {
            prefix: ns
            for prefix, ns in raw_context.items()
            if isinstance(prefix, str)
            and isinstance(ns, str)
            and ns
            and not prefix.startswith("@")
        }

    def _expand(term: str) -> str:
        """Expand a CURIE to a full URI; return *term* if not expandable."""
        if ":" not in term or term.startswith("http"):
            return term
        prefix, local = term.split(":", 1)
        ns = namespaces.get(prefix)
        return f"{ns}{local}" if ns else term

    def _forms(term: str) -> set[str]:
        """Return every spelling of *term*: verbatim, expanded, compacted."""
        expanded = _expand(term)
        forms = {term, expanded}
        # Several prefixes may map to the same (or a nested) namespace;
        # emit a CURIE for each so any of them matches the graph.
        forms.update(
            f"{prefix}:{expanded[len(ns):]}"
            for prefix, ns in namespaces.items()
            if expanded.startswith(ns)
        )
        return forms

    # Collect every (subject, predicate) spelling that should be kept.
    keep_set: set[tuple[str, str]] = set()
    keep_subjects: set[str] = set()
    for edge in keep_edges:
        subj = edge.get("subject", "")
        pred = edge.get("predicate", "")
        if not (subj and pred):
            continue
        subject_forms = _forms(subj)
        predicate_forms = _forms(pred)
        keep_subjects |= subject_forms
        keep_set.update((s, p) for s in subject_forms for p in predicate_forms)

    logger.debug(
        "subset_jsonld: %d keep_edges -> %d (s,p) pairs, %d subjects, %d graph nodes",
        len(keep_edges),
        len(keep_set),
        len(keep_subjects),
        len(graph),
    )

    # Filter: for each subject node, keep only the specified properties.
    new_graph: list[dict[str, Any]] = []
    for node in graph:
        nid = node.get("@id", "")
        if nid not in keep_subjects:
            continue
        new_node: dict[str, Any] = {"@id": nid}
        has_data = False
        for key, value in node.items():
            if key.startswith("@"):
                # JSON-LD keywords (@type, ...) always travel with the node.
                new_node[key] = copy.deepcopy(value)
            elif key != "_counts" and (nid, key) in keep_set:
                # The whole predicate entry is kept; objects are not filtered.
                new_node[key] = copy.deepcopy(value)
                has_data = True
        # Only include the node if at least one real property survived.
        if has_data:
            new_graph.append(new_node)

    result: dict[str, Any] = {
        "@context": copy.deepcopy(raw_context),
        "@graph": new_graph,
    }
    if "@about" in schema_jsonld:
        result["@about"] = copy.deepcopy(schema_jsonld["@about"])

    return result



# ── SHACL conversion ─────────────────────────────────────────────────



# [docs]
def jsonld_to_shacl(
    schema_jsonld: dict[str, Any],
    *,
    schema_name: str | None = None,
    closed: bool = True,
    suffix: str | None = None,
) -> str:
    """Render a JSON-LD schema as SHACL shapes in Turtle syntax.

    This is a thin wrapper around
    :func:`rdfsolve.schema_models.shacl.to_shacl`, which runs the
    LinkML -> ShaclGenerator pipeline directly on the JSON-LD dict, so
    no VoidParser instance is required.

    Parameters
    ----------
    schema_jsonld:
        JSON-LD dict (the same format produced by ``subset_jsonld``
        or the miner).
    schema_name:
        Optional name for the generated LinkML/SHACL schema.  When it is
        ``None`` or empty, ``"shapes"`` is used.
    closed:
        Whether to produce closed SHACL shapes (``sh:closed true``).
    suffix:
        Suffix appended to every shape name
        (e.g. ``"Shape"`` -> ``PersonShape``).

    Returns
    -------
    str
        SHACL shapes serialised as Turtle.
    """
    # Imported at call time rather than at module import time.
    from rdfsolve.schema_models.shacl import to_shacl

    effective_name = schema_name if schema_name else "shapes"
    options: dict[str, Any] = {
        "schema_name": effective_name,
        "closed": closed,
        "suffix": suffix,
    }
    return to_shacl(schema_jsonld, **options)