# Source code for rdfsolve.shapes

"""Shapes Creator - subset schemas and convert to SHACL.

This module provides two core operations:

1. **Subset**: given a full JSON-LD schema (as produced by rdfsolve's
   miner or parser) and a list of edges (paths) to *keep*, produce a
   smaller JSON-LD containing only those triples.

2. **SHACL conversion**: take a (subset) JSON-LD schema and convert it
   to SHACL shapes via the existing VoidParser -> LinkML -> ShaclGenerator
   pipeline.

The shapes feature allows users to define *intended* schemas from
mined schemas, which can then be used for RDF data validation.
"""

from __future__ import annotations

import copy
import logging
from typing import Any

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Names exported by ``from rdfsolve.shapes import *``: the module's two
# public entry points.
__all__ = [
    "jsonld_to_shacl",
    "subset_jsonld",
]


# ── Types ────────────────────────────────────────────────────────────

# An "edge spec" describes one edge (triple pattern) to keep.
# It mirrors what the frontend knows about a diagram edge:
#   subject   - class CURIE or URI  (e.g. "core:Protein")
#   predicate - property CURIE or URI
#   object    - class CURIE/URI or "Literal" or "Resource"
# NOTE: ``subset_jsonld`` only reads ``subject`` and ``predicate`` when
# deciding what to keep; ``object`` is carried by the spec but is not
# used for filtering.
EdgeSpec = dict[str, str]  # keys: subject, predicate, object


# ── Subset ───────────────────────────────────────────────────────────


def _prefix_namespaces(context: Any) -> dict[str, str]:
    """Collect the plain ``prefix -> namespace`` mappings of an ``@context``.

    Only non-empty string values are usable for CURIE expansion, so
    expanded term definitions (dict values), JSON-LD keywords such as
    ``@vocab`` and empty namespaces are skipped.  A list-valued context
    is searched for embedded dict contexts; later entries override
    earlier ones.
    """
    sources = context if isinstance(context, list) else [context]
    namespaces: dict[str, str] = {}
    for source in sources:
        if not isinstance(source, dict):
            continue  # e.g. a remote context URL: nothing to look up locally
        for prefix, ns in source.items():
            if (
                isinstance(prefix, str)
                and isinstance(ns, str)
                and ns
                and not prefix.startswith("@")
            ):
                namespaces[prefix] = ns
    return namespaces


def _term_forms(term: str, namespaces: dict[str, str]) -> set[str]:
    """Return every spelling of *term*: as given, expanded and compacted.

    A CURIE is expanded with its prefix; a full (``http...``) URI is
    compacted with *every* namespace it starts with, not just the first
    one, so overlapping or aliased prefixes still match.
    """
    forms = {term}
    expanded = term
    if ":" in term and not term.startswith("http"):
        prefix, local = term.split(":", 1)
        ns = namespaces.get(prefix)
        if ns:
            expanded = f"{ns}{local}"
            forms.add(expanded)
    if expanded.startswith("http"):
        forms.update(
            f"{prefix}:{expanded[len(ns):]}"
            for prefix, ns in namespaces.items()
            if expanded.startswith(ns)
        )
    return forms


def subset_jsonld(  # noqa: C901
    schema_jsonld: dict[str, Any],
    keep_edges: list[EdgeSpec],
) -> dict[str, Any]:
    """Return a copy of *schema_jsonld* containing only *keep_edges*.

    Matching is done on ``(subject, predicate)`` pairs.  Both the edge
    specs and the graph may use CURIEs or full URIs; every expanded and
    compacted spelling derivable from ``@context`` is considered.  The
    ``object`` of an edge spec is not used for filtering: a kept
    predicate keeps its whole value.

    Parameters
    ----------
    schema_jsonld:
        Full JSON-LD schema with ``@context``, ``@graph``, and
        optionally ``@about``.
    keep_edges:
        List of edge specs.  Each must have ``subject``, ``predicate``,
        and ``object`` keys (CURIEs or full URIs).  Specs with an empty
        or missing subject or predicate are ignored.

    Returns
    -------
    dict
        A new JSON-LD document that keeps only the nodes and properties
        referenced by *keep_edges*.  The ``@context`` is preserved;
        ``@about`` is carried over as-is.  Nothing in the result shares
        mutable state with *schema_jsonld*.  Nodes left without any
        kept property are dropped, as is the ``_counts`` bookkeeping
        key.  With empty *keep_edges* the result has an empty
        ``@graph`` and always carries an ``@about`` key.
    """
    context = schema_jsonld.get("@context", {})

    if not keep_edges:
        return {
            "@context": copy.deepcopy(context),
            "@graph": [],
            "@about": copy.deepcopy(schema_jsonld.get("@about", {})),
        }

    graph: list[dict[str, Any]] = schema_jsonld.get("@graph") or []
    namespaces = _prefix_namespaces(context)

    # Build the set of (subject, predicate) pairs to keep, in every
    # spelling, so matching works regardless of which form the graph or
    # the caller uses.
    keep_set: set[tuple[str, str]] = set()
    keep_subjects: set[str] = set()
    for edge in keep_edges:
        subj = edge.get("subject", "")
        pred = edge.get("predicate", "")
        if not (subj and pred):
            continue
        subject_forms = _term_forms(subj, namespaces)
        predicate_forms = _term_forms(pred, namespaces)
        keep_subjects |= subject_forms
        keep_set.update((s, p) for s in subject_forms for p in predicate_forms)

    logger.debug(
        "subset_jsonld: %d keep_edges -> %d (s,p) pairs, %d subjects, %d graph nodes",
        len(keep_edges),
        len(keep_set),
        len(keep_subjects),
        len(graph),
    )

    # Filter: for each subject node, keep only the specified properties.
    new_graph: list[dict[str, Any]] = []
    for node in graph:
        if not isinstance(node, dict):
            continue
        nid = node.get("@id", "")
        if nid not in keep_subjects:
            continue

        new_node: dict[str, Any] = {"@id": nid}
        has_data = False
        for key, value in node.items():
            if key.startswith("@"):
                # JSON-LD keywords (@id, @type, ...) always travel with the node.
                new_node[key] = copy.deepcopy(value)
            elif key != "_counts" and (nid, key) in keep_set:
                # The whole predicate entry is kept; objects are not filtered.
                new_node[key] = copy.deepcopy(value)
                has_data = True

        # Only include nodes that retained at least one real property.
        if has_data:
            new_graph.append(new_node)

    result: dict[str, Any] = {
        "@context": copy.deepcopy(context),
        "@graph": new_graph,
    }
    if "@about" in schema_jsonld:
        result["@about"] = copy.deepcopy(schema_jsonld["@about"])
    return result
# ── SHACL conversion ─────────────────────────────────────────────────
def jsonld_to_shacl(
    schema_jsonld: dict[str, Any],
    *,
    schema_name: str | None = None,
    closed: bool = True,
    suffix: str | None = None,
) -> str:
    """Convert a JSON-LD schema to SHACL Turtle via LinkML.

    This is a thin wrapper around
    :func:`rdfsolve.schema_models.shacl.to_shacl`, which runs the full
    LinkML -> ShaclGenerator pipeline on the given JSON-LD dict; no
    VoidParser instance is required.

    Parameters
    ----------
    schema_jsonld:
        JSON-LD dict (the same format produced by ``subset_jsonld`` or
        the miner).
    schema_name:
        Optional name for the generated LinkML/SHACL schema.  A missing
        or empty name falls back to ``"shapes"``.
    closed:
        Whether to produce closed SHACL shapes (``sh:closed true``).
    suffix:
        Suffix appended to every shape name (e.g. ``"Shape"`` ->
        ``PersonShape``).

    Returns
    -------
    str
        SHACL shapes serialised as Turtle.
    """
    # Imported at call time rather than at module import time.
    # NOTE(review): presumably to keep the LinkML dependency lazy or to
    # avoid an import cycle - confirm.
    from rdfsolve.schema_models.shacl import to_shacl

    effective_name = schema_name if schema_name else "shapes"
    shacl_turtle = to_shacl(
        schema_jsonld,
        schema_name=effective_name,
        closed=closed,
        suffix=suffix,
    )
    return shacl_turtle