Source code for rdflib.plugins.parsers.hext

"""
This is an RDFLib plugin for parsing Hextuple files, which are Newline-Delimited JSON
(ndjson) files, into a ConjunctiveGraph. The store that backs the graph *must* be able
to handle contexts, i.e. multiple graphs.
"""

from __future__ import annotations

import json
import warnings
from io import TextIOWrapper
from typing import Any, BinaryIO, List, Optional, TextIO, Union

from rdflib.graph import ConjunctiveGraph, Graph
from rdflib.parser import InputSource, Parser
from rdflib.term import BNode, Literal, URIRef

__all__ = ["HextuplesParser"]


class HextuplesParser(Parser):
    """
    An RDFLib parser for Hextuples.

    Every non-blank input line is a JSON array whose six positions are, in
    order: subject, predicate, value, datatype, language and graph. Each line
    is turned into one triple (or one quad when the graph position is set) and
    added to the store behind the graph handed to :meth:`parse`.
    """

    def __init__(self):
        pass

    @staticmethod
    def _bnode_from_label(label: str) -> BNode:
        """Build a blank node from a ``_:``-style label, dropping the ``_:`` marker."""
        return BNode(value=label.replace("_:", ""))

    def _load_json_line(self, line: str) -> List[Optional[Any]]:
        """
        Decode one ndjson line into a list of hextuple fields.

        Empty strings are normalised to ``None`` in every position except the
        value (index 2), where ``""`` is a legitimate empty literal and is kept.

        :param line: a single line of Hextuples text.
        :return: the decoded fields with ``""`` replaced by ``None`` (value
            position excepted).
        :raises ValueError: if the line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), is not a JSON
            array, or has fewer than six elements.
        """
        raw = json.loads(line)
        # Fail with a clear message instead of an IndexError/KeyError further
        # down. Extra trailing elements are tolerated, as they always were.
        if not isinstance(raw, list) or len(raw) < 6:
            raise ValueError(
                f"A hextuple must be a JSON array of six elements. Given: {raw}"
            )
        # this complex handling is because the 'value' component is
        # allowed to be "" but not None
        # all other "" values are treated as None
        fields: List[Optional[Any]] = [None if x == "" else x for x in raw]
        if raw[2] == "":
            fields[2] = ""
        return fields

    def _parse_hextuple(
        self, cg: ConjunctiveGraph, tup: List[Union[str, None]]
    ) -> None:
        """
        Convert one decoded hextuple into RDF terms and add it to ``cg``.

        :param cg: the conjunctive graph that receives the triple or quad.
        :param tup: the six hextuple fields as produced by
            :meth:`_load_json_line`.
        :raises ValueError: if fewer than six fields are given, or if subject,
            predicate, value or datatype is ``None``.
        """
        if len(tup) < 6:
            raise ValueError(
                f"A hextuple must be a JSON array of six elements. Given: {tup}"
            )

        # all values check
        # subject, predicate, value, datatype cannot be None
        # language and graph may be None
        if tup[0] is None or tup[1] is None or tup[2] is None or tup[3] is None:
            raise ValueError(
                "subject, predicate, value, datatype cannot be None. Given: " f"{tup}"
            )

        # 1 - subject
        s: Union[URIRef, BNode]
        if tup[0].startswith("_"):
            s = self._bnode_from_label(tup[0])
        else:
            s = URIRef(tup[0])

        # 2 - predicate
        p = URIRef(tup[1])

        # 3 - value: the datatype field says whether it is an IRI, a blank
        # node or a literal
        o: Union[URIRef, BNode, Literal]
        if tup[3] == "globalId":
            o = URIRef(tup[2])
        elif tup[3] == "localId":
            o = self._bnode_from_label(tup[2])
        elif tup[4] is None:
            # literal without a language tag: typed by the datatype field
            o = Literal(tup[2], datatype=URIRef(tup[3]))
        else:
            # literal with a language tag: the language is used and the
            # datatype field is not passed on
            o = Literal(tup[2], lang=tup[4])

        # 6 - context
        if tup[5] is None:
            cg.add((s, p, o))
            return

        # A graph may be named by a blank node ("_:label") as well as by an
        # IRI; wrapping a blank-node label in URIRef would mint a bogus IRI.
        c: Union[URIRef, BNode]
        if tup[5].startswith("_:"):
            c = self._bnode_from_label(tup[5])
        else:
            c = URIRef(tup[5])
        # type error: Argument 1 to "add" of "ConjunctiveGraph" has incompatible type "Tuple[Union[URIRef, BNode], URIRef, Union[URIRef, BNode, Literal], URIRef]"; expected "Union[Tuple[Node, Node, Node], Tuple[Node, Node, Node, Optional[Graph]]]"
        cg.add((s, p, o, c))  # type: ignore[arg-type]

    # type error: Signature of "parse" incompatible with supertype "Parser"
    def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None:  # type: ignore[override]
        """
        Parse Hextuples from ``source`` into the store behind ``graph``.

        :param source: input to read; its character stream is preferred and
            its byte stream (decoded as utf-8) is the fallback.
        :param graph: target graph; it becomes the default context, so lines
            without a graph field are added to it.
        :param kwargs: only ``encoding`` is inspected; any value other than
            ``None`` or ``"utf-8"`` triggers a warning and is ignored.
        :raises AssertionError: if the store behind ``graph`` is not
            context-aware.
        :raises ValueError: if ``source`` offers neither a character stream
            nor a byte stream, or if a line is not a valid hextuple.
        """
        if kwargs.get("encoding") not in [None, "utf-8"]:
            warnings.warn(
                f"Hextuples files are always utf-8 encoded, "
                f"I was passed: {kwargs.get('encoding')}, "
                "but I'm still going to use utf-8"
            )

        # Raised explicitly (rather than via an ``assert`` statement) so the
        # check still runs under ``python -O``; the exception type is the same.
        if not graph.store.context_aware:
            raise AssertionError("Hextuples Parser needs a context-aware store!")

        cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
        cg.default_context = graph

        text_stream: Optional[TextIO] = source.getCharacterStream()
        if text_stream is None:
            binary_stream: Optional[BinaryIO] = source.getByteStream()
            if binary_stream is None:
                raise ValueError(
                    f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
                )
            text_stream = TextIOWrapper(binary_stream, encoding="utf-8")

        for line in text_stream:
            if not line or line.isspace():
                # Skipping empty lines because this is what was being done before for the first and last lines, albeit in a rather indirect way.
                # The result is that we accept input that would otherwise be invalid.
                # Possibly we should just let this result in an error.
                continue
            self._parse_hextuple(cg, self._load_json_line(line))