Source code for rdflib.plugins.parsers.hturtle

# -*- coding: utf-8 -*-
"""
Extraction parser for RDF embedded verbatim into HTML or XML files. This is based
on:

* The specification on embedding turtle into html:
    http://www.w3.org/TR/turtle/#in-html

For SVG (and currently SVG only) the method also extracts embedded RDF/XML
data, per the SVG specification.

License: W3C Software License,
http://www.w3.org/Consortium/Legal/copyright-software
Author: Ivan Herman
Copyright: W3C
"""

from rdflib.parser import Parser
from .pyRdfa import pyRdfa, Options
from .pyRdfa.state import ExecutionContext
from .pyRdfa.embeddedRDF import handle_embeddedRDF
from .structureddata import _get_orig_source, _check_error

# Probe for the optional html5lib dependency.  After this block the module
# level name ``html5lib`` is a plain boolean flag (True when the package could
# be imported, False otherwise) rather than the module object itself.
try:
    import html5lib
except ImportError:
    import warnings
    warnings.warn(
        'html5lib not found! RDFa and Microdata parsers '
        'will not be available.')
    html5lib = False
else:
    # Reference the imported name once before rebinding it to the flag value.
    assert html5lib
    html5lib = True


class HTurtle(pyRdfa):
    """
    Re-purposes the RDFa 1.1 processor class so that it only performs the
    extraction of embedded RDF (hturtle) content.
    """

    def __init__(self, options=None, base="", media_type=""):
        """
        Initialise the underlying pyRdfa processor, pinning the RDFa version
        to "1.1"; all other arguments are passed through unchanged.
        """
        pyRdfa.__init__(
            self,
            options=options,
            base=base,
            media_type=media_type,
            rdfa_version="1.1",
        )

    def graph_from_DOM(self, dom, graph, pgraph=None):
        """
        Replacement for the parsing function of the parent class that only
        looks for embedded RDF content.

        The element tree below ``dom.documentElement`` is visited top-down;
        ``handle_embeddedRDF`` is tried on each element and, where it reports
        success, that element's children are not visited.

        @param dom: DOM document whose ``documentElement`` is the start node
        @param graph: graph handed to ``handle_embeddedRDF`` as the target
        @keyword pgraph: when not None, receives a copy of the triples and
            namespace bindings of ``self.options.processor_graph.graph``
        """

        def _visit(element, target, context):
            # A truthy result means the content of this element has been
            # extracted into the graph, so its subtree is not descended into.
            if handle_embeddedRDF(element, target, context):
                return
            # Otherwise continue with every child that is itself an element.
            for child in element.childNodes:
                if child.nodeType == element.ELEMENT_NODE:
                    _visit(child, target, context)

        root = dom.documentElement
        context = ExecutionContext(
            root,
            graph,
            base=self.base,
            options=self.options,
            rdfa_version="1.1",
        )
        _visit(root, graph, context)

        if pgraph is None:
            return

        # Mirror the processor graph into the caller supplied graph: first the
        # triples, then the prefix/namespace bindings.
        processor_output = self.options.processor_graph.graph
        for triple in processor_output:
            pgraph.add(triple)
        for prefix, namespace in processor_output.namespaces():
            pgraph.bind(prefix, namespace)
# This is the parser interface as it would look when called from the rest of
# RDFLib
class HTurtleParser(Parser):
    """
    RDFLib ``Parser`` implementation that delegates to the ``HTurtle``
    processor to extract RDF embedded in the source document.
    """

    def parse(self, source, graph, pgraph=None, media_type=""):
        """
        @param source: one of the input sources that the RDFLib package defined
        @type source: InputSource class instance
        @param graph: target graph for the triples; output graph, in RDFa spec.
            parlance
        @type graph: RDFLib Graph
        @keyword pgraph: optional graph handed on to the processor as its
            ``pgraph`` argument; None (the default) means no such graph
        @type pgraph: RDFLib Graph
        @keyword media_type: explicit setting of the preferred media type
            (a.k.a. content type) of the the RDFa source. None means the content
            type of the HTTP result is used, or a guess is made based on the
            suffix of a file
        @type media_type: string
        @raise ImportError: if html5lib could not be imported
        """
        if html5lib is False:
            raise ImportError(
                'html5lib is not installed, cannot ' +
                'use RDFa and Microdata parsers.')

        (baseURI, orig_source) = _get_orig_source(source)
        # Arguments must follow the order of _process(); pgraph is passed by
        # keyword so that it cannot be mistaken for the base URI.
        self._process(
            graph, baseURI, orig_source,
            media_type=media_type, pgraph=pgraph)

    def _process(self, graph, baseURI, orig_source, media_type="",
                 pgraph=None):
        """
        Run the HTurtle processor on ``orig_source`` and fill ``graph``.

        @param graph: target graph for the extracted triples
        @param baseURI: passed to the processor as its ``base``
        @param orig_source: passed as first argument to the processor's
            ``graph_from_source``
        @keyword media_type: media type passed to the processor; None is
            treated as the empty string
        @keyword pgraph: optional graph forwarded to ``graph_from_source``
            as ``pgraph``; trailing and defaulted so that existing
            three/four argument callers keep working
        """
        self.options = Options(output_processor_graph=None,
                               embedded_rdf=True,
                               vocab_expansion=False,
                               vocab_cache=False)

        if media_type is None:
            media_type = ""
        processor = HTurtle(
            self.options, base=baseURI, media_type=media_type)
        processor.graph_from_source(
            orig_source, graph=graph, pgraph=pgraph, rdfOutput=False)
        # get possible error triples to raise exceptions
        _check_error(graph)