# -*- coding: utf-8 -*-
"""
Extraction parser RDF embedded verbatim into HTML or XML files. This is based
on:
* The specification on embedding turtle into html:
http://www.w3.org/TR/turtle/#in-html
For SVG (and currently SVG only) the method also extracts an embedded RDF/XML
data, per SVG specification
License: W3C Software License,
http://www.w3.org/Consortium/Legal/copyright-software
Author: Ivan Herman
Copyright: W3C
"""
from rdflib.parser import Parser
from .pyRdfa import pyRdfa, Options
from .pyRdfa.state import ExecutionContext
from .pyRdfa.embeddedRDF import handle_embeddedRDF
from .structureddata import _get_orig_source, _check_error
try:
import html5lib
assert html5lib
html5lib = True
except ImportError:
import warnings
warnings.warn(
'html5lib not found! RDFa and Microdata parsers ' +
'will not be available.')
html5lib = False
[docs]class HTurtle(pyRdfa):
"""
Bastardizing the RDFa 1.1 parser to do a hturtle extractions
"""
[docs] def __init__(self, options=None, base="", media_type=""):
pyRdfa.__init__(self, options=options, base=base,
media_type=media_type, rdfa_version="1.1")
[docs] def graph_from_DOM(self, dom, graph, pgraph=None):
"""
Stealing the parsing function from the original class, to do
turtle extraction only
"""
def copyGraph(tog, fromg):
for t in fromg:
tog.add(t)
for k, ns in fromg.namespaces():
tog.bind(k, ns)
def _process_one_node(node, graph, state):
if handle_embeddedRDF(node, graph, state):
# we got an RDF content that has been extracted into Graph;
# the recursion should stop
return
else:
# recurse through all the child elements of the current node
for n in node.childNodes:
if n.nodeType == node.ELEMENT_NODE:
_process_one_node(n, graph, state)
topElement = dom.documentElement
state = ExecutionContext(topElement, graph, base=self.base,
options=self.options, rdfa_version="1.1")
_process_one_node(topElement, graph, state)
if pgraph is not None:
copyGraph(pgraph, self.options.processor_graph.graph)
# This is the parser interface as it would look when called from the rest of
# RDFLib
[docs]class HTurtleParser(Parser):
[docs] def parse(self, source, graph, pgraph=None, media_type=""):
"""
@param source: one of the input sources that the RDFLib package defined
@type source: InputSource class instance
@param graph: target graph for the triples; output graph, in RDFa spec.
parlance
@type graph: RDFLib Graph
@keyword media_type: explicit setting of the preferred media type
(a.k.a. content type) of the the RDFa source. None means the content
type of the HTTP result is used, or a guess is made based on the
suffix of a file
@type media_type: string
"""
if html5lib is False:
raise ImportError(
'html5lib is not installed, cannot ' +
'use RDFa and Microdata parsers.')
(baseURI, orig_source) = _get_orig_source(source)
self._process(
graph, pgraph, baseURI, orig_source, media_type=media_type)
def _process(self, graph, baseURI, orig_source, media_type=""):
self.options = Options(output_processor_graph=None,
embedded_rdf=True,
vocab_expansion=False,
vocab_cache=False)
if media_type is None:
media_type = ""
processor = HTurtle(
self.options, base=baseURI, media_type=media_type)
processor.graph_from_source(
orig_source, graph=graph, pgraph=None, rdfOutput=False)
# get possible error triples to raise exceptions
_check_error(graph)