Source code for rdflib.plugins.parsers.pyRdfa.termorcurie

# -*- coding: utf-8 -*-
"""
Management of vocabularies, terms, and their mapping to URI-s. The main class of this module (L{TermOrCurie}) is,
conceptually, part of the overall state of processing at a node (L{state.ExecutionContext}) but putting it into a separate
module makes it easier to maintain.

@summary: Management of vocabularies, terms, and their mapping to URI-s.
@requires: U{RDFLib package<http://rdflib.net>}
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}

@var XHTML_PREFIX: prefix for the XHTML vocabulary URI (set to 'xhv')
@var XHTML_URI: URI prefix of the XHTML vocabulary
@var ncname: Regular expression object for NCNAME
@var termname: Regular expression object for a term
@var xml_application_media_type: Regular expression object for a general XML application media type
"""

"""
$Id: termorcurie.py,v 1.12 2013-10-16 11:48:54 ivan Exp $
$Date: 2013-10-16 11:48:54 $
"""

import re, sys
import xml.dom.minidom
import random

if sys.version_info[0] >= 3 :
	from urllib.parse import urlsplit
else :
	from urlparse import urlsplit


import rdflib
from rdflib	import URIRef
from rdflib	import Literal
from rdflib	import BNode
from rdflib	import Namespace
if rdflib.__version__ >= "3.0.0" :
	from rdflib	import Graph
	from rdflib	import RDF  as ns_rdf
	from rdflib	import RDFS as ns_rdfs
else :
	from rdflib.Graph	import Graph
	from rdflib.RDFS	import RDFSNS as ns_rdfs
	from rdflib.RDF		import RDFNS  as ns_rdf

from .options		import Options
from .utils 		import quote_URI, URIOpener
from .host 			import MediaTypes, HostLanguage, predefined_1_0_rel, warn_xmlns_usage
from .				import IncorrectPrefixDefinition, RDFA_VOCAB, UnresolvableReference, PrefixRedefinitionWarning
from .				import ns_rdfa

from . import err_redefining_URI_as_prefix
from . import err_xmlns_deprecated
from . import err_bnode_local_prefix
from . import err_col_local_prefix
from . import err_missing_URI_prefix
from . import err_invalid_prefix
from . import err_no_default_prefix
from . import err_prefix_and_xmlns
from . import err_non_ncname_prefix
from . import err_absolute_reference
from . import err_query_reference
from . import err_fragment_reference
from . import err_prefix_redefinition


# Regular expression object for NCNAME: an ASCII letter followed by letters,
# digits, '.', '_' or '-'.
ncname = re.compile(r"^[A-Za-z][A-Za-z0-9._-]*$")

# Regular expression object for term name: same as NCNAME, but '/' is also
# accepted after the first character.
termname = re.compile(r"^[A-Za-z]([A-Za-z0-9._-]|/)*$")

# Regular expression object for a general XML application media type.
# The pattern is a raw string because it contains the regex escape "\+";
# in a plain string literal that is an invalid escape sequence.
xml_application_media_type = re.compile(r"application/[a-zA-Z0-9]+\+xml")

# Prefix and URI of the XHTML vocabulary
XHTML_PREFIX = "xhv"
XHTML_URI = "http://www.w3.org/1999/xhtml/vocab#"

#### Managing blank nodes for CURIE-s: mapping from local names to blank nodes.
# Both objects are created once at import time and live at module level, so they
# are shared by every TermOrCurie instance.
# _bnodes: filled by TermOrCurie.CURIE_to_URI; maps the reference part of a
# "_:xxx" CURIE to the BNode that was created for it the first time it was seen.
_bnodes = {}
# The single BNode that TermOrCurie.CURIE_to_URI returns for a "_:" CURIE whose
# reference part is empty.
_empty_bnode = BNode()

####

class InitialContext:
    """
    Get the initial context values. In most cases this class has an empty content, except for the
    top level (in case of RDFa 1.1). Each L{TermOrCurie} class has one instance of this class. It provides
    initial mappings for terms, namespace prefixes, etc, that the top level L{TermOrCurie} instance uses
    for its own initialization.

    @ivar state: the state this context was created for
    @ivar terms: collection of all term mappings (term -> URIRef)
    @type terms: dictionary
    @ivar ns: namespace mapping; each value is a C{(Namespace, False)} tuple, the second item being a
    'has this prefix been used' flag that starts out as False
    @type ns: dictionary
    @ivar vocabulary: default vocabulary (None if there is none)
    @type vocabulary: string
    """

    def __init__(self, state, top_level):
        """
        @param state: the state behind this term mapping
        @type state: L{state.ExecutionContext}
        @param top_level: whether this is the top node of the DOM tree (the only place where initial
        contexts are handled)
        @type top_level: boolean
        """
        self.state = state

        # This is to store the local terms
        self.terms = {}
        # This is to store the local Namespaces (a.k.a. prefixes)
        self.ns = {}
        # Default vocabulary
        self.vocabulary = None

        # Initial contexts exist only for RDFa 1.1, and only at the top of the tree;
        # everywhere else the content stays empty.
        if state.rdfa_version < "1.1" or not top_level:
            return

        # Imported here, i.e., only when the data are really needed
        from .initialcontext import initial_context as context_data
        from .host import initial_contexts as context_ids
        from .host import default_vocabulary

        host_language = state.options.host_language
        for context_id in context_ids[host_language]:
            # This gives the data of an initial context that is valid for this host language
            data = context_data[context_id]

            # Merge the context data with the overall definition. A default vocabulary
            # registered for the host language takes precedence over the one of the context.
            if host_language in default_vocabulary:
                self.vocabulary = default_vocabulary[host_language]
            elif data.vocabulary != "":
                self.vocabulary = data.vocabulary

            for key in data.terms:
                self.terms[key] = URIRef(data.terms[key])
            for key in data.ns:
                # False: the prefix has not been used (hence not bound in the graph) yet
                self.ns[key] = (Namespace(data.ns[key]), False)
##################################################################################################################
class TermOrCurie:
    """
    Wrapper around vocabulary management, ie, mapping a term to a URI, as well as a CURIE to a URI. Each
    instance of this class belongs to a "state", instance of L{state.ExecutionContext}. Context definitions
    are managed at initialization time.

    (In fact, this class is, conceptually, part of the overall state at a node, and has been separated here
    for an easier maintenance.)

    The class takes care of the stack-like behavior of vocabulary items, ie, inheriting everything that is
    possible from the "parent". At initialization time, this works through the prefix definitions (i.e.,
    C{@prefix} or C{@xmlns:} attributes) and/or C{@vocab} attributes.

    @ivar state: State to which this instance belongs
    @type state: L{state.ExecutionContext}
    @ivar graph: The RDF Graph under generation
    @type graph: rdflib.Graph
    @ivar terms: mapping from terms to URI-s
    @type terms: dictionary
    @ivar ns: namespace declarations, ie, mapping from prefixes to URIs
    @type ns: dictionary
    @ivar xmlns: the subset of namespace declarations made through C{@xmlns:} attributes
    @type xmlns: dictionary
    @ivar default_prefixes: prefixes of the initial context; maps a prefix to a C{(Namespace, used)} tuple
    @type default_prefixes: dictionary
    @ivar default_curie_uri: URI for a default CURIE
    @ivar default_term_uri: URI used to resolve terms (set through C{@vocab} or the initial context), or None
    """

    def __init__(self, state, graph, inherited_state):
        """Initialize the vocab bound to a specific state.
        @param state: the state to which this vocab instance belongs to
        @type state: L{state.ExecutionContext}
        @param graph: the RDF graph being worked on
        @type graph: rdflib.Graph
        @param inherited_state: the state inherited by the current state. 'None' if this is the top level state.
        @type inherited_state: L{state.ExecutionContext}
        """
        self.state = state
        self.graph = graph

        top_level = inherited_state is None

        # --------------------------------------------------------------------------------
        # This is set to non-void only on the top level and in the case of 1.1
        default_vocab = InitialContext(self.state, top_level)
        parent = None if top_level else inherited_state.term_or_curie

        # Set the default CURIE URI
        if top_level:
            self.default_curie_uri = Namespace(XHTML_URI)
        else:
            self.default_curie_uri = parent.default_curie_uri

        # --------------------------------------------------------------------------------
        # Set the default term URI
        self._init_default_term_uri(default_vocab, parent)

        # --------------------------------------------------------------------------------
        # The simpler case: terms, adding those that have been defined by a possible initial context
        if not top_level:
            # just refer to the inherited terms
            self.terms = parent.terms
        elif state.rdfa_version >= "1.1":
            # Simply get the terms defined by the default vocabularies. There is no need for merging
            self.terms = dict(default_vocab.terms)
        else:
            # The terms are hardwired...
            self.terms = dict((key, URIRef(XHTML_URI + key)) for key in predefined_1_0_rel)

        # --------------------------------------------------------------------------------
        # The locally defined namespaces. Those coming from xmlns: are also kept
        # separately, they are necessary for correct XML Literal generation.
        xmlns_dict = self._collect_xmlns_prefixes()
        local_ns = dict(xmlns_dict)
        # The @prefix syntax may override the definitions made through @xmlns
        self._collect_prefix_attribute(local_ns, xmlns_dict)

        if top_level:
            self.default_prefixes = default_vocab.ns
            inherited_prefixes = {}
        else:
            self.default_prefixes = parent.default_prefixes
            inherited_prefixes = parent.ns

        # See if anything has been collected at all. If not, the namespaces of the incoming
        # state are taken over by reference. Otherwise they are copied to a local dictionary.
        if not local_ns:
            self.ns = inherited_prefixes
        else:
            self.ns = dict(inherited_prefixes)
            for key in local_ns:
                redefines_inherited = key in inherited_prefixes and local_ns[key] != inherited_prefixes[key]
                redefines_default = key in self.default_prefixes and local_ns[key] != self.default_prefixes[key][0]
                if redefines_inherited or redefines_default:
                    state.options.add_warning(err_prefix_redefinition % key, PrefixRedefinitionWarning, node=state.node.nodeName)
                self.ns[key] = local_ns[key]

        # the xmlns prefixes have to be stored separately, again for XML Literal generation
        if top_level:
            self.xmlns = xmlns_dict
        elif not xmlns_dict:
            # nothing new: share the dictionary of the parent
            self.xmlns = parent.xmlns
        else:
            self.xmlns = dict(parent.xmlns)
            self.xmlns.update(xmlns_dict)
    # end __init__

    def _check_prefix(self, pr):
        """Issue a warning if the prefix being defined is also a registered URI scheme.
        @param pr: the prefix (without the trailing colon)
        """
        from . import uri_schemes
        if pr in uri_schemes:
            # The prefix being defined is a registered URI scheme, better avoid it...
            self.state.options.add_warning(err_redefining_URI_as_prefix % pr, node=self.state.node.nodeName)

    def _init_default_term_uri(self, default_vocab, parent):
        """Set C{self.default_term_uri} from the parent, the initial context, and a possible local C{@vocab}.
        @param default_vocab: initial context of this instance
        @type default_vocab: L{InitialContext}
        @param parent: the L{TermOrCurie} instance of the inherited state, None at the top level
        """
        state = self.state

        # This is a 1.1 feature, ie, should be ignored if the version is < 1.1
        if not state.rdfa_version >= "1.1":
            self.default_term_uri = None
            return

        # that is the absolute default setup...
        self.default_term_uri = None if parent is None else parent.default_term_uri

        # see if the initial context has defined a default vocabulary:
        if default_vocab.vocabulary:
            self.default_term_uri = default_vocab.vocabulary

        # see if there is local vocab that would override previous settings
        # However, care should be taken with the vocab="" value that should not become a URI...
        # Indeed, this value is used to 'wipe out', ie, get back to the default vocabulary...
        node = state.node
        if node.hasAttribute("vocab") and node.getAttribute("vocab") == "":
            self.default_term_uri = default_vocab.vocabulary
        else:
            def_term_uri = state.getURI("vocab")
            if def_term_uri:
                self.default_term_uri = def_term_uri
                # the usage of the vocabulary is also recorded in the graph
                self.graph.add((URIRef(state.base), RDFA_VOCAB, URIRef(def_term_uri)))

    def _collect_xmlns_prefixes(self):
        """Collect the namespaces defined on the current node through the C{xmlns:} syntax.

        Each accepted prefix is also bound in the graph. A later attribute yielding the same
        (normalised) prefix overwrites an earlier one.
        @return: mapping from prefix to rdflib Namespace
        @rtype: dictionary
        """
        state = self.state
        xmlns_dict = {}
        attributes = state.node.attributes
        for i in range(attributes.length):
            attr = attributes.item(i)
            if not attr.name.startswith('xmlns:'):
                continue
            # yep, there is a namespace setting
            prefix = attr.localName
            if prefix == "":
                # exclude the top level xmlns setting...
                continue

            if state.rdfa_version >= "1.1" and state.options.host_language in warn_xmlns_usage:
                state.options.add_warning(err_xmlns_deprecated % prefix, IncorrectPrefixDefinition, node=state.node.nodeName)

            if prefix == "_":
                state.options.add_warning(err_bnode_local_prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
            elif ':' in prefix:
                state.options.add_warning(err_col_local_prefix % prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
            else:
                # quote the URI, ie, convert special characters into %.. This is
                # true, for example, for spaces
                ns = Namespace(quote_URI(attr.value, state.options))
                # prefixes are stored in lower case in RDFa 1.1
                pr = prefix.lower() if state.rdfa_version >= "1.1" else prefix
                xmlns_dict[pr] = ns
                self.graph.bind(pr, ns)
                self._check_prefix(pr)
        return xmlns_dict

    def _collect_prefix_attribute(self, local_ns, xmlns_dict):
        """Add the namespaces defined on the current node through the C{@prefix} syntax (RDFa 1.1 only).

        The attribute value is a whitespace separated list of 'prefix: URI' pairs. The pairs are
        processed from the last one to the first one, ie, if a prefix is defined several times,
        the leftmost definition is the one that remains.
        @param local_ns: prefix to Namespace mapping collected so far on this node; updated in place
        @type local_ns: dictionary
        @param xmlns_dict: the mappings defined via C{xmlns:}, used only to warn about conflicting definitions
        @type xmlns_dict: dictionary
        """
        state = self.state
        if not (state.rdfa_version >= "1.1" and state.node.hasAttribute("prefix")):
            return
        pr = state.node.getAttribute("prefix")
        if pr is None:
            return

        # separator character is whitespace
        pr_list = pr.strip().split()

        # An odd number of items means that the last prefix has no URI at all. It must be
        # dropped before pairing, otherwise every 'prefix: URI' pair would be shifted by one.
        if len(pr_list) % 2 == 1:
            state.options.add_warning(err_missing_URI_prefix % (pr_list[-1], pr), node=state.node.nodeName)
            pr_list = pr_list[:-1]

        for i in range(len(pr_list) - 2, -1, -2):
            prefix = pr_list[i]
            value = pr_list[i + 1]

            # see if the value of prefix is o.k., ie, there is a ':' at the end
            if prefix[-1] != ':':
                state.options.add_warning(err_invalid_prefix % (prefix, pr), IncorrectPrefixDefinition, node=state.node.nodeName)
                continue
            # the default (ie, empty) prefix cannot be defined this way
            if prefix == ":":
                state.options.add_warning(err_no_default_prefix % pr, IncorrectPrefixDefinition, node=state.node.nodeName)
                continue

            prefix = prefix[:-1]
            uri = Namespace(quote_URI(value, state.options))
            if prefix == "_":
                state.options.add_warning(err_bnode_local_prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
            elif ncname.match(prefix):
                real_prefix = prefix.lower()
                local_ns[real_prefix] = uri
                self.graph.bind(real_prefix, uri)
                # Additional warning: is this prefix overriding an existing xmlns statement with a different URI? If
                # so, that may lead to discrepancies between an RDFa 1.0 and RDFa 1.1 run...
                if (prefix in xmlns_dict and xmlns_dict[prefix] != uri) or (real_prefix in xmlns_dict and xmlns_dict[real_prefix] != uri):
                    state.options.add_warning(err_prefix_and_xmlns % (real_prefix, real_prefix), node=state.node.nodeName)
                self._check_prefix(real_prefix)
            else:
                # the prefix is not an NCNAME
                state.options.add_warning(err_non_ncname_prefix % (prefix, pr), IncorrectPrefixDefinition, node=state.node.nodeName)

    def _check_reference(self, val):
        """Checking the CURIE reference for correctness. It is probably not 100% foolproof, but may take care
        of some of the possible errors. See the URI RFC for the details.

        A warning is added to the options of the state for each problem found.
        @param val: the reference part of a CURIE
        @type val: string
        @return: whether the reference is acceptable
        @rtype: boolean
        """
        def char_check(s, not_allowed=('#', '[', ']')):
            return not any(c in s for c in not_allowed)

        # Creating an artificial http URI to fool the urlparse module...
        _, netloc, _, query, fragment = urlsplit('http:' + val)
        if netloc != "" and self.state.rdfa_version >= "1.1":
            # a reference with an authority part (ie, starting with '//') is refused in RDFa 1.1
            self.state.options.add_warning(err_absolute_reference % (netloc, val), UnresolvableReference, node=self.state.node.nodeName)
            return False
        if not char_check(query):
            self.state.options.add_warning(err_query_reference % (query, val), UnresolvableReference, node=self.state.node.nodeName)
            return False
        if not char_check(fragment):
            self.state.options.add_warning(err_fragment_reference % (fragment, val), UnresolvableReference, node=self.state.node.nodeName)
            return False
        return True

    def CURIE_to_URI(self, val):
        """CURIE to URI mapping.

        This method does I{not} take care of the last step of CURIE processing, ie, the fact that if
        it does not have a CURIE then the value is used a URI. This is done on the caller's side, because this has
        to be combined with base, for example. The method I{does} take care of BNode processing, though, ie,
        CURIE-s of the form "_:XXX".

        @param val: the full CURIE
        @type val: string
        @return: URIRef of a URI (a BNode for a "_:XXX" CURIE) or None.
        """
        # Just to be on the safe side:
        if val == "":
            return None
        if val == ":":
            if self.default_curie_uri:
                return URIRef(self.default_curie_uri)
            return None

        # See if this is indeed a valid CURIE, ie, it can be split by a colon
        curie_split = val.split(':', 1)
        if len(curie_split) == 1:
            # there is no ':' character in the string, ie, it is not a valid CURIE
            return None

        prefix, reference = curie_split
        if self.state.rdfa_version >= "1.1":
            # prefixes are stored in lower case in RDFa 1.1
            prefix = prefix.lower()

        # first possibility: empty prefix
        if len(prefix) == 0:
            if self.default_curie_uri and self._check_reference(reference):
                return self.default_curie_uri[reference]
            return None

        # prefix is non-empty; can be a bnode
        if prefix == "_":
            # yep, BNode processing. There is a difference whether the reference is empty or not...
            if len(reference) == 0:
                return _empty_bnode
            # see if this name has been used before for a BNode; create a new one if not
            if reference not in _bnodes:
                _bnodes[reference] = BNode()
            return _bnodes[reference]

        # check if the prefix is a valid NCNAME
        if not ncname.match(prefix):
            return None

        # see if there is a binding for this:
        if prefix in self.ns and self._check_reference(reference):
            # yep, a binding has been defined!
            if len(reference) == 0:
                return URIRef(str(self.ns[prefix]))
            return self.ns[prefix][reference]

        if prefix in self.default_prefixes and self._check_reference(reference):
            # this has been defined through the default context
            ns, used = self.default_prefixes[prefix]
            if len(reference) == 0:
                return URIRef(str(ns))
            # lazy binding of prefixes (to avoid unnecessary prefix definitions in the serializations at the end...)
            if not used:
                self.graph.bind(prefix, ns)
                self.default_prefixes[prefix] = (ns, True)
            return ns[reference]

        # no definition for this thing...
        return None
    # end CURIE_to_URI

    def term_to_URI(self, term):
        """A term to URI mapping, where term is a simple string and the corresponding
        URI is defined via the @vocab (ie, default term uri) mechanism. Returns None if term is not defined
        @param term: string
        @return: an RDFLib URIRef instance (or None)
        """
        if len(term) == 0:
            return None

        if not termname.match(term):
            # not a valid term name
            return None

        # First of all, a @vocab nukes everything. That has to be done first...
        if self.default_term_uri is not None:
            return URIRef(self.default_term_uri + term)

        # For default terms, the algorithm is (see 7.4.3 of the document): first make a case sensitive match;
        # if that fails then make a case insensitive one
        # 1. simple, case sensitive test:
        if term in self.terms:
            # yep, term is a valid key as is
            # lazy binding of the xhv prefix for terms...
            self.graph.bind(XHTML_PREFIX, XHTML_URI)
            return self.terms[term]

        # 2. case insensitive test
        lowered = term.lower()
        for defined_term in self.terms:
            if lowered == defined_term.lower():
                # lazy binding of the xhv prefix for terms...
                self.graph.bind(XHTML_PREFIX, XHTML_URI)
                return self.terms[defined_term]

        # If it got here, it is all wrong...
        return None