# Source code for rdflib.plugins.parsers.pyRdfa.termorcurie

# -*- coding: utf-8 -*-
"""
Management of vocabularies, terms, and their mapping to URI-s. The main class of this module (L{TermOrCurie}) is,
conceptually, part of the overall state of processing at a node (L{state.ExecutionContext}) but putting it into a separate
module makes it easier to maintain.

@summary: Management of vocabularies, terms, and their mapping to URI-s.
@requires: U{RDFLib package<http://rdflib.net>}
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}

@var XHTML_PREFIX: prefix for the XHTML vocabulary URI (set to 'xhv')
@var XHTML_URI: URI prefix of the XHTML vocabulary
@var ncname: Regular expression object for NCNAME
@var termname: Regular expression object for a term
@var xml_application_media_type: Regular expression object for a general XML application media type
"""

"""
$Id: termorcurie.py,v 1.12 2013-10-16 11:48:54 ivan Exp $
$Date: 2013-10-16 11:48:54 $
"""

import re, sys
import xml.dom.minidom
import random

if sys.version_info[0] >= 3 :
	from urllib.parse import urlsplit
else :
	from urlparse import urlsplit


import rdflib
from rdflib	import URIRef
from rdflib	import Literal
from rdflib	import BNode
from rdflib	import Namespace
if rdflib.__version__ >= "3.0.0" :
	from rdflib	import Graph
	from rdflib	import RDF  as ns_rdf
	from rdflib	import RDFS as ns_rdfs
else :
	from rdflib.Graph	import Graph
	from rdflib.RDFS	import RDFSNS as ns_rdfs
	from rdflib.RDF		import RDFNS  as ns_rdf

from .options		import Options
from .utils 		import quote_URI, URIOpener
from .host 			import MediaTypes, HostLanguage, predefined_1_0_rel, warn_xmlns_usage
from .				import IncorrectPrefixDefinition, RDFA_VOCAB, UnresolvableReference, PrefixRedefinitionWarning
from .				import ns_rdfa

from . import err_redefining_URI_as_prefix
from . import err_xmlns_deprecated
from . import err_bnode_local_prefix
from . import err_col_local_prefix
from . import err_missing_URI_prefix
from . import err_invalid_prefix
from . import err_no_default_prefix
from . import err_prefix_and_xmlns
from . import err_non_ncname_prefix
from . import err_absolute_reference
from . import err_query_reference
from . import err_fragment_reference
from . import err_prefix_redefinition


# Regular expression object for NCNAME: an ASCII letter followed by any number of
# ASCII letters, digits, '.', '_' or '-'
ncname   = re.compile(r"^[A-Za-z][A-Za-z0-9._-]*$")

# Regular expression object for term name: same as an NCNAME, except that '/' is
# also accepted after the first character
termname = re.compile(r"^[A-Za-z]([A-Za-z0-9._-]|/)*$")

# Regular expression object for a general XML application media type.
# A raw string is used so that '\+' reaches the regex engine as an escaped plus sign
# instead of being an invalid string escape sequence.
xml_application_media_type = re.compile(r"application/[a-zA-Z0-9]+\+xml")

# Prefix and URI of the XHTML vocabulary
XHTML_PREFIX = "xhv"
XHTML_URI    = "http://www.w3.org/1999/xhtml/vocab#"

#### Managing blank nodes for CURIE-s: mapping from local names to blank nodes.
# Maps the reference part of a "_:xxx" CURIE to the BNode created for it, so that the same
# local name always yields the same blank node.
# NOTE(review): this is module-level state and nothing in this module clears it, so the
# mapping appears to be shared by every TermOrCurie instance in the process -- confirm
# whether it is reset elsewhere between documents.
_bnodes = {}
# The blank node returned for the CURIE "_:" (ie, a blank node CURIE with an empty reference)
_empty_bnode = BNode()

####

class InitialContext :
	"""
	Get the initial context values. In most cases this class has an empty content, except for the
	top level (in case of RDFa 1.1). Each L{TermOrCurie} class has one instance of this class. It provides initial
	mappings for terms, namespace prefixes, etc, that the top level L{TermOrCurie} instance uses for its own initialization.

	@ivar state: the state this initial context was created for
	@ivar terms: collection of all term mappings (term to URIRef)
	@type terms: dictionary
	@ivar ns: namespace mapping; each value is a C{(Namespace, flag)} tuple whose flag is initially C{False}
	@type ns: dictionary
	@ivar vocabulary: default vocabulary (C{None} if there is none)
	@type vocabulary: string
	"""

	def __init__(self, state, top_level) :
		"""
		Collect the terms, prefixes and default vocabulary of the initial contexts registered for the
		host language of C{state}. Everything stays empty for RDFa versions before 1.1 and for any node
		that is not the top level one.

		@param state: the state behind this term mapping
		@type state: L{state.ExecutionContext}
		@param top_level : whether this is the top node of the DOM tree (the only place where initial contexts are handled)
		@type top_level : boolean
		"""
		self.state = state

		# This is to store the local terms
		self.terms  = {}
		# This is to store the local Namespaces (a.k.a. prefixes)
		self.ns     = {}
		# Default vocabulary
		self.vocabulary = None

		# Initial contexts are an RDFa 1.1 feature and are handled at the top of the tree only
		if state.rdfa_version < "1.1" or not top_level :
			return

		# Imported here rather than at module level, so these are only loaded when they are needed
		from .initialcontext	import initial_context    as context_data
		from .host 				import initial_contexts   as context_ids
		from .host				import default_vocabulary

		host_language = state.options.host_language
		for context_id in context_ids[host_language] :
			# This gives the data of an initial context, valid for this media type:
			data = context_data[context_id]

			# Merge the context data with the overall definition; a default vocabulary registered
			# for the host language takes precedence over the one of the context itself
			if host_language in default_vocabulary :
				self.vocabulary = default_vocabulary[host_language]
			elif data.vocabulary != "" :
				self.vocabulary = data.vocabulary

			for key in data.terms :
				self.terms[key] = URIRef(data.terms[key])
			for key in data.ns :
				# The second item is set to False at creation time
				self.ns[key] = (Namespace(data.ns[key]),False)


##################################################################################################################

class TermOrCurie :
	"""
	Wrapper around vocabulary management, ie, mapping a term to a URI, as well as a CURIE to a URI. Each instance of this class belongs to a
	"state", instance of L{state.ExecutionContext}. Context definitions are managed at initialization time.

	(In fact, this class is, conceptually, part of the overall state at a node, and has been separated here for an
	easier maintenance.)

	The class takes care of the stack-like behavior of vocabulary items, ie, inheriting everything that is possible
	from the "parent". At initialization time, this works through the prefix definitions (i.e., C{@prefix} or C{@xmlns:} attributes)
	and/or C{@vocab} attributes.

	@ivar state: State to which this instance belongs
	@type state: L{state.ExecutionContext}
	@ivar graph: The RDF Graph under generation
	@type graph: rdflib.Graph
	@ivar terms: mapping from terms to URI-s
	@type terms: dictionary
	@ivar ns: namespace declarations, ie, mapping from prefixes to URIs
	@type ns: dictionary
	@ivar default_curie_uri: URI for a default CURIE
	"""
[docs]	def __init__(self, state, graph, inherited_state) :
		"""Initialize the vocab bound to a specific state.
		@param state: the state to which this vocab instance belongs to
		@type state: L{state.ExecutionContext}
		@param graph: the RDF graph being worked on
		@type graph: rdflib.Graph
		@param inherited_state: the state inherited by the current state. 'None' if this is the top level state.
		@type inherited_state: L{state.ExecutionContext}
		"""
		def check_prefix(pr) :
			from . import uri_schemes
			if pr in uri_schemes :
				# The prefix being defined is a registered URI scheme, better avoid it...
				state.options.add_warning(err_redefining_URI_as_prefix % pr, node=state.node.nodeName)

		self.state	= state
		self.graph	= graph

		# --------------------------------------------------------------------------------
		# This is set to non-void only on the top level and in the case of 1.1
		default_vocab = InitialContext(self.state, inherited_state == None)

		# Set the default CURIE URI
		if inherited_state == None :
			# This is the top level...
			self.default_curie_uri = Namespace(XHTML_URI)
			# self.graph.bind(XHTML_PREFIX, self.default_curie_uri)
		else :
			self.default_curie_uri = inherited_state.term_or_curie.default_curie_uri

		# --------------------------------------------------------------------------------
		# Set the default term URI
		# This is a 1.1 feature, ie, should be ignored if the version is < 1.0
		if state.rdfa_version >= "1.1" :
			# that is the absolute default setup...
			if inherited_state == None :
				self.default_term_uri = None
			else :
				self.default_term_uri = inherited_state.term_or_curie.default_term_uri

			# see if the initial context has defined a default vocabulary:
			if default_vocab.vocabulary :
				self.default_term_uri = default_vocab.vocabulary

			# see if there is local vocab that would override previous settings
			# However, care should be taken with the vocab="" value that should not become a URI...
			# Indeed, this value is used to 'vipe out', ie, get back to the default vocabulary...
			if self.state.node.hasAttribute("vocab") and self.state.node.getAttribute("vocab") == "" :
				self.default_term_uri = default_vocab.vocabulary
			else :
				def_term_uri = self.state.getURI("vocab")
				if def_term_uri and def_term_uri != "" :
					self.default_term_uri = def_term_uri
					self.graph.add((URIRef(self.state.base),RDFA_VOCAB,URIRef(def_term_uri)))
		else :
			self.default_term_uri = None

		# --------------------------------------------------------------------------------
		# The simpler case: terms, adding those that have been defined by a possible initial context
		if inherited_state is None :
			# this is the vocabulary belonging to the top level of the tree!
			self.terms = {}
			if state.rdfa_version >= "1.1" :
				# Simply get the terms defined by the default vocabularies. There is no need for merging
				for key in default_vocab.terms :
					self.terms[key] = default_vocab.terms[key]
			else :
				# The terms are hardwired...
				for key in predefined_1_0_rel :
					self.terms[key] = URIRef(XHTML_URI + key)
		else :
			# just refer to the inherited terms
			self.terms = inherited_state.term_or_curie.terms

		#-----------------------------------------------------------------
		# the locally defined namespaces
		dict = {}
		# locally defined xmlns namespaces, necessary for correct XML Literal generation
		xmlns_dict = {}

		# Add the locally defined namespaces using the xmlns: syntax
		for i in range(0, state.node.attributes.length) :
			attr = state.node.attributes.item(i)
			if attr.name.find('xmlns:') == 0 :
				# yep, there is a namespace setting
				prefix = attr.localName
				if prefix != "" : # exclude the top level xmlns setting...
					if state.rdfa_version >= "1.1" and state.options.host_language in warn_xmlns_usage :
						state.options.add_warning(err_xmlns_deprecated % prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
					if prefix == "_" :
						state.options.add_warning(err_bnode_local_prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
					elif prefix.find(':') != -1 :
						state.options.add_warning(err_col_local_prefix % prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
					else :
						# quote the URI, ie, convert special characters into %.. This is
						# true, for example, for spaces
						uri = quote_URI(attr.value, state.options)
						# create a new RDFLib Namespace entry
						ns = Namespace(uri)
						# Add an entry to the dictionary if not already there (priority is left to right!)
						if state.rdfa_version >= "1.1" :
							pr = prefix.lower()
						else :
							pr = prefix
						dict[pr]       = ns
						xmlns_dict[pr] = ns
						self.graph.bind(pr,ns)
						check_prefix(pr)

		# Add the locally defined namespaces using the @prefix syntax
		# this may override the definition @xmlns
		if state.rdfa_version >= "1.1" and state.node.hasAttribute("prefix") :
			pr = state.node.getAttribute("prefix")
			if pr != None :
				# separator character is whitespace
				pr_list = pr.strip().split()
				# range(0, len(pr_list), 2)
				for i in range(len(pr_list) - 2, -1, -2) :
					prefix = pr_list[i]
					# see if there is a URI at all
					if i == len(pr_list) - 1 :
						state.options.add_warning(err_missing_URI_prefix % (prefix,pr), node=state.node.nodeName)
						break
					else :
						value = pr_list[i+1]

					# see if the value of prefix is o.k., ie, there is a ':' at the end
					if prefix[-1] != ':' :
						state.options.add_warning(err_invalid_prefix % (prefix,pr), IncorrectPrefixDefinition, node=state.node.nodeName)
						continue
					elif prefix == ":" :
						state.options.add_warning(err_no_default_prefix % pr, IncorrectPrefixDefinition, node=state.node.nodeName)
						continue
					else :
						prefix = prefix[:-1]
						uri    = Namespace(quote_URI(value, state.options))
						if prefix == "" :
							#something to be done here
							self.default_curie_uri = uri
						elif prefix == "_" :
							state.options.add_warning(err_bnode_local_prefix, IncorrectPrefixDefinition, node=state.node.nodeName)
						else :
							# last check: is the prefix an NCNAME?
							if ncname.match(prefix) :
								real_prefix = prefix.lower()
								dict[real_prefix] = uri
								self.graph.bind(real_prefix,uri)
								# Additional warning: is this prefix overriding an existing xmlns statement with a different URI? if
								# so, that may lead to discrepancies between an RDFa 1.0 and RDFa 1.1 run...
								if (prefix in xmlns_dict and xmlns_dict[prefix] != uri) or (real_prefix in xmlns_dict and xmlns_dict[real_prefix] != uri) :
									state.options.add_warning(err_prefix_and_xmlns % (real_prefix,real_prefix), node=state.node.nodeName)
								check_prefix(real_prefix)

							else :
								state.options.add_warning(err_non_ncname_prefix % (prefix,pr), IncorrectPrefixDefinition, node=state.node.nodeName)

		# See if anything has been collected at all.
		# If not, the namespaces of the incoming state is
		# taken over by reference. Otherwise that is copied to the
		# the local dictionary
		if inherited_state == None :
			self.default_prefixes = default_vocab.ns
			inherited_prefixes    = {}
		else :
			self.default_prefixes = inherited_state.term_or_curie.default_prefixes
			inherited_prefixes    = inherited_state.term_or_curie.ns

		if len(dict) == 0 :
			self.ns = inherited_prefixes
		else :
			self.ns = {}
			for key in inherited_prefixes : self.ns[key] = inherited_prefixes[key]
			for key in dict :
				if (key in inherited_prefixes and dict[key] != inherited_prefixes[key]) or (key in self.default_prefixes and dict[key] != self.default_prefixes[key][0]) :
					state.options.add_warning(err_prefix_redefinition % key, PrefixRedefinitionWarning, node=state.node.nodeName)
				self.ns[key] = dict[key]


		# the xmlns prefixes have to be stored separately, again for XML Literal generation
		self.xmlns = {}
		if len(xmlns_dict) == 0 and inherited_state :
			self.xmlns = inherited_state.term_or_curie.xmlns
		else :
			if inherited_state :
				for key in inherited_state.term_or_curie.xmlns	: self.xmlns[key] = inherited_state.term_or_curie.xmlns[key]
				for key in xmlns_dict							: self.xmlns[key] = xmlns_dict[key]
			else :
				self.xmlns = xmlns_dict
	# end __init__

	def _check_reference(self, val) :
		"""Checking the CURIE reference for correctness. It is probably not 100% foolproof, but may take care
		of some of the possible errors. See the URI RFC for the details.
		"""
		def char_check(s, not_allowed = ['#','[',']']) :
			for c in not_allowed :
				if s.find(c) != -1 : return False
			return True
		# Creating an artificial http URI to fool the urlparse module...
		scheme, netloc, url, query, fragment = urlsplit('http:' + val)
		if netloc != "" and self.state.rdfa_version >= "1.1" :
			self.state.options.add_warning(err_absolute_reference % (netloc, val), UnresolvableReference, node=self.state.node.nodeName)
			return False
		elif not char_check(query) :
			self.state.options.add_warning(err_query_reference % (query, val), UnresolvableReference, node=self.state.node.nodeName)
			return False
		elif not char_check(fragment) :
			self.state.options.add_warning(err_fragment_reference % (fragment, val), UnresolvableReference, node=self.state.node.nodeName)
			return False
		else :
			return True

[docs]	def CURIE_to_URI(self, val) :
		"""CURIE to URI mapping.

		This method does I{not} take care of the last step of CURIE processing, ie, the fact that if
		it does not have a CURIE then the value is used a URI. This is done on the caller's side, because this has
		to be combined with base, for example. The method I{does} take care of BNode processing, though, ie,
		CURIE-s of the form "_:XXX".

		@param val: the full CURIE
		@type val: string
		@return: URIRef of a URI or None.
		"""
		# Just to be on the safe side:
		if val == "" :
			return None
		elif val == ":" :
			if self.default_curie_uri :
				return URIRef(self.default_curie_uri)
			else :
				return None

		# See if this is indeed a valid CURIE, ie, it can be split by a colon
		curie_split = val.split(':',1)
		if len(curie_split) == 1 :
			# there is no ':' character in the string, ie, it is not a valid CURIE
			return None
		else :
			if self.state.rdfa_version >= "1.1" :
				prefix	= curie_split[0].lower()
			else :
				prefix	= curie_split[0]
			reference = curie_split[1]

			#if len(reference) > 0 :
			#	if self.state.rdfa_version >= "1.1" and (len(prefix) == 0 or prefix in self.ns) and reference.startswith('//') :
			#		# This has been defined as illegal in RDFa 1.1
			#		self.state.options.add_warning(err_absolute_reference % (reference, val), UnresolvableReference, node=self.state.node.nodeName)
			#		return None
			#	if reference[0] == ":" :
			#		return None

			# first possibility: empty prefix
			if len(prefix) == 0 :
				if self.default_curie_uri and self._check_reference(reference) :
					return self.default_curie_uri[reference]
				else :
					return None
			else :
				# prefix is non-empty; can be a bnode
				if prefix == "_" :
					# yep, BNode processing. There is a difference whether the reference is empty or not...
					if len(reference) == 0 :
						return _empty_bnode
					else :
						# see if this variable has been used before for a BNode
						if reference in _bnodes :
							return _bnodes[reference]
						else :
							# a new bnode...
							retval = BNode()
							_bnodes[reference] = retval
							return retval
				# check if the prefix is a valid NCNAME
				elif ncname.match(prefix) :
					# see if there is a binding for this:
					if prefix in self.ns and self._check_reference(reference) :
						# yep, a binding has been defined!
						if len(reference) == 0 :
							return URIRef(str(self.ns[prefix]))
						else :
							return self.ns[prefix][reference]
					elif prefix in self.default_prefixes and self._check_reference(reference) :
						# this has been defined through the default context
						if len(reference) == 0 :
							return URIRef(str(self.default_prefixes[prefix][0]))
						else :
							(ns,used) = self.default_prefixes[prefix]
							# lazy binding of prefixes (to avoid unnecessary prefix definitions in the serializations at the end...)
							if not used :
								self.graph.bind(prefix,ns)
								self.default_prefixes[prefix] = (ns,True)
							return ns[reference]
					else :
						# no definition for this thing...
						return None
				else :
					return None
	# end CURIE_to_URI

[docs]	def term_to_URI(self, term) :
		"""A term to URI mapping, where term is a simple string and the corresponding
		URI is defined via the @vocab (ie, default term uri) mechanism. Returns None if term is not defined
		@param term: string
		@return: an RDFLib URIRef instance (or None)
		"""
		if len(term) == 0 : return None

		if termname.match(term) :
			# It is a valid NCNAME

			# First of all, a @vocab nukes everything. That has to be done first...
			if self.default_term_uri != None :
				return URIRef(self.default_term_uri + term)

			# For default terms, the algorithm is (see 7.4.3 of the document): first make a case sensitive match;
			# if that fails than make a case insensive one
			# 1. simple, case sensitive test:
			if term in self.terms :
				# yep, term is a valid key as is
				# lazy binding of the xhv prefix for terms...
				self.graph.bind(XHTML_PREFIX, XHTML_URI)
				return self.terms[term]

			# 2. case insensitive test
			for defined_term in self.terms :
				if term.lower() == defined_term.lower() :
					# lazy binding of the xhv prefix for terms...
					self.graph.bind(XHTML_PREFIX, XHTML_URI)
					return self.terms[defined_term]

		# If it got here, it is all wrong...
		return None