Source code for rdflib.plugins.parsers.pyRdfa.host.html5

# -*- coding: utf-8 -*-
"""
Simple transfomer for HTML5: add a @src for any @data, add a @content for the @value attribute of the <data> element, and interpret the <time> element.

@summary: Add a top "about" to <head> and <body>
@requires: U{RDFLib package<http://rdflib.net>}
@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<a href="http://www.w3.org/People/Ivan/">}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<href="http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231">}
@contact: Ivan Herman, ivan@w3.org
"""

"""
$Id: html5.py,v 1.13 2013-02-01 10:53:48 ivan Exp $
$Date: 2013-02-01 10:53:48 $
"""
try :
	from functools import reduce
except :
	# Not important. This import is necessary in Python 3 only and the newer versions of Python 2.X it is there
	# for a forward compatibility with Python 3
	pass

# The handling of datatime is a little bit more complex... better put this in a separate function for a better management
from datetime import datetime
import re
datetime_type   = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type 	    = "http://www.w3.org/2001/XMLSchema#time"
date_type       = "http://www.w3.org/2001/XMLSchema#date"
date_gYear      = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay  = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type   = "http://www.w3.org/2001/XMLSchema#duration"
plain           = "plain"

handled_time_types = [ datetime_type, time_type, date_type, date_gYear, date_gYearMonth, date_gMonthDay, duration_type ]

_formats = {
	date_gMonthDay	  : [ "%m-%d" ],
	date_gYearMonth	  : [ "%Y-%m"],
	date_gYear     	  : [ "%Y" ],
	date_type      	  : [ "%Y-%m-%d", "%Y-%m-%dZ" ],
	time_type      	  : [ "%H:%M",
					      "%H:%M:%S",
					      "%H:%M:%SZ",
					      "%H:%M:%S.%f" ],
	datetime_type  	  : [ "%Y-%m-%dT%H:%M",
					      "%Y-%m-%dT%H:%M:%S",
					      "%Y-%m-%dT%H:%M:%S.%f",
					      "%Y-%m-%dT%H:%MZ",
					      "%Y-%m-%dT%H:%M:%SZ",
					      "%Y-%m-%dT%H:%M:%S.%fZ" ],
	duration_type     : [ "P%dD",
						  "P%YY%mM%dD",
						  "P%YY%mM",
						  "P%YY%dD",
						  "P%YY",
						  "P%mM",
						  "P%mM%dD",
						 ],
}

_dur_times = [ "%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS" ]

def _format_test(string) :
	"""
	Tests the string format to see whether it fits one of the time datatypes
	@param string: attribute value to test
	@return: a URI for the xsd datatype or the string 'plain'
	"""
	# Try to get the easy cases:
	for key in _formats :
		for format in _formats[key] :
			try :
				# try to check if the syntax is fine
				d = datetime.strptime(string, format)
				# bingo!
				return key
			except ValueError :
				pass

	# Now come the special cases:-(
	# Check first for the duration stuff, that is the nastiest.
	if len(string) > 2 and (string[0] == 'P' or (string[0] == '-' and string[1] == 'P')) :
		# this is meant to be a duration type
		# first of all, get rid of the leading '-' and check again
		if string[0] == '-' :
			for format in _formats[duration_type] :
				try :
					# try to check if the syntax is fine
					d = datetime.strptime(string, format)
					# bingo!
					return duration_type
				except ValueError :
					pass
		# Let us see if the value contains a separate time portion, and cut that one
		durs = string.split('T')
		if len(durs) == 2 :
			# yep, so we should check again
			dur = durs[0]
			tm  = durs[1]
			# Check the duration part
			td = False
			for format in _formats[duration_type] :
				try :
					# try to check if the syntax is fine
					d = datetime.strptime(dur, format)
					# bingo!
					td = True
					break
				except ValueError :
					pass
			if td == True :
				# Getting there...
				for format in _dur_times :
					try :
						# try to check if the syntax is fine
						d = datetime.strptime(tm, format)
						# bingo!
						return duration_type
					except ValueError :
						pass
			# something went wrong...
			return plain
		else :
			# Well, no more tricks, this is a plain type
			return plain


	# If we got here, we should check the time zone
	# there is a discrepancy betwen the python and the HTML5/XSD lexical string,
	# which means that this has to handled separately for the date and the timezone portion
	try :
		# The time-zone-less portion of the string
		str = string[0:-6]
		# The time-zone portion
		tz = string[-5:]
		try :
			t = datetime.strptime(tz,"%H:%M")
		except ValueError :
			# Bummer, this is not a correct time
			return plain
		# The time-zone is fine, the datetime portion has to be checked
		for format in _formats[datetime_type] :
			try :
				# try to check if it is fine
				d = datetime.strptime(str, format)
				# Bingo!
				return datetime_type
			except ValueError :
				pass
	except :
		pass
	return plain

[docs]def html5_extra_attributes(node, state) : """ @param node: the current node that could be modified @param state: current state @type state: L{Execution context<pyRdfa.state.ExecutionContext>} """ def _get_literal(Pnode): """ Get (recursively) the full text from a DOM Node. @param Pnode: DOM Node @return: string """ rc = "" for node in Pnode.childNodes: if node.nodeType == node.TEXT_NODE: rc = rc + node.data elif node.nodeType == node.ELEMENT_NODE : rc = rc + _get_literal(node) if state.options.space_preserve : return rc else : return re.sub(r'(\r| |\n|\t)+'," ",rc).strip() #return re.sub(r'(\r| |\n|\t)+',"",rc).strip() # end _getLiteral def _set_time(value) : if not node.hasAttribute("datatype") : # Check the datatype: dt = _format_test(value) if dt != plain : node.setAttribute("datatype",dt) # Finally, set the value itself node.setAttribute("content",value) # end _set_time if not node.hasAttribute("content") : # @content has top priority over the others... if node.hasAttribute("datetime") : _set_time( node.getAttribute("datetime") ) elif node.hasAttribute("dateTime") : _set_time( node.getAttribute("dateTime") ) elif node.tagName == "time" : # Note that a possible @datetime value has already been taken care of _set_time( _get_literal(node) )
[docs]def remove_rel(node, state): """ If @property and @rel/@rev are on the same element, then only CURIE and URI can appear as a rel/rev value. @param node: the current node that could be modified @param state: current state @type state: L{Execution context<pyRdfa.state.ExecutionContext>} """ from ..termorcurie import termname def _massage_node(node,attr) : """The real work for remove_rel is done here, parametrized with @rel and @rev""" if node.hasAttribute("property") and node.hasAttribute(attr) : vals = node.getAttribute(attr).strip().split() if len(vals) != 0 : final_vals = [ v for v in vals if not termname.match(v) ] if len(final_vals) == 0 : node.removeAttribute(attr) else : node.setAttribute(attr, reduce(lambda x,y: x+' '+y,final_vals)) _massage_node(node, "rev") _massage_node(node, "rel")