# Source code for rdflib.plugins.parsers.pyMicrodata.utils

# -*- coding: utf-8 -*-
"""
Various utilities for pyMicrodata

@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
"""

"""
$Id: utils.py,v 1.7 2012/09/01 15:17:28 ivan Exp $
$Date: 2012/09/01 15:17:28 $
"""
import os, os.path, sys
(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info

if py_v_major >= 3 :
	from urllib.request import Request, urlopen
	from urllib.parse   import urljoin, quote, urlparse
	from http.server    import BaseHTTPRequestHandler
	from urllib.error   import HTTPError as urllib_HTTPError
else :
	from urllib2        import Request, urlopen
	from urllib2        import HTTPError as urllib_HTTPError
	from urlparse       import urljoin, urlparse
	from urllib         import quote
	from BaseHTTPServer import BaseHTTPRequestHandler

import re
from datetime import datetime

from rdflib	import BNode
import rdflib
if rdflib.__version__ >= "3.0.0" :
	from rdflib	import RDF as ns_rdf
else :
	from rdflib.RDF	import RDFNS  as ns_rdf

#################################################################################
def is_absolute_URI( uri ) :
	"""
	Tell whether a URI reference is absolute, ie, whether it carries a scheme.
	@param uri: the URI (reference) to check
	@return: True if C{urlparse} finds a non-empty scheme in C{uri}, False otherwise
	"""
	return urlparse(uri).scheme != ""

#################################################################################

def fragment_escape( name ) :
	"""
	Percent-encode a string so that it can be used as part of a URI.
	On top of the characters that C{quote} never escapes, the characters
	'/', '~', ':', '-' and '.' are left untouched.
	@param name: the string to escape
	@return: the escaped string
	"""
	return quote(name, '/~:-.')

#################################################################################

def generate_URI(base, v) :
	"""
	Generate an (absolute) URI; if v is a relative reference, then it is resolved against base,
	otherwise v is returned unchanged.
	@param base: Absolute URI for base
	@param v: relative or absolute URI
	@return: the resulting URI
	"""
	if is_absolute_URI( v ) :
		return v

	# UGLY!!! There is a bug for a corner case in python version <= 2.5.X
	if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5) :
		return base + v
	####

	# Trust the python library...
	# Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
	# swallows the '#' or '?' character at the end. This is clearly a problem with
	# Semantic Web URI-s
	#
	# NOTE(review): fragment_escape() does not list '#' or '?' among the characters it leaves
	# alone, so after the call below v presumably never ends with either of them -- confirm
	# whether the check was meant to look at the unescaped value.
	v = fragment_escape(v.strip())
	joined = urljoin(base, v)

	# An empty v or an empty result has no last character to compare; nothing to restore then
	if len(v) > 0 and len(joined) > 0 and v[-1] in ("#", "?") and v[-1] != joined[-1] :
		return joined + v[-1]
	return joined

#################################################################################
def generate_RDF_collection( graph, vals ) :
	"""
	Generate an RDF List from vals, returns the head of the list.
	The triples describing the list are added to the graph; the elements of vals are used as they are.
	@param graph: RDF graph
	@type graph: RDFLib Graph
	@param vals: array of RDF Resources
	@return: head of the List (an RDF Resource); this is rdf:nil if vals is empty
	"""
	# One fresh blank node per element, closed by rdf:nil
	heads = [ BNode() for _ in vals ] + [ ns_rdf["nil"] ]
	for i, val in enumerate(vals) :
		graph.add( (heads[i], ns_rdf["first"], val) )
		graph.add( (heads[i], ns_rdf["rest"],  heads[i+1]) )
	return heads[0]

#################################################################################
def get_Literal(Pnode):
	"""
	Get (recursively) the full text from a DOM Node.
	Only text nodes and element nodes contribute; any other kind of child is skipped.
	White space is returned as found in the text nodes, ie, it is neither stripped nor collapsed.

	@param Pnode: DOM Node
	@return: string
	"""
	# Collect the pieces and join once instead of growing a string step by step
	parts = []
	for node in Pnode.childNodes:
		if node.nodeType == node.TEXT_NODE:
			parts.append(node.data)
		elif node.nodeType == node.ELEMENT_NODE :
			parts.append(get_Literal(node))

	# An earlier version collapsed all white space; at present, the agreement seems to say
	# that white spaces are maintained
	return "".join(parts)

#################################################################################
[docs]def get_lang(node) :
	# we may have lang and xml:lang
	retval  = None
	if node.hasAttribute("lang") :
		retval = node.getAttribute("lang")
	if retval and node.hasAttribute("xml:lang") :
		xmllang = node.getAttribute("xml:lang").lower()
		if not( xmllang != None and xmllang == retval.lower() ) :
			# This is an error, in which case retval must be invalidated...
			retval = None
	return retval

def get_lang_from_hierarchy(document, node) :
	"""
	Get the language that applies to a node: its own one or, if there is none, the one of the
	closest ancestor that has one.
	@param document: the node at which the walk upwards stops; its own language is the last one tried
	@param node: DOM element node to start from
	@return: the language as returned by L{get_lang}, or None if none is found
	"""
	current = node
	while True :
		lang = get_lang(current)
		if lang is not None :
			return lang
		parent = current.parentNode
		if parent is not None and parent != document :
			# keep climbing
			current = parent
		else :
			# top reached (or detached node): fall back on what the document itself says
			return get_lang(document)

#################################################################################
# Datatype URIs that get_time_type() can return
datetime_type 	= "http://www.w3.org/2001/XMLSchema#dateTime"
time_type 	 	= "http://www.w3.org/2001/XMLSchema#time"
date_type 	 	= "http://www.w3.org/2001/XMLSchema#date"
date_gYear		= "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth	= "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay	= "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type	= "http://www.w3.org/2001/XMLSchema#duration"

# strptime patterns accepted for each datatype. get_time_type() returns the first key that has
# a matching pattern, so the order of the entries matters.
_formats = {
	date_gMonthDay	  : [ "%m-%d" ],
	date_gYearMonth	  : [ "%Y-%m"],
	date_gYear     	  : [ "%Y" ],
	date_type      	  : [ "%Y-%m-%d", "%Y-%m-%dZ" ],
	time_type      	  : [ "%H:%M",
					      "%H:%M:%S",
					      "%H:%M:%SZ",
					      "%H:%M:%S.%f" ],
	datetime_type  	  : [ "%Y-%m-%dT%H:%M",
					      "%Y-%m-%dT%H:%M:%S",
					      "%Y-%m-%dT%H:%M:%S.%f",
					      "%Y-%m-%dT%H:%MZ",
					      "%Y-%m-%dT%H:%M:%SZ",
					      "%Y-%m-%dT%H:%M:%S.%fZ" ],
	duration_type     : [ "P%dD",
						  "P%YY%mM%dD",
						  "P%YY%mM",
						  "P%YY%dD",
						  "P%YY",
						  "P%mM",
						  "P%mM%dD",
						 ],
}

# strptime patterns for the time portion (what follows the 'T') of a duration
_dur_times = [ "%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS" ]

def _matches_any(value, formats) :
	"""
	Tell whether C{datetime.strptime} accepts the value with at least one of the patterns.
	@param value: the string to be checked
	@param formats: array of strptime patterns
	@return: True or False
	"""
	for fmt in formats :
		try :
			datetime.strptime(value, fmt)
			return True
		except ValueError :
			pass
	return False

def get_time_type(string) :
	"""
	Check whether the string abides to one of the accepted time related datatypes, and returns that one if yes.
	The check is done in three steps: the string is matched against the patterns in C{_formats};
	if it looks like a duration (it starts with 'P' or '-P'), the sign is set aside and the date and
	time portions are checked separately; finally, a dateTime followed by a '+hh:mm' or '-hh:mm'
	time zone offset is looked for.
	@param string: the attribute value to be checked
	@return : a datatype URI or None
	"""
	for key in _formats :
		if _matches_any(string, _formats[key]) :
			return key

	# Now come the special cases:-(
	# Check first for the duration stuff, that is the nastiest.
	if string.startswith(("P", "-P")) :
		# this is meant to be a duration type
		# first of all, get rid of the leading '-' and check again
		unsigned = string
		if string[0] == '-' :
			unsigned = string[1:]
			if _matches_any(unsigned, _formats[duration_type]) :
				return duration_type

		# Let us see if the value contains a separate time portion, and cut that one
		durs = unsigned.split('T')
		if len(durs) == 2 :
			# yep, so both portions have to be checked
			if _matches_any(durs[0], _formats[duration_type]) and _matches_any(durs[1], _dur_times) :
				return duration_type
		# no more tricks, this is not a duration we know about
		return None

	# If we got here, we should check the time zone
	# there is a discrepancy between the python and the HTML5/XSD lexical string,
	# which means that this has to be handled separately for the date and the timezone portion

	# The time zone offset is the last six characters: a sign followed by hh:mm.
	# The slice is simply empty if the string is too short.
	if string[-6:-5] not in ('+', '-') :
		return None
	if not _matches_any(string[-5:], [ "%H:%M" ]) :
		# Bummer, this is not a correct time
		return None

	# The time-zone is fine, the time-zone-less portion has to be a dateTime
	if _matches_any(string[:-6], _formats[datetime_type]) :
		return datetime_type
	return None


#########################################################################################################
# Handling URIs
class URIOpener :
	"""A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself, the class
	sets the content location.
	The class also adds an accept header to the outgoing request, namely
	text/html and application/xhtml+xml (unless set explicitly by the caller).

	@ivar data: the real data, ie, a file-like object
	@ivar headers: the return headers as sent back by the server
	@ivar location: the value of the Content-Location response header resolved against the final URL
	if the server sent one, otherwise the name used to open the resource
	"""
	CONTENT_LOCATION	= 'Content-Location'
	def __init__(self, name, additional_headers = None) :
		"""
		@param name: URL to be opened
		@keyword additional_headers: additional HTTP request headers to be added to the call
		@type additional_headers: dictionary of header names and values, or None
		@raise HTTPError: if the server answers with an HTTP error
		@raise MicrodataError: for any other problem when accessing the resource
		"""
		if additional_headers is None :
			additional_headers = {}
		try :
			# Note the removal of the fragment ID. This is necessary, per the HTTP spec
			req = Request(url=name.split('#')[0])

			accept_set = False
			for key in additional_headers :
				req.add_header(key, additional_headers[key])
				if key.lower() == 'accept' :
					accept_set = True
			# The default Accept header must not override the one of the caller
			if not accept_set :
				req.add_header('Accept', 'text/html, application/xhtml+xml')

			self.data		= urlopen(req)
			self.headers	= self.data.info()

			if URIOpener.CONTENT_LOCATION in self.headers :
				self.location = urljoin(self.data.geturl(),self.headers[URIOpener.CONTENT_LOCATION])
			else :
				self.location = name

		except urllib_HTTPError :
			e = sys.exc_info()[1]
			# NOTE(review): the import presumes that a top level 'pyMicrodata' package can be found -- confirm
			from pyMicrodata import HTTPError
			# Use the long description of the status code; a code that is not in the table
			# falls back on the text of the error itself
			msg = BaseHTTPRequestHandler.responses.get(e.code)
			if msg is None :
				text = '%s' % e
			else :
				text = '%s' % msg[1]
			raise HTTPError(text, e.code)
		except Exception :
			e = sys.exc_info()[1]
			from pyMicrodata import MicrodataError
			raise MicrodataError('%s' % e)