Source code for rdflib.plugins.parsers.pyMicrodata.utils

# -*- coding: utf-8 -*-
"""
Various utilities for pyMicrodata

@organization: U{World Wide Web Consortium<http://www.w3.org>}
@author: U{Ivan Herman<http://www.w3.org/People/Ivan/>}
@license: This software is available for use under the
U{W3C® SOFTWARE NOTICE AND LICENSE<http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231>}
"""

"""
$Id: utils.py,v 1.7 2012/09/01 15:17:28 ivan Exp $
$Date: 2012/09/01 15:17:28 $
"""
import os, os.path, sys
(py_v_major, py_v_minor, py_v_micro, py_v_final, py_v_serial) = sys.version_info

if py_v_major >= 3 :
	from urllib.request import Request, urlopen
	from urllib.parse   import urljoin, quote, urlparse
	from http.server    import BaseHTTPRequestHandler
	from urllib.error   import HTTPError as urllib_HTTPError
else :
	from urllib2        import Request, urlopen
	from urllib2        import HTTPError as urllib_HTTPError
	from urlparse       import urljoin, urlparse
	from urllib         import quote
	from BaseHTTPServer import BaseHTTPRequestHandler

import re
from datetime import datetime

from rdflib	import BNode
import rdflib
if rdflib.__version__ >= "3.0.0" :
	from rdflib	import RDF as ns_rdf
else :
	from rdflib.RDF	import RDFNS  as ns_rdf

#################################################################################
def is_absolute_URI(uri):
    """
    Tell whether a URI reference is absolute, ie, whether it carries a scheme.

    @param uri: the URI reference to be checked
    @return: True if C{urlparse} reports a non-empty scheme for the value, False otherwise
    @rtype: bool
    """
    # Index 0 of the urlparse result is the scheme component
    return urlparse(uri)[0] != ""
#################################################################################
def fragment_escape(name):
    """
    Percent-encode a string so that it can be used as (part of) a URI.

    The characters C{/}, C{~}, C{:}, C{-} and C{.} are declared safe and are left untouched;
    everything else that C{quote} considers unsafe is replaced by its C{%XX} escape.

    @param name: the string to be escaped
    @return: the escaped string
    """
    return quote(name, '/~:-.')
#################################################################################
def generate_URI(base, v):
    """
    Generate an (absolute) URI; if v is relative (eg, a fragment), it is resolved against base,
    otherwise the value is returned as is.

    @param base: Absolute URI for base
    @param v: relative or absolute URI
    @return: the absolute URI
    """
    if is_absolute_URI(v):
        return v

    # UGLY!!! There is a bug for a corner case in python version <= 2.5.X
    # (the value is simply appended to the base; the original code referred to an undefined
    # name here and would have raised a NameError instead)
    if len(v) > 0 and v[0] == '?' and (py_v_major < 3 and py_v_minor <= 5):
        return base + v

    # Trust the python library...
    # Well, not quite:-) there is what is, in my view, a bug in the urljoin; in some cases it
    # swallows the '#' or '?' character at the end. This is clearly a problem with
    # Semantic Web URI-s
    v = fragment_escape(v.strip())
    joined = urljoin(base, v)

    # NOTE(review): fragment_escape() percent-encodes '#' and '?', so after the escaping above
    # the value presumably never ends with either character and this repair may never trigger
    # -- confirm before relying on it.
    # The emptiness checks replace a former bare "except" that only guarded the [-1] indexing.
    if v and joined and v[-1] != joined[-1] and (v[-1] == "#" or v[-1] == "?"):
        return joined + v[-1]
    return joined
#################################################################################
def generate_RDF_collection(graph, vals):
    """
    Generate an RDF List from vals, returns the head of the list.

    One fresh blank node is created per value; each of them gets an C{rdf:first} triple pointing
    to the value and an C{rdf:rest} triple pointing to the next blank node, the last one
    pointing to C{rdf:nil}. For an empty C{vals} nothing is added to the graph and C{rdf:nil}
    itself is returned.

    @param graph: RDF graph
    @type graph: RDFLib Graph
    @param vals: array of RDF Resources (the elements are in RDF format already)
    @return: head of the List (an RDF Resource)
    """
    # The extra, closing entry makes heads[i+1] valid for the last value, too
    heads = [BNode() for _ in vals] + [ns_rdf["nil"]]
    for i, val in enumerate(vals):
        graph.add((heads[i], ns_rdf["first"], val))
        graph.add((heads[i], ns_rdf["rest"], heads[i + 1]))
    return heads[0]
#################################################################################
def get_Literal(Pnode):
    """
    Get (recursively) the full text from a DOM Node.

    The data of the text nodes and the (recursively collected) text of the element nodes are
    concatenated in document order; any other kind of child node (eg, a comment) is ignored.
    White spaces are maintained as they are, nothing is stripped or normalized.

    @param Pnode: DOM Node
    @return: string
    """
    parts = []
    for node in Pnode.childNodes:
        if node.nodeType == node.TEXT_NODE:
            parts.append(node.data)
        elif node.nodeType == node.ELEMENT_NODE:
            parts.append(get_Literal(node))
    # At present, the agreement seems to say that white spaces are maintained
    return "".join(parts)
#################################################################################
def get_lang(node):
    """
    Get the language set directly on a DOM element.

    Only the C{lang} attribute can provide the value; C{xml:lang} is merely used as a
    consistency check: if both are present and differ (the comparison is case insensitive),
    this is an error and no language is returned.

    @param node: DOM element node (must provide C{hasAttribute} and C{getAttribute})
    @return: the value of the C{lang} attribute (possibly an empty string), or None if the
    attribute is missing or is in conflict with C{xml:lang}
    """
    # we may have lang and xml:lang
    lang = node.getAttribute("lang") if node.hasAttribute("lang") else None
    if lang and node.hasAttribute("xml:lang"):
        if node.getAttribute("xml:lang").lower() != lang.lower():
            # This is an error, in which case the value must be invalidated...
            return None
    return lang
def get_lang_from_hierarchy(document, node):
    """
    Get the language that applies to a node, looking at its ancestors if necessary.

    Starting with the node itself, the first value that L{get_lang} returns and that is not None
    wins. When a node has no parent, or its parent is C{document}, the search ends with whatever
    L{get_lang} returns for C{document}.

    @param document: the node at which the search stops; it is handed to L{get_lang}, ie, it
    must provide the same attribute access methods as an element
    @param node: DOM node where the search starts
    @return: the language value, or None if none could be found
    """
    current = node
    # Iterate rather than recurse, so that a deep tree cannot exhaust the recursion limit
    while True:
        lang = get_lang(current)
        if lang is not None:
            return lang
        parent = current.parentNode
        if parent is not None and parent != document:
            current = parent
        else:
            return get_lang(document)
#################################################################################
datetime_type   = "http://www.w3.org/2001/XMLSchema#dateTime"
time_type       = "http://www.w3.org/2001/XMLSchema#time"
date_type       = "http://www.w3.org/2001/XMLSchema#date"
date_gYear      = "http://www.w3.org/2001/XMLSchema#gYear"
date_gYearMonth = "http://www.w3.org/2001/XMLSchema#gYearMonth"
date_gMonthDay  = "http://www.w3.org/2001/XMLSchema#gMonthDay"
duration_type   = "http://www.w3.org/2001/XMLSchema#duration"

# For each datatype URI, the strptime patterns whose successful parse identifies that datatype
_formats = {
    date_gMonthDay: ["%m-%d"],
    date_gYearMonth: ["%Y-%m"],
    date_gYear: ["%Y"],
    date_type: ["%Y-%m-%d", "%Y-%m-%dZ"],
    time_type: ["%H:%M",
                "%H:%M:%S",
                "%H:%M:%SZ",
                "%H:%M:%S.%f"],
    datetime_type: ["%Y-%m-%dT%H:%M",
                    "%Y-%m-%dT%H:%M:%S",
                    "%Y-%m-%dT%H:%M:%S.%f",
                    "%Y-%m-%dT%H:%MZ",
                    "%Y-%m-%dT%H:%M:%SZ",
                    "%Y-%m-%dT%H:%M:%S.%fZ"],
    duration_type: ["P%dD",
                    "P%YY%mM%dD",
                    "P%YY%mM",
                    "P%YY%dD",
                    "P%YY",
                    "P%mM",
                    "P%mM%dD"],
}

# strptime patterns for the time portion of a duration, ie, for what follows the 'T'
_dur_times = ["%HH%MM%SS", "%HH", "%MM", "%SS", "%HH%MM", "%HH%SS", "%MM%SS"]


def _matches_any_format(value, formats):
    """Tell whether value can be parsed by strptime with at least one of the given patterns."""
    for fmt in formats:
        try:
            datetime.strptime(value, fmt)
        except ValueError:
            continue
        return True
    return False


def get_time_type(string):
    """
    Check whether the string abides to one of the accepted time related datatypes, and returns
    that one if yes.

    The checks are made in three steps:
      1. the string is matched against all the patterns in C{_formats};
      2. if it begins with C{P} or C{-P} it is treated as a duration: a leading C{-} is removed,
         and the date and time portions (separated by C{T}) are checked separately; the date
         portion may be the bare C{P} (eg, C{PT30M});
      3. otherwise the string is checked for a dateTime with a trailing C{+hh:mm} or C{-hh:mm}
         time zone offset.

    @param string: the attribute value to be checked
    @return : a datatype URI or None
    """
    for key in _formats:
        if _matches_any_format(string, _formats[key]):
            return key

    # Now come the special cases:-(
    # Check first for the duration stuff, that is the nastiest.
    # (The parentheses matter: without them an empty or one character string got indexed
    # out of range.)
    if len(string) > 2 and (string[0] == 'P' or (string[0] == '-' and string[1] == 'P')):
        # this is meant to be a duration type
        # first of all, get rid of the leading '-' and check again
        if string[0] == '-':
            string = string[1:]
            if _matches_any_format(string, _formats[duration_type]):
                return duration_type

        # Let us see if the value contains a separate time portion, and cut that one
        durs = string.split('T')
        if len(durs) != 2:
            # Well, no more tricks, this is a plain type
            return None

        dur, tm = durs
        # A duration may consist of a time portion only, the date portion is then just 'P'
        date_ok = dur == 'P' or _matches_any_format(dur, _formats[duration_type])
        if date_ok and _matches_any_format(tm, _dur_times):
            return duration_type
        # something went wrong...
        return None

    # If we got here, we should check the time zone
    # there is a discrepancy between the python and the HTML5/XSD lexical string,
    # which means that this has to be handled separately for the date and the timezone portion:
    # the last 6 characters must be a sign followed by hh:mm, the rest a time-zone-less dateTime
    if len(string) > 6 and string[-6] in ('+', '-'):
        if _matches_any_format(string[-5:], ["%H:%M"]) and \
                _matches_any_format(string[:-6], _formats[datetime_type]):
            return datetime_type
    return None
######################################################################################################### # Handling URIs
class URIOpener:
    """A wrapper around the urllib2 method to open a resource. Beyond accessing the data itself,
    the class sets the content location. The class also adds an accept header to the outgoing
    request, namely text/html and application/xhtml+xml.

    @ivar data: the real data, ie, a file-like object
    @ivar headers: the return headers as sent back by the server
    @ivar location: the location of the data: the C{Content-Location} header resolved against
    the final URL of the response if the server sent that header, the requested name otherwise
    """
    CONTENT_LOCATION = 'Content-Location'

    def __init__(self, name):
        """
        Open the resource and set the C{data}, C{headers} and C{location} attributes.

        @param name: URL to be opened
        @raise HTTPError: (the pyMicrodata one) if the server answered with an HTTP error status
        @raise MicrodataError: for any other problem met when accessing the resource
        """
        try:
            # Note the removal of the fragment ID. This is necessary, per the HTTP spec
            req = Request(url=name.split('#')[0])
            req.add_header('Accept', 'text/html, application/xhtml+xml')

            self.data = urlopen(req)
            self.headers = self.data.info()

            if URIOpener.CONTENT_LOCATION in self.headers:
                self.location = urljoin(self.data.geturl(),
                                        self.headers[URIOpener.CONTENT_LOCATION])
            else:
                self.location = name
        except urllib_HTTPError:
            # sys.exc_info() is used to stay source compatible with both python 2 and 3
            e = sys.exc_info()[1]
            # NOTE(review): this imports from a top-level "pyMicrodata" package; if this module
            # lives inside another package a relative import may be needed -- confirm.
            from pyMicrodata import HTTPError
            responses = BaseHTTPRequestHandler.responses
            if e.code in responses:
                # Entries are (short message, long message) pairs; the long one is reported
                msg = responses[e.code][1]
            else:
                # Non-standard status code: fall back on the text of the original error rather
                # than failing with a KeyError
                msg = e
            raise HTTPError('%s' % msg, e.code)
        except Exception:
            e = sys.exc_info()[1]
            from pyMicrodata import MicrodataError
            raise MicrodataError('%s' % e)