Source code for rdflib.plugins.memory

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

from rdflib.term import BNode
from rdflib.store import Store, NO_STORE, VALID_STORE
from six import iteritems

# Public names of this module.
__all__ = ['Memory', 'IOMemory']

# Wildcard for triple patterns: Memory.triples() treats a pattern position
# that is ANY (i.e. None) as unbound. ``Any`` is a second name bound to the
# same value.
ANY = Any = None


class Memory(Store):
    """\
    An in memory implementation of a triple store.

    This triple store uses nested dictionaries to store triples. Each
    triple is stored in three such indices as follows spo[s][p][o] = 1,
    pos[p][o][s] = 1 and osp[o][s][p] = 1.

    Authors: Michel Pelletier, Daniel Krech, Stefan Niederhauser
    """

    def __init__(self, configuration=None, identifier=None):
        """Create an empty store.

        ``configuration`` is forwarded to ``Store.__init__``;
        ``identifier`` is kept on the instance as ``self.identifier``.
        """
        super(Memory, self).__init__(configuration)
        self.identifier = identifier

        # indexed by [subject][predicate][object]
        self.__spo = {}

        # indexed by [predicate][object][subject]
        self.__pos = {}

        # indexed by [object][subject][predicate]
        self.__osp = {}

        # prefix -> namespace and namespace -> prefix bindings
        self.__namespace = {}
        self.__prefix = {}

    def add(self, triple, context, quoted=False):
        """\
        Add a triple to the store of triples.

        ``context`` and ``quoted`` are accepted but not used by this
        implementation. Adding a triple that is already present is a
        no-op.
        """
        subject, predicate, obj = triple

        # Register the triple in all three indices, creating the nested
        # dictionaries where they do not yet exist.
        self.__spo.setdefault(subject, {}).setdefault(predicate, {})[obj] = 1
        self.__pos.setdefault(predicate, {}).setdefault(obj, {})[subject] = 1
        self.__osp.setdefault(obj, {}).setdefault(subject, {})[predicate] = 1

    def remove(self, triple_pattern, context=None):
        """Remove every triple matching ``triple_pattern``.

        ``context`` is accepted but not used by this implementation.
        """
        # triples() is a lazy generator that walks the very dictionaries
        # modified below; collect the matches first so that deleting does
        # not raise "dictionary changed size during iteration".
        matches = list(self.triples(triple_pattern))
        for (subject, predicate, obj), _ in matches:
            del self.__spo[subject][predicate][obj]
            del self.__pos[predicate][obj][subject]
            del self.__osp[obj][subject][predicate]

    def triples(self, triple_pattern, context=None):
        """A generator over all the triples matching the pattern.

        ``triple_pattern`` is a ``(subject, predicate, object)`` tuple in
        which ``None`` (``ANY``) marks an unbound position. Each result is
        a ``((s, p, o), contexts)`` pair, where ``contexts`` is an empty
        generator. ``context`` is accepted but not used.

        Results are produced lazily from the index dictionaries, so the
        store should not be modified while the generator is being
        consumed.
        """
        subject, predicate, obj = triple_pattern

        if subject is not ANY:
            # subject is given: use the spo index
            by_predicate = self.__spo.get(subject)
            if by_predicate is None:
                return  # given subject not found

            if predicate is not ANY:
                # subject+predicate is given
                objects = by_predicate.get(predicate)
                if objects is None:
                    return  # given predicate not found

                if obj is not ANY:
                    # subject+predicate+object is given
                    if obj in objects:
                        yield (subject, predicate, obj), self.__contexts()
                else:
                    # subject+predicate is given, object unbound
                    for o in objects.keys():
                        yield (subject, predicate, o), self.__contexts()
            else:
                # subject given, predicate unbound
                for p in by_predicate.keys():
                    objects = by_predicate[p]
                    if obj is not ANY:
                        # object is given
                        if obj in objects:
                            yield (subject, p, obj), self.__contexts()
                    else:
                        # object unbound
                        for o in objects.keys():
                            yield (subject, p, o), self.__contexts()

        elif predicate is not ANY:
            # predicate is given, subject unbound: use the pos index
            by_object = self.__pos.get(predicate)
            if by_object is None:
                return  # given predicate not found

            if obj is not ANY:
                # predicate+object is given, subject unbound
                if obj in by_object:
                    for s in by_object[obj].keys():
                        yield (s, predicate, obj), self.__contexts()
            else:
                # predicate is given, object+subject unbound
                for o in by_object.keys():
                    for s in by_object[o].keys():
                        yield (s, predicate, o), self.__contexts()

        elif obj is not ANY:
            # object is given, subject+predicate unbound: use the osp index
            by_subject = self.__osp.get(obj)
            if by_subject is None:
                return  # given object not found

            for s in by_subject.keys():
                for p in by_subject[s].keys():
                    yield (s, p, obj), self.__contexts()

        else:
            # subject+predicate+object unbound: walk the whole spo index
            spo = self.__spo
            for s in spo.keys():
                by_predicate = spo[s]
                for p in by_predicate.keys():
                    for o in by_predicate[p].keys():
                        yield (s, p, o), self.__contexts()

    def __len__(self, context=None):
        """Return the number of triples in the store.

        ``context`` is accepted but not used.
        """
        # @@ optimize
        return sum(1 for _ in self.triples((None, None, None)))

    def bind(self, prefix, namespace):
        """Associate ``prefix`` with ``namespace`` (in both directions)."""
        self.__prefix[namespace] = prefix
        self.__namespace[prefix] = namespace

    def namespace(self, prefix):
        """Return the namespace bound to ``prefix``, or None."""
        return self.__namespace.get(prefix, None)

    def prefix(self, namespace):
        """Return the prefix bound to ``namespace``, or None."""
        return self.__prefix.get(namespace, None)

    def namespaces(self):
        """Yield every bound ``(prefix, namespace)`` pair."""
        for prefix, namespace in iteritems(self.__namespace):
            yield prefix, namespace

    def __contexts(self):
        """Return an empty generator (no context information is kept)."""
        return (c for c in ())
class IOMemory(Store):
    """\
    An integer-key-optimized context-aware in-memory store.

    Uses three dict indices (for subjects, objects and predicates) holding
    sets of triples. Context information is tracked in a separate dict, with
    the triple as key and a dict of {context: quoted} items as value. The
    context information is used to filter triple query results.

    Memory usage is low due to several optimizations. RDF nodes are not
    stored directly in the indices; instead, the indices hold integer keys
    and the actual nodes are only stored once in int-to-object and
    object-to-int mapping dictionaries. A default context is determined
    based on the first triple that is added to the store, and no context
    information is actually stored for subsequent other triples with the
    same context information.

    Most operations should be quite fast, but a triples() query with two
    bound parts requires a set intersection operation, which may be slow in
    some cases. When multiple contexts are used in the same store, filtering
    based on context has to be done after each query, which may also be
    slow.
    """
    context_aware = True
    formula_aware = True
    graph_aware = True

    # The following variable name conventions are used in this class:
    #
    # subject, predicate, object             unencoded triple parts
    # triple = (subject, predicate, object)  unencoded triple
    # context:                               unencoded context
    #
    # sid, pid, oid                          integer-encoded triple parts
    # enctriple = (sid, pid, oid)            integer-encoded triple
    # cid                                    integer-encoded context

    def __init__(self, configuration=None, identifier=None):
        """Create an empty store.

        Neither ``configuration`` nor ``identifier`` is used here; the
        base initializer is called without arguments.
        """
        super(IOMemory, self).__init__()

        # prefix <-> namespace bindings
        self.__namespace = {}
        self.__prefix = {}

        # Mappings for encoding RDF nodes using integer keys, to save memory
        # in the indexes. Note that None is always mapped to itself, to make
        # it easy to test for it in either encoded or unencoded form.
        self.__int2obj = {None: None}  # maps integer keys to objects
        self.__obj2int = {None: None}  # maps objects to integer keys

        # Indexes for each triple part, and a list of contexts for each
        # triple
        self.__subjectIndex = {}    # key: sid    val: set(enctriples)
        self.__predicateIndex = {}  # key: pid    val: set(enctriples)
        self.__objectIndex = {}     # key: oid    val: set(enctriples)
        # key: enctriple    val: {cid1: quoted, cid2: quoted ...}
        self.__tripleContexts = {}
        # key: cid    val: set(enctriples)
        self.__contextTriples = {None: set()}

        # all contexts used in store (unencoded)
        self.__all_contexts = set()
        # default context information for triples
        self.__defaultContexts = None

    def bind(self, prefix, namespace):
        """Associate ``prefix`` with ``namespace`` (in both directions)."""
        self.__prefix[namespace] = prefix
        self.__namespace[prefix] = namespace

    def namespace(self, prefix):
        """Return the namespace bound to ``prefix``, or None."""
        return self.__namespace.get(prefix, None)

    def prefix(self, namespace):
        """Return the prefix bound to ``namespace``, or None."""
        return self.__prefix.get(namespace, None)

    def namespaces(self):
        """Yield every bound ``(prefix, namespace)`` pair."""
        for prefix, namespace in iteritems(self.__namespace):
            yield prefix, namespace

    def add(self, triple, context, quoted=False):
        """Add ``triple`` to ``context``, optionally as a quoted statement."""
        Store.add(self, triple, context, quoted)

        if context is not None:
            self.__all_contexts.add(context)

        enctriple = self.__encodeTriple(triple)
        sid, pid, oid = enctriple

        # Context bookkeeping must run before the indices are touched: it
        # consults the subject index to see whether the triple is new.
        self.__addTripleContext(enctriple, context, quoted)

        index_keys = (
            (self.__subjectIndex, sid),
            (self.__predicateIndex, pid),
            (self.__objectIndex, oid),
        )
        for index, key in index_keys:
            index.setdefault(key, set()).add(enctriple)

    def remove(self, triplepat, context=None):
        """Remove the triples matching ``triplepat`` from ``context``.

        With ``context`` None, matching triples are removed from every
        context they appear in.
        """
        req_cid = self.__obj2id(context)
        for triple, _contexts in self.triples(triplepat, context):
            enctriple = self.__encodeTriple(triple)

            for cid in self.__getTripleContexts(enctriple):
                # with a specific context requested, only that one is hit
                if context is None or req_cid == cid:
                    self.__removeTripleContext(enctriple, cid)

            unquoted = self.__getTripleContexts(enctriple, skipQuoted=True)
            if None in unquoted and (context is None or len(unquoted) == 1):
                self.__removeTripleContext(enctriple, None)

            if not self.__getTripleContexts(enctriple):
                # triple has been removed from all contexts
                sid, pid, oid = enctriple
                self.__subjectIndex[sid].remove(enctriple)
                self.__predicateIndex[pid].remove(enctriple)
                self.__objectIndex[oid].remove(enctriple)

                del self.__tripleContexts[enctriple]

        if (req_cid is not None
                and req_cid in self.__contextTriples
                and not self.__contextTriples[req_cid]):
            # all triples are removed out of this context
            # and it's not the default context so delete it
            del self.__contextTriples[req_cid]

        if (triplepat == (None, None, None)
                and context in self.__all_contexts
                and not self.graph_aware):
            # remove the whole context
            self.__all_contexts.remove(context)

    def triples(self, triplein, context=None):
        """Return a generator of ``(triple, contexts)`` pairs matching
        the pattern ``triplein`` within ``context`` (None: any context)."""
        if context is not None and context == self:
            # hmm...does this really ever happen?
            context = None

        cid = self.__obj2id(context)
        enctriple = self.__encodeTriple(triplein)
        sid, pid, oid = enctriple

        # all triples case (no triple parts given as pattern)
        if all(part is None for part in enctriple):
            return self.__all_triples(cid)

        # optimize "triple in graph" case (all parts given)
        if all(part is not None for part in enctriple):
            found = (sid in self.__subjectIndex
                     and enctriple in self.__subjectIndex[sid]
                     and self.__tripleHasContext(enctriple, cid))
            if not found:
                return self.__emptygen()
            # single-item generator; contexts are looked up lazily
            return ((triplein, self.__contexts(enctriple)) for _ in (0,))

        # remaining cases: one or two out of three given
        candidates = []
        lookups = (
            (sid, self.__subjectIndex),
            (pid, self.__predicateIndex),
            (oid, self.__objectIndex),
        )
        for key, index in lookups:
            if key is None:
                continue
            if key not in index:
                return self.__emptygen()
            candidates.append(index[key])

        # to get the result, do an intersection of the sets (if necessary)
        if len(candidates) > 1:
            enctriples = candidates[0].intersection(*candidates[1:])
        else:
            enctriples = candidates[0].copy()

        return ((self.__decodeTriple(enc), self.__contexts(enc))
                for enc in enctriples
                if self.__tripleHasContext(enc, cid))

    def contexts(self, triple=None):
        """Return a generator over all contexts, or over the non-quoted
        contexts containing ``triple`` when a concrete triple is given."""
        if triple is None or triple == (None, None, None):
            return (context for context in self.__all_contexts)

        enctriple = self.__encodeTriple(triple)
        sid = enctriple[0]
        if enctriple in self.__subjectIndex.get(sid, ()):
            return self.__contexts(enctriple)
        return self.__emptygen()

    def __len__(self, context=None):
        """Return the number of triples recorded for ``context``."""
        cid = self.__obj2id(context)
        return len(self.__contextTriples.get(cid, ()))

    def add_graph(self, graph):
        """Register ``graph`` as a known context."""
        if self.graph_aware:
            self.__all_contexts.add(graph)
        else:
            Store.add_graph(self, graph)

    def remove_graph(self, graph):
        """Remove all triples of ``graph`` and forget the context."""
        if self.graph_aware:
            self.remove((None, None, None), graph)
            # we may not know this graph at all; that is no problem
            self.__all_contexts.discard(graph)
        else:
            Store.remove_graph(self, graph)

    # internal utility methods below

    def __addTripleContext(self, enctriple, context, quoted):
        """add the given context to the set of contexts for the triple"""
        cid = self.__obj2id(context)

        sid = enctriple[0]
        if enctriple in self.__subjectIndex.get(sid, ()):
            # we know the triple exists somewhere in the store
            ctxs = self.__tripleContexts.get(enctriple)
            if ctxs is None:
                # triple exists with default ctx info
                # start with a copy of the default ctx info
                ctxs = self.__defaultContexts.copy()
                self.__tripleContexts[enctriple] = ctxs

            ctxs[cid] = quoted
            if not quoted:
                ctxs[None] = quoted
        else:
            # the triple didn't exist before in the store
            ctxs = {cid: quoted}  # this context only ...
            if not quoted:
                ctxs[None] = quoted  # ... or the default context as well
            self.__tripleContexts[enctriple] = ctxs

        # if the triple is not quoted add it to the default context
        if not quoted:
            self.__contextTriples[None].add(enctriple)

        # always add the triple to given context, making sure it's
        # initialized
        self.__contextTriples.setdefault(cid, set()).add(enctriple)

        # if this is the first ever triple in the store, set default ctx
        # info
        if self.__defaultContexts is None:
            self.__defaultContexts = ctxs

        # if the context info is the same as default, no need to store it
        if ctxs == self.__defaultContexts:
            del self.__tripleContexts[enctriple]

    def __getTripleContexts(self, enctriple, skipQuoted=False):
        """return a list of (encoded) contexts for the triple, skipping
           quoted contexts if skipQuoted==True"""
        ctxs = self.__tripleContexts.get(enctriple, self.__defaultContexts)

        if skipQuoted:
            return [cid for cid, quoted in ctxs.items() if not quoted]
        return ctxs.keys()

    def __tripleHasContext(self, enctriple, cid):
        """return True iff the triple exists in the given context"""
        ctxs = self.__tripleContexts.get(enctriple, self.__defaultContexts)
        return cid in ctxs

    def __removeTripleContext(self, enctriple, cid):
        """remove the context from the triple"""
        current = self.__tripleContexts.get(enctriple, self.__defaultContexts)
        # work on a copy so the shared default ctx info is never modified
        remaining = current.copy()
        del remaining[cid]

        if remaining == self.__defaultContexts:
            del self.__tripleContexts[enctriple]
        else:
            self.__tripleContexts[enctriple] = remaining
        self.__contextTriples[cid].remove(enctriple)

    def __obj2id(self, obj):
        """encode object, storing it in the encoding map if necessary,
           and return the integer key"""
        if obj in self.__obj2int:
            return self.__obj2int[obj]

        # draw random keys until an unused one turns up
        key = randid()
        while key in self.__int2obj:
            key = randid()
        self.__obj2int[obj] = key
        self.__int2obj[key] = obj
        return key

    def __encodeTriple(self, triple):
        """encode a whole triple, returning the encoded triple"""
        return tuple(self.__obj2id(part) for part in triple)

    def __decodeTriple(self, enctriple):
        """decode a whole encoded triple, returning the original
        triple"""
        return tuple(self.__int2obj.get(key) for key in enctriple)

    def __all_triples(self, cid):
        """return a generator which yields all the triples (unencoded)
           of the given context"""
        members = self.__contextTriples.get(cid)
        if members is None:
            return
        # iterate over a copy of the context's triple set
        for enctriple in members.copy():
            yield self.__decodeTriple(enctriple), self.__contexts(enctriple)

    def __contexts(self, enctriple):
        """return a generator for all the non-quoted contexts
           (unencoded) the encoded triple appears in"""
        return (self.__int2obj.get(cid)
                for cid in self.__getTripleContexts(enctriple,
                                                    skipQuoted=True)
                if cid is not None)

    def __emptygen(self):
        """return an empty generator"""
        return
        yield  # never reached; makes this function a generator
def randid(randint=random.randint, choice=random.choice, signs=(-1, 1)):
    """Return a random integer key.

    With the default arguments the result is a non-zero integer whose
    absolute value lies between 1 and 2000000000 inclusive. The helpers
    are bound as default arguments, so the function keeps working after
    the module-level name ``random`` is deleted below.
    """
    sign = choice(signs)
    magnitude = randint(1, 2000000000)
    return sign * magnitude


# The module name is no longer needed once randid has captured its helpers.
del random