Source code for rdflib.plugins.memory

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

from rdflib.term import BNode
from rdflib.store import Store, NO_STORE, VALID_STORE
from six import iteritems

__all__ = ['Memory', 'IOMemory']

ANY = Any = None


[docs]class Memory(Store):
    """\
    An in memory implementation of a triple store.

    This triple store uses nested dictionaries to store triples. Each
    triple is stored in two such indices as follows spo[s][p][o] = 1 and
    pos[p][o][s] = 1.

    Authors: Michel Pelletier, Daniel Krech, Stefan Niederhauser
    """

[docs]    def __init__(self, configuration=None, identifier=None):
        super(Memory, self).__init__(configuration)
        self.identifier = identifier

        # indexed by [subject][predicate][object]
        self.__spo = {}

        # indexed by [predicate][object][subject]
        self.__pos = {}

        # indexed by [predicate][object][subject]
        self.__osp = {}

        self.__namespace = {}
        self.__prefix = {}

[docs]    def add(self, triple, context, quoted=False):
        """\
        Add a triple to the store of triples.
        """
        # add dictionary entries for spo[s][p][p] = 1 and pos[p][o][s]
        # = 1, creating the nested dictionaries where they do not yet
        # exits.
        subject, predicate, object = triple
        spo = self.__spo
        try:
            po = spo[subject]
        except:
            po = spo[subject] = {}
        try:
            o = po[predicate]
        except:
            o = po[predicate] = {}
        o[object] = 1

        pos = self.__pos
        try:
            os = pos[predicate]
        except:
            os = pos[predicate] = {}
        try:
            s = os[object]
        except:
            s = os[object] = {}
        s[subject] = 1

        osp = self.__osp
        try:
            sp = osp[object]
        except:
            sp = osp[object] = {}
        try:
            p = sp[subject]
        except:
            p = sp[subject] = {}
        p[predicate] = 1

[docs]    def remove(self, triple_pattern, context=None):
        for (subject, predicate, object), c in self.triples(triple_pattern):
            del self.__spo[subject][predicate][object]
            del self.__pos[predicate][object][subject]
            del self.__osp[object][subject][predicate]

[docs]    def triples(self, triple_pattern, context=None):
        """A generator over all the triples matching """
        subject, predicate, object = triple_pattern
        if subject != ANY:  # subject is given
            spo = self.__spo
            if subject in spo:
                subjectDictionary = spo[subject]
                if predicate != ANY:  # subject+predicate is given
                    if predicate in subjectDictionary:
                        if object != ANY:  # subject+predicate+object is given
                            if object in subjectDictionary[predicate]:
                                yield (subject, predicate, object), \
                                    self.__contexts()
                            else:  # given object not found
                                pass
                        else:  # subject+predicate is given, object unbound
                            for o in subjectDictionary[predicate].keys():
                                yield (subject, predicate, o), \
                                    self.__contexts()
                    else:  # given predicate not found
                        pass
                else:  # subject given, predicate unbound
                    for p in subjectDictionary.keys():
                        if object != ANY:  # object is given
                            if object in subjectDictionary[p]:
                                yield (subject, p, object), self.__contexts()
                            else:  # given object not found
                                pass
                        else:  # object unbound
                            for o in subjectDictionary[p].keys():
                                yield (subject, p, o), self.__contexts()
            else:  # given subject not found
                pass
        elif predicate != ANY:  # predicate is given, subject unbound
            pos = self.__pos
            if predicate in pos:
                predicateDictionary = pos[predicate]
                if object != ANY:  # predicate+object is given, subject unbound
                    if object in predicateDictionary:
                        for s in predicateDictionary[object].keys():
                            yield (s, predicate, object), self.__contexts()
                    else:  # given object not found
                        pass
                else:  # predicate is given, object+subject unbound
                    for o in predicateDictionary.keys():
                        for s in predicateDictionary[o].keys():
                            yield (s, predicate, o), self.__contexts()
        elif object != ANY:  # object is given, subject+predicate unbound
            osp = self.__osp
            if object in osp:
                objectDictionary = osp[object]
                for s in objectDictionary.keys():
                    for p in objectDictionary[s].keys():
                        yield (s, p, object), self.__contexts()
        else:  # subject+predicate+object unbound
            spo = self.__spo
            for s in spo.keys():
                subjectDictionary = spo[s]
                for p in subjectDictionary.keys():
                    for o in subjectDictionary[p].keys():
                        yield (s, p, o), self.__contexts()

[docs]    def __len__(self, context=None):
        # @@ optimize
        i = 0
        for triple in self.triples((None, None, None)):
            i += 1
        return i

[docs]    def bind(self, prefix, namespace):
        self.__prefix[namespace] = prefix
        self.__namespace[prefix] = namespace

[docs]    def namespace(self, prefix):
        return self.__namespace.get(prefix, None)

[docs]    def prefix(self, namespace):
        return self.__prefix.get(namespace, None)

[docs]    def namespaces(self):
        for prefix, namespace in iteritems(self.__namespace):
            yield prefix, namespace

    def __contexts(self):
        return (c for c in [])  # TODO: best way to return empty generator


[docs]class IOMemory(Store):
    """\
    An integer-key-optimized context-aware in-memory store.

    Uses three dict indices (for subjects, objects and predicates) holding
    sets of triples. Context information is tracked in a separate dict, with
    the triple as key and a dict of {context: quoted} items as value. The
    context information is used to filter triple query results.

    Memory usage is low due to several optimizations. RDF nodes are not
    stored directly in the indices; instead, the indices hold integer keys
    and the actual nodes are only stored once in int-to-object and
    object-to-int mapping dictionaries. A default context is determined
    based on the first triple that is added to the store, and no context
    information is actually stored for subsequent other triples with the
    same context information.

    Most operations should be quite fast, but a triples() query with two
    bound parts requires a set intersection operation, which may be slow in
    some cases. When multiple contexts are used in the same store, filtering
    based on context has to be done after each query, which may also be
    slow.

    """
    context_aware = True
    formula_aware = True
    graph_aware = True

    # The following variable name conventions are used in this class:
    #
    # subject, predicate, object             unencoded triple parts
    # triple = (subject, predicate, object)  unencoded triple
    # context:                               unencoded context
    #
    # sid, pid, oid                          integer-encoded triple parts
    # enctriple = (sid, pid, oid)            integer-encoded triple
    # cid                                    integer-encoded context

[docs]    def __init__(self, configuration=None, identifier=None):
        super(IOMemory, self).__init__()
        self.__namespace = {}
        self.__prefix = {}

        # Mappings for encoding RDF nodes using integer keys, to save memory
        # in the indexes Note that None is always mapped to itself, to make
        # it easy to test for it in either encoded or unencoded form.
        self.__int2obj = {None: None}  # maps integer keys to objects
        self.__obj2int = {None: None}  # maps objects to integer keys

        # Indexes for each triple part, and a list of contexts for each triple
        self.__subjectIndex = {}    # key: sid    val: set(enctriples)
        self.__predicateIndex = {}  # key: pid    val: set(enctriples)
        self.__objectIndex = {}     # key: oid    val: set(enctriples)
        self.__tripleContexts = {
        }  # key: enctriple    val: {cid1: quoted, cid2: quoted ...}
        self.__contextTriples = {None: set()}  # key: cid    val: set(enctriples)

        # all contexts used in store (unencoded)
        self.__all_contexts = set()
        # default context information for triples
        self.__defaultContexts = None

[docs]    def bind(self, prefix, namespace):
        self.__prefix[namespace] = prefix
        self.__namespace[prefix] = namespace

[docs]    def namespace(self, prefix):
        return self.__namespace.get(prefix, None)

[docs]    def prefix(self, namespace):
        return self.__prefix.get(namespace, None)

[docs]    def namespaces(self):
        for prefix, namespace in iteritems(self.__namespace):
            yield prefix, namespace

[docs]    def add(self, triple, context, quoted=False):
        Store.add(self, triple, context, quoted)

        if context is not None:
            self.__all_contexts.add(context)

        enctriple = self.__encodeTriple(triple)
        sid, pid, oid = enctriple

        self.__addTripleContext(enctriple, context, quoted)

        if sid in self.__subjectIndex:
            self.__subjectIndex[sid].add(enctriple)
        else:
            self.__subjectIndex[sid] = set([enctriple])

        if pid in self.__predicateIndex:
            self.__predicateIndex[pid].add(enctriple)
        else:
            self.__predicateIndex[pid] = set([enctriple])

        if oid in self.__objectIndex:
            self.__objectIndex[oid].add(enctriple)
        else:
            self.__objectIndex[oid] = set([enctriple])

[docs]    def remove(self, triplepat, context=None):
        req_cid = self.__obj2id(context)
        for triple, contexts in self.triples(triplepat, context):
            enctriple = self.__encodeTriple(triple)
            for cid in self.__getTripleContexts(enctriple):
                if context is not None and req_cid != cid:
                    continue
                self.__removeTripleContext(enctriple, cid)
            ctxs = self.__getTripleContexts(enctriple, skipQuoted=True)
            if None in ctxs and (context is None or len(ctxs) == 1):
                self.__removeTripleContext(enctriple, None)
            if len(self.__getTripleContexts(enctriple)) == 0:
                # triple has been removed from all contexts
                sid, pid, oid = enctriple
                self.__subjectIndex[sid].remove(enctriple)
                self.__predicateIndex[pid].remove(enctriple)
                self.__objectIndex[oid].remove(enctriple)

                del self.__tripleContexts[enctriple]

        if not req_cid is None and \
                req_cid in self.__contextTriples and \
                len(self.__contextTriples[req_cid]) == 0:
            # all triples are removed out of this context
            # and it's not the default context so delete it
            del self.__contextTriples[req_cid]

        if triplepat == (None, None, None) and \
                context in self.__all_contexts and \
                not self.graph_aware:
            # remove the whole context
            self.__all_contexts.remove(context)

[docs]    def triples(self, triplein, context=None):
        if context is not None:
            if context == self:  # hmm...does this really ever happen?
                context = None

        cid = self.__obj2id(context)
        enctriple = self.__encodeTriple(triplein)
        sid, pid, oid = enctriple

        # all triples case (no triple parts given as pattern)
        if sid is None and pid is None and oid is None:
            return self.__all_triples(cid)

        # optimize "triple in graph" case (all parts given)
        if sid is not None and pid is not None and oid is not None:
            if sid in self.__subjectIndex and \
               enctriple in self.__subjectIndex[sid] and \
               self.__tripleHasContext(enctriple, cid):
                return ((triplein, self.__contexts(enctriple)) for i in [0])
            else:
                return self.__emptygen()

        # remaining cases: one or two out of three given
        sets = []
        if sid is not None:
            if sid in self.__subjectIndex:
                sets.append(self.__subjectIndex[sid])
            else:
                return self.__emptygen()
        if pid is not None:
            if pid in self.__predicateIndex:
                sets.append(self.__predicateIndex[pid])
            else:
                return self.__emptygen()
        if oid is not None:
            if oid in self.__objectIndex:
                sets.append(self.__objectIndex[oid])
            else:
                return self.__emptygen()

        # to get the result, do an intersection of the sets (if necessary)
        if len(sets) > 1:
            enctriples = sets[0].intersection(*sets[1:])
        else:
            enctriples = sets[0].copy()

        return ((self.__decodeTriple(enctriple), self.__contexts(enctriple))
                for enctriple in enctriples
                if self.__tripleHasContext(enctriple, cid))

[docs]    def contexts(self, triple=None):
        if triple is None or triple == (None, None, None):
            return (context for context in self.__all_contexts)

        enctriple = self.__encodeTriple(triple)
        sid, pid, oid = enctriple
        if sid in self.__subjectIndex and enctriple in self.__subjectIndex[sid]:
            return self.__contexts(enctriple)
        else:
            return self.__emptygen()

[docs]    def __len__(self, context=None):
        cid = self.__obj2id(context)
        if cid not in self.__contextTriples:
            return 0
        return len(self.__contextTriples[cid])

[docs]    def add_graph(self, graph):
        if not self.graph_aware:
            Store.add_graph(self, graph)
        else:
            self.__all_contexts.add(graph)

[docs]    def remove_graph(self, graph):
        if not self.graph_aware:
            Store.remove_graph(self, graph)
        else:
            self.remove((None, None, None), graph)
            try:
                self.__all_contexts.remove(graph)
            except KeyError:
                pass  # we didn't know this graph, no problem

    # internal utility methods below

    def __addTripleContext(self, enctriple, context, quoted):
        """add the given context to the set of contexts for the triple"""
        cid = self.__obj2id(context)

        sid, pid, oid = enctriple
        if sid in self.__subjectIndex and enctriple in self.__subjectIndex[sid]:
            # we know the triple exists somewhere in the store
            if enctriple not in self.__tripleContexts:
                # triple exists with default ctx info
                # start with a copy of the default ctx info
                self.__tripleContexts[
                    enctriple] = self.__defaultContexts.copy()

            self.__tripleContexts[enctriple][cid] = quoted
            if not quoted:
                self.__tripleContexts[enctriple][None] = quoted
        else:
            # the triple didn't exist before in the store
            if quoted:  # this context only
                self.__tripleContexts[enctriple] = {cid: quoted}
            else:  # default context as well
                self.__tripleContexts[enctriple] = {cid: quoted, None: quoted}

        # if the triple is not quoted add it to the default context
        if not quoted:
            self.__contextTriples[None].add(enctriple)

        # always add the triple to given context, making sure it's initialized
        if cid not in self.__contextTriples:
            self.__contextTriples[cid] = set()
        self.__contextTriples[cid].add(enctriple)

        # if this is the first ever triple in the store, set default ctx info
        if self.__defaultContexts is None:
            self.__defaultContexts = self.__tripleContexts[enctriple]

        # if the context info is the same as default, no need to store it
        if self.__tripleContexts[enctriple] == self.__defaultContexts:
            del self.__tripleContexts[enctriple]

    def __getTripleContexts(self, enctriple, skipQuoted=False):
        """return a list of (encoded) contexts for the triple, skipping
           quoted contexts if skipQuoted==True"""

        ctxs = self.__tripleContexts.get(enctriple, self.__defaultContexts)

        if not skipQuoted:
            return ctxs.keys()

        return [cid for cid, quoted in ctxs.items() if not quoted]

    def __tripleHasContext(self, enctriple, cid):
        """return True iff the triple exists in the given context"""
        ctxs = self.__tripleContexts.get(enctriple, self.__defaultContexts)
        return (cid in ctxs)

    def __removeTripleContext(self, enctriple, cid):
        """remove the context from the triple"""
        ctxs = self.__tripleContexts.get(
            enctriple, self.__defaultContexts).copy()
        del ctxs[cid]
        if ctxs == self.__defaultContexts:
            del self.__tripleContexts[enctriple]
        else:
            self.__tripleContexts[enctriple] = ctxs
        self.__contextTriples[cid].remove(enctriple)

    def __obj2id(self, obj):
        """encode object, storing it in the encoding map if necessary,
           and return the integer key"""
        if obj not in self.__obj2int:
            id = randid()
            while id in self.__int2obj:
                id = randid()
            self.__obj2int[obj] = id
            self.__int2obj[id] = obj
            return id
        return self.__obj2int[obj]

    def __encodeTriple(self, triple):
        """encode a whole triple, returning the encoded triple"""
        return tuple(map(self.__obj2id, triple))

    def __decodeTriple(self, enctriple):
        """decode a whole encoded triple, returning the original
        triple"""
        return tuple(map(self.__int2obj.get, enctriple))

    def __all_triples(self, cid):
        """return a generator which yields all the triples (unencoded)
           of the given context"""
        if cid not in self.__contextTriples:
            return
        for enctriple in self.__contextTriples[cid].copy():
            yield self.__decodeTriple(enctriple), self.__contexts(enctriple)

    def __contexts(self, enctriple):
        """return a generator for all the non-quoted contexts
           (unencoded) the encoded triple appears in"""
        return (self.__int2obj.get(cid) for cid in self.__getTripleContexts(enctriple, skipQuoted=True) if cid is not None)

    def __emptygen(self):
        """return an empty generator"""
        if False:
            yield


def randid(randint=random.randint, choice=random.choice, signs=(-1, 1)):
    return choice(signs) * randint(1, 2000000000)


del random