Source code for PyOpenWorm.dataObject

import rdflib as R
from .data import DataUser
from .configure import BadConf
import itertools as IT
import traceback
import logging

# Public names exported by a star-import of this module.
__all__ = ["DataObject", "Property", "SimpleProperty", "values"]
# Module-level logger, named after this module.
L = logging.getLogger(__name__)

# in general it should be possible to recover the entire object from its identifier: the object should be representable as a connected graph.
# However, this need not be a connected *RDF* graph. Indeed, graph literals may hold information which can yield triples which are not
# connected by an actual node

def _bnode_to_var(x):
    """ Spell the given blank-node label as a SPARQL variable by prefixing it with ``?`` """
    return "".join(("?", x))

def _rdf_identifier_to_gp(x):
    """ Render one RDF term as it should appear inside a SPARQL graph pattern

    Blank nodes and URIs recognised by ``DataObject._is_variable`` are
    rendered as query variables; every other term is rendered with its own
    ``n3()`` serialization.
    """
    if isinstance(x, R.BNode):
        return _bnode_to_var(x)

    is_graph_variable = isinstance(x, R.URIRef) and DataObject._is_variable(x)
    if is_graph_variable:
        return DataObject._graph_variable_to_var(x).n3()

    return x.n3()

def _rdf_literal_to_python(x):
    """ Convert an rdflib Literal into a plain Python value

    Anything which is not an ``rdflib.Literal`` is returned untouched. If
    ``toPython()`` still hands back a Literal, its ``str()`` form is returned
    instead.
    """
    if not isinstance(x, R.Literal):
        return x

    converted = x.toPython()
    if isinstance(converted, R.Literal):
        return str(converted)
    return converted

def _triples_to_bgp(trips):
    """ Build a SPARQL basic graph pattern from an iterable of triples

    The terms of each triple are separated by single spaces and the triples
    are joined by `` .`` followed by a newline. An empty iterable gives an
    empty string.
    """
    # XXX: Collisions could result between the variable names of different objects
    patterns = []
    for triple in trips:
        patterns.append(" ".join(_rdf_identifier_to_gp(term) for term in triple))
    return " .\n".join(patterns)

# Registry of DataObject classes (including generated property classes),
# keyed by class name. Filled in by ``DataObject.register`` and
# ``DataObject._create_property``.
_DataObjects = {}
# For each registered class name, the list of its direct base classes which
# are themselves DataObject subclasses. Filled in by ``DataObject.register``.
_DataObjectsParents = {}

class DataObject(DataUser):
    """ An object backed by the database

    Attributes
    -----------
    rdf_type : rdflib.term.URIRef
        The RDF type URI for objects of this type
    rdf_namespace : rdflib.namespace.Namespace
        The rdflib namespace (prefix for URIs) for objects from this class
    properties : list of Property
        Properties belonging to this object
    owner_properties : list of Property
        Properties (of other objects) which have this object as a value
    """
    # Every DataObject is added to the open set when it is constructed and is
    # moved to the closed set by ``removeFromOpenSet``.
    _openSet = set()
    _closedSet = set()
    i = 0

    @classmethod
    def openSet(cls):
        """ Return the set of objects that are currently in the open set """
        return cls._openSet

    def __init__(self,ident=False,triples=False,**kwargs):
        """
        Parameters
        ----------
        ident : rdflib.term.URIRef or string, optional
            A fixed identifier for the object. When omitted, an identifier is
            generated from random data.
        triples : list, optional
            Triples which are released as-is by ``triples()`` in addition to
            those generated for this object.
        **kwargs
            Passed through to ``DataUser.__init__``.

        Raises
        ------
        Exception
            If ``DataUser.__init__`` raises ``BadConf``.
        """
        try:
            DataUser.__init__(self,**kwargs)
        except BadConf:
            raise Exception("You may need to connect to a database before continuing.")

        self._triples = triples if triples else []
        self._is_releasing_triples = False
        self.properties = []
        self.owner_properties = []
        # Used in triples()
        self._id_is_set = False

        if ident:
            self._id = R.URIRef(ident)
            self._id_is_set = True
        else:
            # Randomly generate an identifier if the derived class can't
            # come up with one from the start. Ensures we always have something
            # that functions as an identifier
            import random
            import struct
            v = struct.pack("=2f",random.random(),random.random())
            cname = self.__class__.__name__
            # The same random bytes name the query variable and seed the URI
            self._id_variable = self._graph_variable(cname + v.encode('hex'))
            self._id = self.make_identifier(v)
        DataObject.addToOpenSet(self)

    def __eq__(self,other):
        """ Two DataObjects are equal when their identifiers are equal """
        return isinstance(other,DataObject) and (self.identifier() == other.identifier())

    def __ne__(self,other):
        """ Negation of ``__eq__``

        Python 2 does not derive ``!=`` from ``__eq__``, so it is spelled out
        to keep the two operators consistent.
        """
        # NOTE(review): no __hash__ is defined alongside __eq__; hashing is
        # whatever is inherited from DataUser -- confirm this is intended.
        return not self.__eq__(other)

    def __str__(self):
        """ Return ``ClassName(prop, ...)`` listing the properties which have values """
        s = self.__class__.__name__ + "("
        s +=  ", ".join(str(x) for x in self.properties if x.hasValue())
        s += ")"
        return s

    def __repr__(self):
        return self.__str__()

    def _graph_variable(self,var_name):
        """ Make a variable for storage in the graph """
        return self.conf['rdf.namespace']["variable#"+var_name]

    @classmethod
    def addToOpenSet(cls,o):
        """ Put `o` in the open set """
        cls._openSet.add(o)

    @classmethod
    def removeFromOpenSet(cls,o):
        """ Move `o` from the open set to the closed set

        Does nothing if `o` is already in the closed set.
        """
        if o not in cls._closedSet:
            # discard() rather than remove(): an object missing from the open
            # set should not make closing it fail with a KeyError
            cls._openSet.discard(o)
            cls._closedSet.add(o)

    def id_is_variable(self):
        """ Is the uriref a graph variable? """
        return DataObject._is_variable(self.identifier(query=True))

    @classmethod
    def _is_variable(cls,uri):
        """ Is the uriref a graph variable? """
        # We should be able to extract the type from the identifier
        if not isinstance(uri,R.URIRef):
            return False
        cn = cls._extract_class_name(uri)
        return (cn == 'variable')

    @classmethod
    def _graph_variable_to_var(cls,uri):
        """ Convert a graph-variable URI into an ``rdflib.Variable``

        The variable is named by the fragment of `uri`. If the third
        component of the URI path is not ``variable`` (or the path is too
        short to have one), `uri` is returned unchanged.
        """
        from urlparse import urlparse
        u = urlparse(uri)
        x = u.path.split('/')
        # Guard the index: short paths used to raise IndexError here
        if len(x) > 2 and x[2] == 'variable':
            return R.Variable(u.fragment)
        return uri

    @classmethod
    def _graph_variable_to_var0(cls,uri):
        """ Convert a graph-variable URI into a ``?name`` string

        Returns None if `uri` is not a graph-variable URI.
        """
        from urlparse import urlparse
        u = urlparse(uri)
        x = u.path.split('/')
        # Guard the index: short paths used to raise IndexError here
        if len(x) > 2 and x[2] == 'variable':
            return "?"+u.fragment
        return None

    def identifier(self,query=False):
        """
        The identifier for this object in the rdf graph.

        This identifier may be randomly generated, but an identifier returned from the
        graph can be used to retrieve the specific object that it refers to.

        Parameters
        ----------
        query : bool
            If true and no identifier was given at construction, the graph
            variable standing in for this object is returned instead.
        """
        if query and not self._id_is_set:
            return self._id_variable
        else:
            return self._id

    def make_identifier(self, data):
        """ Make a URI in this class's namespace from the MD5 digest of ``str(data)`` """
        import hashlib
        return R.URIRef(self.rdf_namespace["a"+hashlib.md5(str(data)).hexdigest()])

    def triples(self, query=False, visited_list=False, **kwargs):
        """
        Should be overridden by derived classes to return appropriate triples

        Parameters
        ----------
        query : bool
            Passed to ``identifier()`` here and on each property
        visited_list : set, optional
            Objects which have already released their triples. Used to stop
            cycles; a fresh set is made when omitted.
        **kwargs
            Passed through to the ``triples()`` of each property

        Returns
        --------
        An iterable of triples
        """
        if visited_list is False:
            visited_list = set()

        if self in visited_list:
            return
        visited_list.add(self)

        ident = self.identifier(query=query)
        yield (ident, R.RDF['type'], self.rdf_type)

        # For objects that are defined by triples, we can just release these.
        # However, they are still data objects, so they must have the above
        # triples released as well.
        for x in self._triples:
            yield x

        # Properties (of type Property) can be attached to an object
        # However, we won't require that there even is a property list in this
        # case.
        if hasattr(self, 'properties'):
            for x in self.properties:
                if isinstance(x, SimpleProperty):
                    # A SimpleProperty is linked in only when it has a value
                    # or is currently being queried for
                    if x.hasValue() or x.hasVariable():
                        yield (ident, x.link, x.identifier(query=query))
                        for y in x.triples(query=query, visited_list=visited_list, **kwargs):
                            yield y
                else:
                    for y in x.triples(query=query, visited_list=visited_list, **kwargs):
                        yield y

    def graph_pattern(self,query=False):
        """ Get the graph pattern for this object.

        It should be as simple as converting the result of triples() into a BGP
        """
        visited_list = set()
        return _triples_to_bgp(self.triples(query=query, visited_list=visited_list))

    def save(self):
        """ Write in-memory data to the database. Derived classes should call this to update the store. """

        ss = set()
        self.add_statements(self.triples(visited_list=ss, saving=True))

    def object_from_id(self,identifier,rdf_type=False):
        """ Load an object from the database using its type and id

        Parameters
        ----------
        identifier : rdflib.term.URIRef
            the object's id
        rdf_type : rdflib.term.URIRef
            the object's type. Optional.

        Raises
        ------
        KeyError
            If no class is registered under the name extracted from the URI
        """
        # The class name is extracted from the type URI when one is given and
        # from the identifier itself otherwise
        if rdf_type:
            uri = rdf_type
        else:
            uri = identifier

        cn = self._extract_class_name(uri)
        # if its our class name, then make our own object
        # if there's a part after that, that's the property name
        o = _DataObjects[cn](ident=identifier)
        return o

    @classmethod
    def _extract_class_name(cls,uri):
        """ Return the path component following ``/entities/`` in `uri`, or None """
        from urlparse import urlparse
        u = urlparse(uri)
        x = u.path.split('/')
        if len(x) >= 3 and x[1] == 'entities':
            return x[2]

    @classmethod
    def _extract_property_name(cls,uri):
        """ Return the second path component following ``/entities/`` in `uri`, or None """
        from urlparse import urlparse
        u = urlparse(uri)
        x = u.path.split('/')
        if len(x) >= 4 and x[1] == 'entities':
            return x[3]

    # Must resolve, somehow, to a set of triples that we can manipulate
    # For instance, one or more construct query could represent the object or
    # the triples might be stored in memory.
    @classmethod
    def DatatypeProperty(cls,*args,**kwargs):
        """ Create a SimpleProperty that has a simple type (string,number,etc) as its value

        Parameters
        ----------
        linkName : string
            The name of this Property.
        owner : PyOpenWorm.dataObject.DataObject
            The owner of this Property.
        """
        return cls._create_property(*args,property_type='DatatypeProperty',**kwargs)

    @classmethod
    def ObjectProperty(cls, *args,**kwargs):
        """ Create a SimpleProperty that has a complex DataObject as its value

        Parameters
        ----------
        linkName : string
            The name of this Property.
        owner : PyOpenWorm.dataObject.DataObject
            The owner of this Property.
        value_type : type
            The type of DataObject for values of this property
        """
        return cls._create_property(*args,property_type='ObjectProperty',**kwargs)

    @classmethod
    def _create_property(cls, linkName, owner, property_type, value_type=False, multiple=False):
        """ Create (or reuse) a SimpleProperty subclass and instantiate it for `owner`

        The generated class is named ``<OwnerClass>_<linkName>`` and is cached
        in the module registry, so it is built and registered only once.

        Parameters
        ----------
        linkName : string
            The name of the Property
        owner : PyOpenWorm.dataObject.DataObject
            The object the new Property instance is attached to
        property_type : string
            ``'ObjectProperty'`` or ``'DatatypeProperty'``
        value_type : type, optional
            The type of DataObject for values of an ObjectProperty.
            Defaults to DataObject.
        multiple : bool, optional
            Whether the Property is multivalued
        """
        #XXX This should actually get called for all of the properties when their owner
        #    classes are defined.
        #    The initialization, however, must happen with the owner object's creation
        owner_class = cls
        property_class_name = owner_class.__name__ + "_" + linkName
        if value_type is False:
            value_type = DataObject

        if property_class_name in _DataObjects:
            c = _DataObjects[property_class_name]
        else:
            if property_type == 'ObjectProperty':
                value_rdf_type = value_type.rdf_type
            else:
                value_rdf_type = False
            c = type(property_class_name,
                     (SimpleProperty,),
                     dict(linkName=linkName,
                          property_type=property_type,
                          value_rdf_type=value_rdf_type,
                          owner_type=owner_class,
                          multiple=multiple))
            _DataObjects[property_class_name] = c
            c.register()

        return c(owner=owner)

    @classmethod
    def register(cls):
        """ Registers the class as a DataObject to be included in the configured rdf graph.
            Puts this class under the control of the database for metadata.

        :return: None
        """
        # NOTE: This expects that configuration has been read in and that the database is available
        assert(issubclass(cls, DataObject))
        _DataObjects[cls.__name__] = cls
        _DataObjectsParents[cls.__name__] = [x for x in cls.__bases__ if issubclass(x, DataObject)]
        cls.parents = _DataObjectsParents[cls.__name__]
        cls.rdf_type = cls.conf['rdf.namespace'][cls.__name__]
        cls.rdf_namespace = R.Namespace(cls.rdf_type + "/")
        cls.conf['rdf.namespace_manager'].bind(cls.__name__, cls.rdf_namespace)

    def load(self):
        """ Load in data from the database. Derived classes should override this for their own data structures.

        ``load()`` returns an iterable object which yields DataObjects which have the same class as the object and have, for the Properties set, the same values

        :param self: An object which limits the set of objects which can be returned. Should have the configuration necessary to do the query
        """
        if not DataObject._is_variable(self.identifier(query=True)):
            # The identifier is already fixed, so there is nothing to look up
            yield self
        else:
            def is_type_triple(t):
                # An rdf:type statement about a variable
                return ((t[1] == R.RDF['type']) and (isinstance(t[0], R.Variable) or DataObject._is_variable(t[0])))

            trips = list(self.triples(query=True))
            non_type_trips = [t for t in trips if not is_type_triple(t)]
            type_trips = [t for t in trips if is_type_triple(t)]

            if len(non_type_trips) == 0:
                gp = _triples_to_bgp(type_trips)
            else:
                # Type statements on variables go in a FILTER rather than in
                # the main pattern
                gp = _triples_to_bgp(non_type_trips)
                if type_trips:
                    gp = gp + " . FILTER (EXISTS { "+_triples_to_bgp(type_trips)+" })"

            ident = self.identifier(query=True)
            ident = self._graph_variable_to_var(ident) # XXX: Assuming that this object doesn't have a set identifier
            q = "SELECT DISTINCT {0} {0}_type where {{ {{ {1} }} . {0} rdf:type {0}_type }} ORDER BY {0}".format(ident.n3(), gp)
            qres = self.rdf.query(q)
            results = _QueryResultsTypeResolver(self, qres)()
            for x in results:
                yield x

    def retract(self):
        """ Remove this object from the data store. """
        self.retract_statements(self.graph_pattern(query=True))

    def __getitem__(self, x):
        """ Look `x` up through ``DataUser.__getitem__``, turning a KeyError into an explanatory Exception """
        try:
            return DataUser.__getitem__(self, x)
        except KeyError:
            raise Exception("You attempted to get the value `%s' from `%s'. It isn't here. Perhaps you misspelled the name of a Property?" % (x, self))

    def getOwners(self, property_name):
        """ Return the owners along a property pointing to this object """
        return [x.owner
                for x in self.owner_properties
                if isinstance(x, SimpleProperty) and str(x.link) == str(property_name)]

class _QueryResultsTypeResolver(object):
    # Consumes (identifier, type) rows which arrive sorted by identifier and
    # builds one object per distinct identifier, using the most specific of
    # the types listed for it. Calling the resolver returns the object list.
    def __init__(self, ob, qres):
        self.results = []
        self.qres = iter(qres) # The query results
        self.ob = ob # The DataObject that created this QRTR

    def s(self):
        # Next result row, or a (None, None) sentinel once rows run out
        try:
            return next(self.qres)
        except StopIteration:
            return (None, None)

    def g0(self, ident, types):
        # Accumulate the types seen for the current identifier; when the
        # identifier changes (or the rows run out), emit the object for it.
        current = ident
        collected = types
        while current is not None:
            row = self.s()
            following = row[0]
            following_type = row[1]

            if following != current:
                chosen_type = get_most_specific_rdf_type(collected)
                self.results.append(self.ob.object_from_id(current, chosen_type))
                collected = [following_type]
            else:
                collected = [following_type] + collected
            current = following

    def g(self):
        first = self.s()
        if first[0] is not None:
            self.g0(first[0], [first[1]])

    def __call__(self):
        self.g()
        return self.results

def get_most_specific_rdf_type(types):
    """ Gets the most specific rdf_type.

    Returns the URI corresponding to the lowest in the DataObject class hierarchy
    from among the given URIs.

    Parameters
    ----------
    types : iterable of rdflib.term.URIRef
        Type URIs to choose from

    Returns
    -------
    The ``rdf_type`` of the chosen class. This is ``DataObject.rdf_type`` when
    `types` is empty or none of the URIs maps to a registered class. URIs
    which do not map to a registered class are skipped with a logged warning.
    """
    most_specific_type = DataObject
    for x in types:
        cn = DataObject._extract_class_name(x) # TODO: Make a table to lookup by the class URI
        # Only the registry lookup can raise KeyError, so only it is guarded
        try:
            class_object = _DataObjects[cn]
        except KeyError:
            L.warning("""A Python class named "{}" corresponding to the type URI "{}" couldn't be found.
            You may want to import the module containing the class as well as add additional type
            annotations in order to resolve your objects to a more precise type than DataObject.""".format(cn, x))
            continue
        if issubclass(class_object, most_specific_type):
            most_specific_type = class_object
    return most_specific_type.rdf_type

# Define a property by writing the get
class Property(DataObject):
    """ Holds a value which is associated with a DataObject

    A Property is used by calling it. For a property ``P`` of ``a``::

        a.P()

    returns the values appropriate to ``P`` for ``a`` (its `owner`), while
    calling it with arguments sets a value.

    Parameters
    ----------
    owner : PyOpenWorm.dataObject.DataObject
        The owner of this property
    name : string
        The name of this property. Can be accessed as an attribute like::

            owner.name

    """

    # True when the Property can hold more than one value
    multiple = False

    def __init__(self, name=False, owner=False, **kwargs):
        DataObject.__init__(self, **kwargs)
        self.owner = owner
        if owner:
            # Attach to the owner, expose ourselves under `name` (if given)
            # and leave the open set
            owner.properties.append(self)
            if name:
                setattr(owner, name, self)
            DataObject.removeFromOpenSet(self)
        # XXX: Default implementation is a box for a value
        self._value = False

    def get(self,*args):
        """ Return the things on the far side of this property

        Whatever is returned must be iterable; a lone value can be wrapped
        in a tuple like ``(value,)``.

        Derived classes must override.
        """
        # This should run a query or return a cached value
        raise NotImplementedError()

    def set(self,*args,**kwargs):
        """ Give this property a value

        Derived classes must override.
        """
        # This should set some values and call DataObject.save()
        raise NotImplementedError()

    def one(self):
        """ Return the first value from ``get()``, or None if there is none,
        regardless of whether the ``Property`` is multivalued.
        """
        try:
            return next(iter(self.get()))
        except StopIteration:
            return None

    def hasValue(self):
        """ Tell whether any value has been set on the Property.

        Subclasses may define this differently; here it is always True.
        """
        return True

    def __call__(self,*args,**kwargs):
        """ Set or get, depending on whether arguments are given.

        With arguments, ``set`` is called with them and the ``Property``
        itself is returned. Without arguments, ``get`` is called: when the
        ``multiple`` member is ``True`` a Python set of the values is
        returned, otherwise the first value (or None when there is none).
        """
        if args or kwargs:
            self.set(*args,**kwargs)
            return self

        found = self.get()
        if isinstance(self, SimpleProperty): #XXX: _get is defined only for SimpleProperty objects
            found = IT.chain(found, self._get())

        if self.multiple:
            return set(found)

        try:
            return next(iter(found))
        except StopIteration:
            return None

    # Get the property (a relationship) itself

class SimpleProperty(Property):
    """ A property that has one or more links to literals or DataObjects """

    def __init__(self,**kwargs):
        """ Set up the property; ``**kwargs`` are passed to ``Property.__init__``

        ``linkName``, ``property_type`` and ``owner_type`` are read from the
        class; classes generated by ``DataObject._create_property`` carry them.
        """
        # Fall back to a link name derived from the class name. Note that the
        # fallback is stored on the class, not on the instance.
        if not hasattr(self,'linkName'):
            self.__class__.linkName = self.__class__.__name__ + "property"
        Property.__init__(self, name=self.linkName, **kwargs)
        # Predicate which links this property node to each of its values
        self.value_property = SimpleProperty.rdf_namespace['value']

        # Values set on this property
        self._v = []

        # The variable to be used for querying this property
        self._var = None
        # NOTE(review): an owner created here is not given this property in
        # its `properties` list, since Property.__init__ has already run
        # without an owner -- confirm this is intended.
        if (self.owner==False) and hasattr(self,'owner_type'):
            self.owner = self.owner_type()

        if self.owner != False:
            # XXX: Shouldn't be recreating this here...
            self.link = self.owner_type.rdf_namespace[self.linkName]

    def hasVariable(self):
        """ Returns true while a query variable is assigned to this property """
        return (self._var is not None)

    def hasValue(self):
        """ Returns true if the ``Property`` has had ``load`` called previously and some value was available or if
        ``set`` has been called previously

        :return: True if this data object has a value, False if not.
        """
        return len(self._v) > 0

    def _get(self):
        """ Yield the values held in memory for this property """
        for x in self._v:
            yield x

    def get(self):
        """ Query the configured rdf graph for values of this ``Property``.

        This is a generator, so no query is made until it is iterated.
        Values held in memory are not included here; they are yielded
        by ``_get``.

        If this property's identifier is a graph variable, a SPARQL query is
        built from the owner's graph pattern. Otherwise the values stored under
        this property's identifier are read directly from the graph. Literals
        are converted to Python values for a DatatypeProperty, and objects are
        constructed for an ObjectProperty.
        """
        import random as RND
        if self.id_is_variable():
            try:
                # The variable only needs to exist while the owner's graph
                # pattern is generated, so that the pattern mentions it
                self._var = R.Variable("V"+str(int(RND.random() * 1E10)))
                gp = self.owner.graph_pattern(query=True)
                if self.property_type == 'DatatypeProperty':
                    q = u"SELECT DISTINCT {0} where {{ {1} . }}".format(self._var.n3(), gp)
                elif self.property_type == 'ObjectProperty':
                    q = "SELECT DISTINCT {0} {0}_type where {{ {{ {1} }} . {0} rdf:type {0}_type }} ORDER BY {0}".format(self._var.n3(), gp)
                else:
                    raise Exception("Inappropriate property type "+self.property_type+" in SimpleProperty::get")
            finally:
                self._var = None
            qres = self.rdf.query(q)
            if self.property_type == 'DatatypeProperty':
                for x in qres:
                    if x[0] is not None and not DataObject._is_variable(x[0]):
                        yield _rdf_literal_to_python(x[0])
            elif self.property_type == 'ObjectProperty':
                for x in _QueryResultsTypeResolver(self, qres)():
                    yield x
        else:
            for value in self.rdf.objects(self.identifier(query=False), self.value_property):
                if self.property_type == 'DatatypeProperty':
                    if value is not None and not DataObject._is_variable(value):
                        yield _rdf_literal_to_python(value)
                elif self.property_type == 'ObjectProperty':
                    # Collect the (value, type) pairs for this one value so
                    # that the resolver can pick the most specific type
                    constructed_qres = set()
                    for rdf_type in self.rdf.objects(value, R.RDF['type']):
                        constructed_qres.add((value, rdf_type))

                    for ob in _QueryResultsTypeResolver(self, constructed_qres)():
                        yield ob

    def set(self,v):
        """ Add `v` to the values held for this property

        For an ObjectProperty, this property is also recorded in
        ``v.owner_properties``. A DataObject value is removed from the open set.
        """
        import bisect
        # Sorted insert into the list of values
        bisect.insort(self._v, v)
        if self.property_type == 'ObjectProperty':
            v.owner_properties.append(self)

        if isinstance(v,DataObject):
            DataObject.removeFromOpenSet(v)
        # NOTE(review): called with an empty list; the purpose is not evident
        # from this file -- confirm whether it is still needed.
        self.add_statements([])

    def triples(self,*args,**kwargs):
        """ Yield the triples for this property and its values

        Recognized keyword arguments are ``query`` (passed to ``identifier()``),
        ``visited_list`` (a set of objects which have already been visited) and
        ``saving`` (if true, the type triple for this property is yielded first).
        All arguments are passed on to the ``triples()`` of object values.
        """
        query=kwargs.get('query',False)
        visited_list = kwargs.get('visited_list', False)

        if visited_list == False:
            visited_list = set()

        if self in visited_list:
            return
        else:
            visited_list.add(self)

        ident = self.identifier(query=query)

        if kwargs.get('saving', False):
            yield (self.identifier(), R.RDF['type'], self.rdf_type)

        if len(self._v) > 0:
            # NOTE(review): when the caller supplied a visited_list, `self` is
            # already in it by this point, so the inherited triples() appears
            # to return without yielding anything -- confirm this is intended.
            for x in Property.triples(self,*args,**kwargs):
                yield x
            for x in self._v:
                try:
                    if self.property_type == 'DatatypeProperty':
                        # rdflib terms are used as they are; anything else is
                        # wrapped in a Literal
                        if isinstance(x, R.term.Identifier):
                            yield (ident, self.value_property, x)
                        else:
                            yield (ident, self.value_property, R.Literal(x))
                    elif self.property_type == 'ObjectProperty':
                        yield (ident, self.value_property, x.identifier(query=query))
                        for t in x.triples(*args,**kwargs):
                            yield t
                except Exception:
                    # Best effort: print the traceback and continue with the
                    # next value
                    traceback.print_exc()
        elif query==True:
            # No values: only the query variable (if any) is released
            if self.hasVariable():
                yield (ident, self.value_property, self._var)

    def triples0(self,*args,**kwargs):
        """ A variant of ``triples`` which also yields the owner-to-property link triple

        NOTE(review): not called anywhere in this file as shown; possibly
        retained from an earlier implementation -- confirm before relying on it.
        """
        query=kwargs.get('query',False)
        owner_id = self.owner.identifier(query=query)
        ident = self.identifier(query=query)


        if len(self._v) > 0:
            for x in Property.triples(self,*args,**kwargs):
                yield x
            yield (owner_id, self.link, ident)
            for x in self._v:
                try:
                    if self.property_type == 'DatatypeProperty':
                        yield (ident, self.value_property, R.Literal(x))
                    elif self.property_type == 'ObjectProperty':
                        yield (ident, self.value_property, x.identifier(query=query))
                        for t in x.triples(*args,**kwargs):
                            yield t
                except Exception:
                    traceback.print_exc()
        elif query==True:
            # XXX: Remove this and require that we have a variable in `self._v` before
            #      we release triples that contain variables of any kind
            gv = self._graph_variable(self.linkName)
            yield (owner_id, self.link, ident)
            yield (ident, self.value_property, gv)

    def load(self):
        """ Loads in values to this ``Property`` which have been set for the associated owner,
        or if the owner refers to an unspecified member of its class, loads values which could
        be set based on the constraints on the owner.

        This is a generator which yields this ``Property`` once, after the
        values have been loaded.
        """
        # This load is way simpler since we just need the values for this property
        gp = self.graph_pattern(query=True)
        q = "SELECT DISTINCT ?"+self.linkName+"  where { "+ gp +" . }"
        L.debug('load_query='+q)
        qres = self.conf['rdf.graph'].query(q)
        for k in qres:
            k = k[0]
            value = False
            if not self._is_variable(k):
                if self.property_type == 'ObjectProperty':
                    value = self.object_from_id(k)
                elif self.property_type == 'DatatypeProperty':
                    value = str(k)

                # Falsy values (e.g., an empty string) are not stored
                if value:
                    self._v.append(value)
        yield self

    def identifier(self,query=False):
        """ Return the URI for this object

        Unless an identifier was given at construction, the URI is made from
        the owner's identifier, the link and the values currently held, so it
        changes as values are added.

        Parameters
        ----------
        query: bool
            Indicates whether the identifier is to be used in a query or not

        Raises
        ------
        Exception
            If a value held by an ObjectProperty is not a DataObject
        """
        ident = DataObject.identifier(self,query=query)
        if self._id_is_set:
            return ident

        if query:
            # If we're querying then our identifier should be a variable if either our value is empty
            # or our owner's identifier is a variable
            owner_id = self.owner.identifier(query=query)
            vlen = len(self._v)
            if vlen == 0 or DataObject._is_variable(owner_id):
                return ident

        # Intentional fall through from if statement ...
        value_data = ""
        if self.property_type == 'DatatypeProperty':
            value_data = "".join(str(x) for x in self._v)
        elif self.property_type == 'ObjectProperty':
            for value in self._v:
                if not isinstance(value, DataObject):
                    raise Exception("Values for an ObjectProperty ({}) must be DataObjects. Given '{}'.".format(self, value))
            # A property which holds itself as a value is left out of the data
            value_data = "".join(str(x.identifier()) for x in self._v if self is not x)

        return self.make_identifier((str(self.owner.identifier(query=False)), self.link, value_data))

    def __str__(self):
        """ Return the link name followed by ``=`` and the distinct values, each quoted and separated by ``;`` """
        return unicode(self.linkName + "=" + unicode(";".join(u"`{}'".format(unicode(x)) for x in set(self._v))))


class values(DataObject):
    """
    A named group of objects, kept together for convenience

    Example::

        v = values('unc-13 neurons and muscles')
        n = P.Neuron()
        m = P.Muscle()
        n.receptor('UNC-13')
        m.receptor('UNC-13')
        for x in n.load():
            v.value(x)
        for x in m.load():
            v.value(x)
        # Save the group for later use
        v.save()
        ...
        # get the list back
        u = values('unc-13 neurons and muscles')
        nm = list(u.value())


    Parameters
    ----------
    group_name : string
        A name of the group of objects

    Attributes
    ----------
    name : DatatypeProperty
        The name of the group of objects
    value : ObjectProperty
        An object in the group
    add : ObjectProperty
        an alias for ``value``

    """
    def __init__(self,group_name,**kwargs):
        DataObject.__init__(self,**kwargs)
        # The 'value' property is created before the 'name' property
        member_property = values.ObjectProperty('value', owner=self)
        self.add = member_property
        name_property = values.DatatypeProperty('name', owner=self)
        self.group_name = name_property
        # Record the group's name as the value of the 'name' property
        self.name(group_name)

    def identifier(self, query=False):
        # The identifier is made from the 'name' property; `query` is not used
        name_property = self.group_name
        return self.make_identifier(name_property)