Source code for smqtk.algorithms.descriptor_generator

import abc
import numpy

from smqtk.algorithms import SmqtkAlgorithm
from smqtk.representation import DescriptorElementFactory
from smqtk.representation.descriptor_element.local_elements import \
    DescriptorMemoryElement
from smqtk.utils import ContentTypeValidator
from smqtk.utils.parallel import parallel_map


DFLT_DESCRIPTOR_FACTORY = DescriptorElementFactory(DescriptorMemoryElement, {})


[docs]class DescriptorGenerator (SmqtkAlgorithm, ContentTypeValidator): """ Base abstract Feature Descriptor interface """
[docs] def compute_descriptor(self, data, descr_factory=DFLT_DESCRIPTOR_FACTORY, overwrite=False): """ Given some data, return a descriptor element containing a descriptor vector. :raises RuntimeError: Descriptor extraction failure of some kind. :raises ValueError: Given data element content was not of a valid type with respect to this descriptor. :param data: Some kind of input data for the feature descriptor. :type data: smqtk.representation.DataElement :param descr_factory: Factory instance to produce the wrapping descriptor element instance. The default factory produces ``DescriptorMemoryElement`` instances by default. :type descr_factory: smqtk.representation.DescriptorElementFactory :param overwrite: Whether or not to force re-computation of a descriptor vector for the given data even when there exists a precomputed vector in the generated DescriptorElement as generated from the provided factory. This will overwrite the persistently stored vector if the provided factory produces a DescriptorElement implementation with such storage. :type overwrite: bool :return: Result descriptor element. UUID of this output descriptor is the same as the UUID of the input data element. :rtype: smqtk.representation.DescriptorElement """ # Check content type against listed valid types ct = data.content_type() if ct not in self.valid_content_types(): self._log.error("Cannot compute descriptor from content type '%s' " "data: %s)" % (ct, data)) raise ValueError("Cannot compute descriptor from content type '%s' " "data: %s)" % (ct, data)) # Produce the descriptor element container via the provided factory # - If the generated element already contains a vector, because the # implementation provides some kind of persistent caching mechanism # or something, don't compute another descriptor vector unless the # overwrite flag is True descr_elem = descr_factory.new_descriptor(self.name, data.uuid()) if overwrite or not descr_elem.has_vector(): vec = self._compute_descriptor(data) descr_elem.set_vector(vec) else: self._log.debug("Found existing vector in generated element: %s", descr_elem) return descr_elem
[docs] def compute_descriptor_async(self, data_iter, descr_factory=DFLT_DESCRIPTOR_FACTORY, overwrite=False, procs=None, **kwds): """ Asynchronously compute feature data for multiple data items. Base implementation additional keyword arguments: use_mp [= False] If multi-processing should be used vs. multi-threading. :param data_iter: Iterable of data elements to compute features for. These must have UIDs assigned for feature association in return value. :type data_iter: collections.Iterable[smqtk.representation.DataElement] :param descr_factory: Factory instance to produce the wrapping descriptor element instance. The default factory produces ``DescriptorMemoryElement`` instances by default. :type descr_factory: smqtk.representation.DescriptorElementFactory :param overwrite: Whether or not to force re-computation of a descriptor vectors for the given data even when there exists precomputed vectors in the generated DescriptorElements as generated from the provided factory. This will overwrite the persistently stored vectors if the provided factory produces a DescriptorElement implementation such storage. :type overwrite: bool :param procs: Optional specification of how many processors to use when pooling sub-tasks. If None, we attempt to use all available cores. :type procs: int | None :raises ValueError: An input DataElement was of a content type that we cannot handle. :return: Mapping of input DataElement UUIDs to the computed descriptor element for that data. DescriptorElement UUID's are congruent with the UUID of the data element it is the descriptor of. :rtype: dict[collections.Hashable, smqtk.representation.DescriptorElement] """ self._log.info("Async compute features") use_mp = kwds.get('use_mp', False) def work(d): return d, self.compute_descriptor(d, descr_factory, overwrite) results = parallel_map(work, data_iter, cores=procs, ordered=False, use_multiprocessing=use_mp) de_map = {} for data, descriptor in results: de_map[data.uuid()] = descriptor return de_map
@abc.abstractmethod def _compute_descriptor(self, data): """ Internal method that defines the generation of the descriptor vector for a given data element. This returns a numpy array. This method is only called if the data element has been verified to be of a valid content type for this descriptor implementation. :raises RuntimeError: Feature extraction failure of some kind. :param data: Some kind of input data for the feature descriptor. :type data: smqtk.representation.DataElement :return: Feature vector. :rtype: numpy.core.multiarray.ndarray """