Source code for smqtk.algorithms.classifier._interface_classifier

import abc
from collections import deque
import itertools

from six.moves import zip

from smqtk.algorithms import SmqtkAlgorithm
from smqtk.representation import DescriptorElement

from ._defaults import DFLT_CLASSIFIER_FACTORY


class Classifier (SmqtkAlgorithm):
    """
    Interface for algorithms that classify input descriptors into discrete
    labels and/or label confidences.
    """
    @abc.abstractmethod
    def get_labels(self):
        """
        Get the sequence of class labels that this classifier can classify
        descriptors into. This includes the negative or background label if
        the classifier embodies such a concept.

        :return: Sequence of possible classifier labels.
        :rtype: collections.Sequence[collections.Hashable]

        :raises RuntimeError: No model loaded.
        """
    @abc.abstractmethod
    def _classify_arrays(self, array_iter):
        """
        Overridable method for classifying an iterable of descriptor vectors.

        At this level, all input arrays are guaranteed to be of consistent
        dimensionality.

        Each classification mapping should contain confidence values for each
        label the configured model contains. Implementations may act in a
        discrete manner whereby only one label is marked with a ``1`` value
        (others being ``0``), or in a continuous manner whereby each label is
        given a confidence-like value in the [0, 1] range.

        :param collections.Iterable[numpy.ndarray] array_iter:
            Iterable of arrays to be classified.

        :return: Iterable of dictionaries, parallel in association to the
            input descriptor vectors. Each dictionary should map labels to
            associated confidence values.
        :rtype: collections.Iterable[dict[collections.Hashable, float]]
        """
    @staticmethod
    def _assert_array_dim_consistency(array_iter):
        """
        Assert that arrays are consistent in dimensionality across iterated
        arrays. Currently we only support iterating single-dimension vectors.
        Arrays of more than one dimension (i.e. 2D matrices, etc.) will
        trigger a ValueError.

        :param collections.Iterable[numpy.ndarray] array_iter:
            Iterable numpy arrays.

        :raises ValueError: Not all input arrays were of consistent
            dimensionality.

        :return: Iterable of the same arrays in the same order, but validated
            to be of common dimensionality.
        """
        dim = None
        for a in array_iter:
            if a.ndim > 1:
                raise ValueError("Input vector had more than one dimension! "
                                 "(ndim = {})".format(a.ndim))
            elif dim is None:
                dim = a.size
            elif a.size != dim:
                raise ValueError("Input vector violated dimension consistency "
                                 "(basis == {}, violation == {})"
                                 .format(dim, a.size))
            yield a
    def classify_arrays(self, array_iter):
        """
        Classify an input iterable of numpy arrays into a parallel iterable of
        label-to-confidence mappings (dictionaries).

        Each classification mapping should contain confidence values for each
        label the configured model contains. Implementations may act in a
        discrete manner whereby only one label is marked with a ``1`` value
        (others being ``0``), or in a continuous manner whereby each label is
        given a confidence-like value in the [0, 1] range.

        :param collections.Iterable[numpy.ndarray] array_iter:
            Iterable of arrays to be classified.

        :raises ValueError: Input arrays were not all of consistent
            dimensionality.

        :return: Iterable of dictionaries, parallel in association to the
            input descriptor vectors. Each dictionary should map labels to
            associated confidence values.
        :rtype: collections.Iterable[dict[collections.Hashable, float]]
        """
        return self._classify_arrays(
            self._assert_array_dim_consistency(array_iter)
        )
    def classify_elements(self, descr_iter, factory=DFLT_CLASSIFIER_FACTORY,
                          overwrite=False, d_elem_batch=100):
        """
        Classify an input iterable of descriptor elements into a parallel
        iterable of classification elements.

        Classification element UIDs are inherited from the descriptor elements
        they were generated from.

        We invoke ``classify_arrays`` for actual generation of classification
        results: classifications are computed for factory-generated
        classification elements that do not yet have classifications stored,
        or for all input descriptor elements if the ``overwrite`` flag is
        True. See the documentation for that method for further details.

        **Selective Iteration**
        For situations when it is desired to access specific generator
        returns, like when only one descriptor element is provided in order to
        get a single element out, it is strongly recommended to expand the
        returned generator into a sequence type first. For example, expanding
        out the generator's returns into a list
        (``list(c.classify_elements([e]))[0]``) is recommended over just
        getting the "next" element of the returned generator
        (``next(c.classify_elements([e]))``). Expansion into a sequence allows
        the generator to fully execute, which includes any functionality after
        the final ``yield`` statement in any of the underlying iterators that
        may perform required clean-up.

        **Non-redundant Processing**
        Certain classification element implementations, as dictated by the
        input factory, may be connected to persistent storage in the
        background. Because of this, some classification elements may already
        "have" classification results on construction. This method, by
        default, only computes new classification results for descriptor
        elements whose associated classification element does not report as
        already containing results. If the ``overwrite`` flag is True then
        classifications are computed for all input descriptor elements and
        results are set to their respective classification elements regardless
        of existing result storage.

        :param collections.Iterable[DescriptorElement] descr_iter:
            Iterable of DescriptorElement instances to be classified.
        :param smqtk.representation.ClassificationElementFactory factory:
            Classification element factory. The default factory yields
            MemoryClassificationElement instances.
        :param bool overwrite:
            Recompute classification of the input descriptors and set the
            results to the ClassificationElements produced by the factory.
        :param int d_elem_batch:
            The number of descriptor elements to collect before requesting
            the whole batch's vectors at once via the
            ``DescriptorElement.get_many_vectors`` method.

        :raises ValueError: Either: (A) one or more input descriptor elements
            did not have a stored vector, or (B) input descriptor element
            arrays were not all of consistent dimensionality.
        :raises IndexError: Implementation of ``_classify_arrays`` either
            under- or over-produced classifications relative to the number of
            input descriptor vectors.

        :return: Iterator of result ClassificationElement instances. UUIDs of
            generated ClassificationElement instances will reflect the UUID of
            the DescriptorElement each was computed from.
        :rtype:
            collections.Iterator[smqtk.representation.ClassificationElement]
        """
        log_debug = self._log.debug

        if d_elem_batch <= 0:
            self._log.warning("Descriptor element batching value <= 0, "
                              "defaulting to using value of 1.")
            d_elem_batch = 1

        # Queue populated by ``iter_tocompute_arrays`` with
        # ClassificationElement instances paired with a flag indicating
        # whether a classification was to be computed for that element.
        # Using deques so we can efficiently popleft off of them in the below
        # for-loop. This way we do not retain elements and booleans for things
        # we have yielded that would otherwise build up if this method
        # iterated for a long time.
        #: :type: deque[(smqtk.representation.ClassificationElement, bool)]
        elem_and_status_q = deque()

        # Flag for end of data iteration. When not None, this will be the
        # index of the last descriptor/classification element to be yielded.
        # This will NOT be the number of elements to be yielded; that would be
        # ``end_of_iter[0]+1``.
        #: :type: list[int|None]
        end_of_iter = [None]

        # TODO: Make generator threadsafe?
        # See: https://anandology.com/blog/using-iterators-and-generators/
        def iter_tocompute_arrays():
            """
            Yield descriptor vectors for classification elements that still
            need computing.

            :rtype: typing.Generator[numpy.ndarray]
            """
            # Force into an iterator.
            descr_iterator = iter(descr_iter)
            # Running var for the index of the final data element in the input
            # iterator. This will be -1 or the value of the final index in the
            # parallel lists.
            last_i = -1
            # Make successive islices into the iterator of descriptor elements
            # to produce batches. We end when there is nothing left being
            # returned by the iterator.
            de_batch_list = \
                list(itertools.islice(descr_iterator, d_elem_batch))
            while de_batch_list:
                # Get vectors from batch using implementation-level batch
                # aggregation methods where applicable.
                de_batch_vecs = \
                    DescriptorElement.get_many_vectors(de_batch_list)

                for d_elem, d_vec in zip(de_batch_list, de_batch_vecs):
                    d_uid = d_elem.uuid()
                    if d_vec is None:
                        raise ValueError("Encountered DescriptorElement with "
                                         "no vector stored! (UID=`{}`)"
                                         .format(d_uid))
                    c_elem_ = factory.new_classification(self.name, d_uid)
                    already_computed = \
                        not overwrite and c_elem_.has_classifications()
                    elem_and_status_q.append((c_elem_, already_computed))
                    if not already_computed:
                        # Classifications should be computed for this
                        # descriptor.
                        log_debug("Yielding descriptor array with UID `{}` "
                                  "for classification generation."
                                  .format(d_uid))
                        yield d_vec
                    else:
                        log_debug("Classification already generated for UID "
                                  "`{}`.".format(d_uid))

                last_i += len(de_batch_vecs)

                # Slice out the next batch of descriptor elements. This will
                # be empty if the iterator has been exhausted.
                de_batch_list = list(
                    itertools.islice(descr_iterator, d_elem_batch)
                )

            end_of_iter[0] = last_i

        classification_iter = self.classify_arrays(iter_tocompute_arrays())
        for c_i, c in enumerate(classification_iter):
            # These pops would fail with an IndexError if there is nothing
            # left from parallel allocation within ``iter_tocompute_arrays``.
            # This usually means that the implementation's
            # ``_classify_arrays`` is generating more classifications than
            # there are descriptor element slots to fill.
            try:
                c_elem, c_already_computed = elem_and_status_q.popleft()
            except IndexError:
                # Translate the index error into one with a more informative
                # message.
                raise IndexError(
                    "Implementation's ``_classify_arrays`` over-produced "
                    "classifications relative to input descriptor vectors."
                )
            # Forwarding the ``classification_iter`` generator should forward
            # the ``iter_tocompute_arrays`` iterator, thus populating the
            # ``elem_and_status_q`` by some amount. The current ``c`` should
            # be used to populate the next ClassificationElement with a False
            # ``already_computed`` flag. Yield classification elements that
            # already had classifications until we hit an element that was
            # flagged for computation.
            while c_already_computed:
                yield c_elem
                # We clearly have a classification from the result of
                # computation, so there should logically be some future
                # element in which to store this result.
                c_elem, c_already_computed = elem_and_status_q.popleft()

            # We've arrived at an element that was flagged for computation, so
            # store the result.
            log_debug("Setting computed classification to element UID=`{}`"
                      .format(c_elem.uuid))
            c_elem.set_classification(c)
            yield c_elem

        # At this point, the ``iter_tocompute_arrays`` iterator should have
        # completed due to the ``self.classify_arrays`` method iterating
        # through it completely, resulting in assignment to
        # ``end_of_iter[0]``.
        # This also indicates that nothing more should be being added to
        # ``elem_and_status_q``.
        assert end_of_iter[0] is not None, \
            "EoI value has not yet been assigned a value even though " \
            "``classify_arrays`` completed. Implementation of " \
            "``_classify_arrays`` may not have fully iterated through the " \
            "input numpy.ndarray iterable."

        # Finish yielding any "already-computed" classification elements that
        # are past the last computed element index.
        for c, already_comp in elem_and_status_q:
            # If an element's already-computed state is False at this point,
            # then the implementation's ``_classify_arrays`` method must not
            # have yielded enough arrays to fill the elements that were
            # flagged for classification.
            if not already_comp:
                raise IndexError(
                    "Implementation's ``_classify_arrays`` under-produced "
                    "classifications to fill elements that were flagged for "
                    "computation."
                )
            yield c
    def classify_one_element(self, descr_elem,
                             factory=DFLT_CLASSIFIER_FACTORY,
                             overwrite=False):
        """
        Convenience method around ``classify_elements`` for the single-input
        case.

        See documentation for the :meth:`Classifier.classify_elements` method
        for more information.

        :param DescriptorElement descr_elem:
            DescriptorElement instance to be classified.
        :param smqtk.representation.ClassificationElementFactory factory:
            Classification element factory. The default factory yields
            MemoryClassificationElement instances.
        :param bool overwrite:
            Recompute classification of the input descriptor and set the
            result to the ClassificationElement produced by the factory.

        :raises ValueError: The input descriptor element did not have a stored
            vector.
        :raises IndexError: Implementation of ``_classify_arrays`` either
            under- or over-produced classifications relative to the number of
            input descriptor vectors.

        :return: ClassificationElement instance. The UUID of the generated
            ClassificationElement instance will reflect the UUID of the
            DescriptorElement it was computed from.
        :rtype: smqtk.representation.ClassificationElement
        """
        return list(self.classify_elements(
            [descr_elem], factory=factory, overwrite=overwrite,
            d_elem_batch=1
        ))[0]
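
The following is a minimal sketch, not part of this module, of how the interface above might be implemented and exercised. The ``MeanThresholdClassifier`` class, its label set, and its use of the vector mean as a confidence are assumptions for illustration only; the ``DescriptorMemoryElement`` import and ``get_classification`` call assume SMQTK's in-memory descriptor and classification element implementations.

import numpy

from smqtk.algorithms.classifier import Classifier
from smqtk.representation.descriptor_element.local_elements import \
    DescriptorMemoryElement


class MeanThresholdClassifier (Classifier):
    """
    Hypothetical two-label classifier: "positive" confidence is the mean of
    the input vector clipped to [0, 1]; "negative" is its complement.
    """

    @classmethod
    def is_usable(cls):
        # No optional dependencies are required by this toy implementation.
        return True

    def get_config(self):
        # No constructor parameters to serialize.
        return {}

    def get_labels(self):
        return ['negative', 'positive']

    def _classify_arrays(self, array_iter):
        # Yield one label-to-confidence mapping per input vector, in order.
        for vec in array_iter:
            pos = float(numpy.clip(vec.mean(), 0.0, 1.0))
            yield {'positive': pos, 'negative': 1.0 - pos}


if __name__ == '__main__':
    d = DescriptorMemoryElement('example', 0)
    d.set_vector(numpy.array([0.2, 0.9, 0.7]))
    c = MeanThresholdClassifier().classify_one_element(d)
    # With the default in-memory factory this prints the stored mapping,
    # e.g. {'positive': 0.6, 'negative': 0.4}.
    print(c.get_classification())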