add compute_near_duplicates() method

voxel51 · Dec 2, 2024 · d4f80bc · d4f80bc
1 parent e0678a9
commit d4f80bc
Show file tree

Hide file tree

Showing 14 changed files with 288 additions and 169 deletions.
diff --git a/fiftyone/brain/__init__.py b/fiftyone/brain/__init__.py
@@ -539,6 +539,7 @@ def compute_visualization(
 def compute_similarity(
     samples,
     patches_field=None,
+    roi_field=None,
     embeddings=None,
     brain_key=None,
     model=None,
@@ -592,6 +593,11 @@ def compute_similarity(
             :class:`fiftyone.core.labels.Detections`,
             :class:`fiftyone.core.labels.Polyline`, or
             :class:`fiftyone.core.labels.Polylines`
+        roi_field (None): an optional :class:`fiftyone.core.labels.Detection`,
+            :class:`fiftyone.core.labels.Detections`,
+            :class:`fiftyone.core.labels.Polyline`, or
+            :class:`fiftyone.core.labels.Polylines` field defining a region of
+            interest within each image to use to compute embeddings
         embeddings (None): embeddings to feed the index. This argument's
             behavior depends on whether a ``model`` is provided, as described
             below.
@@ -600,8 +606,9 @@ def compute_similarity(
             embeddings to use:
 
             -   a ``num_samples x num_dims`` array of embeddings
-            -   if ``patches_field`` is specified,  a dict mapping sample IDs
-                to ``num_patches x num_dims`` arrays of patch embeddings
+            -   if ``patches_field``/``roi_field`` is specified, a dict
+                mapping sample IDs to ``num_patches x num_dims`` arrays of
+                patch embeddings
             -   the name of a dataset field from which to load embeddings
             -   ``None``: use the default model to compute embeddings
             -   ``False``: **do not** compute embeddings right now
@@ -614,7 +621,7 @@ def compute_similarity(
 
             In either case, when working with patch embeddings, you can provide
             either the fully-qualified path to the patch embeddings or just the
-            name of the label attribute in ``patches_field``
+            name of the label attribute in ``patches_field``/``roi_field``
         brain_key (None): a brain key under which to store the results of this
             method
         model (None): a :class:`fiftyone.core.models.Model` or the name of a
@@ -626,14 +633,14 @@ def compute_similarity(
             to the model's ``Config`` when a model name is provided
         force_square (False): whether to minimally manipulate the patch
             bounding boxes into squares prior to extraction. Only applicable
-            when a ``model`` and ``patches_field`` are specified
+            when a ``model`` and ``patches_field``/``roi_field`` are specified
         alpha (None): an optional expansion/contraction to apply to the patches
             before extracting them, in ``[-1, inf)``. If provided, the length
             and width of the box are expanded (or contracted, when
             ``alpha < 0``) by ``(100 * alpha)%``. For example, set
             ``alpha = 0.1`` to expand the boxes by 10%, and set
             ``alpha = -0.1`` to contract the boxes by 10%. Only applicable when
-            a ``model`` and ``patches_field`` are specified
+            a ``model`` and ``patches_field``/``roi_field`` are specified
         batch_size (None): an optional batch size to use when computing
             embeddings. Only applicable when a ``model`` is provided
         num_workers (None): the number of workers to use when loading images.
@@ -660,6 +667,7 @@ def compute_similarity(
     return fbs.compute_similarity(
         samples,
         patches_field,
+        roi_field,
         embeddings,
         brain_key,
         model,
@@ -675,6 +683,111 @@ def compute_similarity(
     )
 
 
+def compute_near_duplicates(
+    samples,
+    threshold=0.2,
+    roi_field=None,
+    embeddings=None,
+    similarity_index=None,
+    model=None,
+    model_kwargs=None,
+    force_square=False,
+    alpha=None,
+    batch_size=None,
+    num_workers=None,
+    skip_failures=True,
+    progress=None,
+):
+    """Detects potential duplicates in the given sample collection.
+
+    This method builds or reuses a similarity index and runs duplicate
+    detection on it. The returned object then exposes the following:
+
+    -   :meth:`duplicate_ids <fiftyone.brain.similarity.DuplicatesMixin.duplicate_ids>`:
+        A list of duplicate IDs
+
+    -   :meth:`neighbors_map <fiftyone.brain.similarity.DuplicatesMixin.neighbors_map>`:
+        A dictionary mapping IDs to lists of ``(dup_id, dist)`` tuples
+
+    -   :meth:`duplicates_view() <fiftyone.brain.similarity.DuplicatesMixin.duplicates_view>`:
+        Returns a view of all duplicates in the input collection
+
+    Args:
+        samples: a :class:`fiftyone.core.collections.SampleCollection`
+        threshold (0.2): the similarity distance threshold to use when
+            detecting duplicates. Values in ``[0.1, 0.25]`` work well for the
+            default setup
+        roi_field (None): an optional :class:`fiftyone.core.labels.Detection`,
+            :class:`fiftyone.core.labels.Detections`,
+            :class:`fiftyone.core.labels.Polyline`, or
+            :class:`fiftyone.core.labels.Polylines` field defining a region of
+            interest within each image to use to detect duplicates
+        embeddings (None): if no ``model`` is provided, this argument specifies
+            pre-computed embeddings to use, which can be any of the following:
+
+            -   a ``num_samples x num_dims`` array of embeddings
+            -   if ``roi_field`` is specified, a dict mapping sample IDs to
+                ``num_patches x num_dims`` arrays of patch embeddings
+            -   the name of a dataset field containing the embeddings to use
+
+            If a ``model`` is provided, this argument specifies the name of a
+            field in which to store the computed embeddings. In either case,
+            when working with patch embeddings, you can provide either the
+            fully-qualified path to the patch embeddings or just the name of
+            the label attribute in ``roi_field``
+        similarity_index (None): a
+            :class:`fiftyone.brain.similarity.SimilarityIndex` or the brain key
+            of a similarity index to use to load pre-computed embeddings
+        model (None): a :class:`fiftyone.core.models.Model` or the name of a
+            model from the
+            `FiftyOne Model Zoo <https://docs.voxel51.com/user_guide/model_zoo/models.html>`_
+            to use to generate embeddings. The model must expose embeddings
+            (``model.has_embeddings = True``)
+        model_kwargs (None): a dictionary of optional keyword arguments to pass
+            to the model's ``Config`` when a model name is provided
+        force_square (False): whether to minimally manipulate the patch
+            bounding boxes into squares prior to extraction. Only applicable
+            when a ``model`` and ``roi_field`` are specified
+        alpha (None): an optional expansion/contraction to apply to the patches
+            before extracting them, in ``[-1, inf)``. If provided, the length
+            and width of the box are expanded (or contracted, when
+            ``alpha < 0``) by ``(100 * alpha)%``. For example, set
+            ``alpha = 0.1`` to expand the boxes by 10%, and set
+            ``alpha = -0.1`` to contract the boxes by 10%. Only applicable when
+            a ``model`` and ``roi_field`` are specified
+        batch_size (None): a batch size to use when computing embeddings. Only
+            applicable when a ``model`` is provided
+        num_workers (None): the number of workers to use when loading images.
+            Only applicable when a Torch-based model is being used to compute
+            embeddings
+        skip_failures (True): whether to gracefully continue without raising an
+            error if embeddings cannot be generated for a sample
+        progress (None): whether to render a progress bar (True/False), use the
+            default value ``fiftyone.config.show_progress_bars`` (None), or a
+            progress callback function to invoke instead
+
+    Returns:
+        a :class:`fiftyone.brain.similarity.SimilarityIndex`
+    """
+    import fiftyone.brain.internal.core.duplicates as fbd
+
+    return fbd.compute_near_duplicates(
+        samples,
+        threshold=threshold,
+        roi_field=roi_field,
+        embeddings=embeddings,
+        similarity_index=similarity_index,
+        model=model,
+        model_kwargs=model_kwargs,
+        force_square=force_square,
+        alpha=alpha,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        skip_failures=skip_failures,
+        progress=progress,
+    )
+
+
 def compute_exact_duplicates(
     samples,
     num_workers=None,
@@ -684,7 +797,7 @@ def compute_exact_duplicates(
     """Detects duplicate media in a sample collection.
 
     This method detects exact duplicates with the same filehash. Use
-    :meth:`compute_similarity` to detect near-duplicate images.
+    :meth:`compute_near_duplicates` to detect near-duplicates.
 
     If duplicates are found, the first instance in ``samples`` will be the key
     in the returned dictionary, while the subsequent duplicates will be the
@@ -714,10 +827,13 @@ def compute_leaky_splits(
     samples,
     splits,
     threshold=0.2,
+    roi_field=None,
     embeddings=None,
     similarity_index=None,
     model=None,
     model_kwargs=None,
+    force_square=False,
+    alpha=None,
     batch_size=None,
     num_workers=None,
     skip_failures=True,
@@ -752,14 +868,24 @@ def compute_leaky_splits(
         threshold (0.2): the similarity distance threshold to use when
             detecting leaks. Values in ``[0.1, 0.25]`` work well for the
             default setup
+        roi_field (None): an optional :class:`fiftyone.core.labels.Detection`,
+            :class:`fiftyone.core.labels.Detections`,
+            :class:`fiftyone.core.labels.Polyline`, or
+            :class:`fiftyone.core.labels.Polylines` field defining a region of
+            interest within each image to use to compute leaks
         embeddings (None): if no ``model`` is provided, this argument specifies
             pre-computed embeddings to use, which can be any of the following:
 
             -   a ``num_samples x num_dims`` array of embeddings
+            -   if ``roi_field`` is specified, a dict mapping sample IDs to
+                ``num_patches x num_dims`` arrays of patch embeddings
             -   the name of a dataset field containing the embeddings to use
 
             If a ``model`` is provided, this argument specifies the name of a
-            field in which to store the computed embeddings
+            field in which to store the computed embeddings. In either case,
+            when working with patch embeddings, you can provide either the
+            fully-qualified path to the patch embeddings or just the name of
+            the label attribute in ``roi_field``
         similarity_index (None): a
             :class:`fiftyone.brain.similarity.SimilarityIndex` or the brain key
             of a similarity index to use to load pre-computed embeddings
@@ -770,6 +896,16 @@ def compute_leaky_splits(
             (``model.has_embeddings = True``)
         model_kwargs (None): a dictionary of optional keyword arguments to pass
             to the model's ``Config`` when a model name is provided
+        force_square (False): whether to minimally manipulate the patch
+            bounding boxes into squares prior to extraction. Only applicable
+            when a ``model`` and ``roi_field`` are specified
+        alpha (None): an optional expansion/contraction to apply to the patches
+            before extracting them, in ``[-1, inf)``. If provided, the length
+            and width of the box are expanded (or contracted, when
+            ``alpha < 0``) by ``(100 * alpha)%``. For example, set
+            ``alpha = 0.1`` to expand the boxes by 10%, and set
+            ``alpha = -0.1`` to contract the boxes by 10%. Only applicable when
+            a ``model`` and ``roi_field`` are specified
         batch_size (None): a batch size to use when computing embeddings. Only
             applicable when a ``model`` is provided
         num_workers (None): the number of workers to use when loading images.
@@ -790,10 +926,13 @@ def compute_leaky_splits(
         samples,
         splits,
         threshold=threshold,
+        roi_field=roi_field,
         embeddings=embeddings,
         similarity_index=similarity_index,
         model=model,
         model_kwargs=model_kwargs,
+        force_square=force_square,
+        alpha=alpha,
         batch_size=batch_size,
         num_workers=num_workers,
         skip_failures=skip_failures,

diff --git a/fiftyone/brain/internal/core/duplicates.py b/fiftyone/brain/internal/core/duplicates.py
@@ -10,13 +10,87 @@
 import logging
 import multiprocessing
 
+import eta.core.utils as etau
+
 import fiftyone.core.media as fom
 import fiftyone.core.utils as fou
 import fiftyone.core.validation as fov
 
+import fiftyone.brain as fb
+import fiftyone.brain.similarity as fbs
+import fiftyone.brain.internal.core.utils as fbu
+
 
 logger = logging.getLogger(__name__)
 
+_DEFAULT_MODEL = "resnet18-imagenet-torch"
+
+
+def compute_near_duplicates(
+    samples,
+    threshold=None,
+    roi_field=None,
+    embeddings=None,
+    similarity_index=None,
+    model=None,
+    model_kwargs=None,
+    force_square=False,
+    alpha=None,
+    batch_size=None,
+    num_workers=None,
+    skip_failures=True,
+    progress=None,
+):
+    """See ``fiftyone/brain/__init__.py``."""
+
+    fov.validate_collection(samples)
+    # A string ``embeddings`` names a dataset field, not in-memory vectors
+    if etau.is_str(embeddings):
+        embeddings_field, embeddings_exist = fbu.parse_embeddings_field(
+            samples,
+            embeddings,
+        )
+        embeddings = None
+    else:
+        embeddings_field = None
+        embeddings_exist = None
+    # A string ``similarity_index`` is a brain key; load the stored results
+    if etau.is_str(similarity_index):
+        similarity_index = samples.load_brain_results(similarity_index)
+    # Use the default model only when no other embeddings source was given
+    if (
+        model is None
+        and embeddings is None
+        and similarity_index is None
+        and not embeddings_exist
+    ):
+        model = _DEFAULT_MODEL
+    # No index supplied: build one, passing along the embedding options
+    if similarity_index is None:
+        similarity_index = fb.compute_similarity(
+            samples,
+            backend="sklearn",
+            roi_field=roi_field,
+            embeddings=embeddings_field or embeddings,
+            model=model,
+            model_kwargs=model_kwargs,
+            force_square=force_square,
+            alpha=alpha,
+            batch_size=batch_size,
+            num_workers=num_workers,
+            skip_failures=skip_failures,
+            progress=progress,
+        )
+    elif not isinstance(similarity_index, fbs.DuplicatesMixin):
+        raise ValueError(
+            "This method only supports similarity indexes that implement the "
+            "%s mixin" % fbs.DuplicatesMixin
+        )
+    # Run duplicate detection on the index at the requested threshold
+    similarity_index.find_duplicates(thresh=threshold)
+
+    return similarity_index
+
 
 def compute_exact_duplicates(samples, num_workers, skip_failures, progress):
     """See ``fiftyone/brain/__init__.py``."""

diff --git a/fiftyone/brain/internal/core/elasticsearch.py b/fiftyone/brain/internal/core/elasticsearch.py
@@ -37,12 +37,6 @@ class ElasticsearchSimilarityConfig(SimilarityConfig):
     """Configuration for a Elasticsearch similarity instance.
 
     Args:
-        embeddings_field (None): the sample field containing the embeddings
-        model (None): the :class:`fiftyone.core.models.Model` or name of the
-            zoo model that was used to compute embeddings, if known
-        patches_field (None): the sample field defining the patches being
-            analyzed, if any
-        supports_prompts (None): whether this run supports prompt queries
         index_name (None): the name of the Elasticsearch index to use or
             create. If none is provided, a new index will be created
         metric ("cosine"): the embedding distance metric to use when creating a
@@ -63,10 +57,6 @@ class ElasticsearchSimilarityConfig(SimilarityConfig):
 
     def __init__(
         self,
-        embeddings_field=None,
-        model=None,
-        patches_field=None,
-        supports_prompts=None,
         index_name=None,
         metric="cosine",
         hosts=None,
@@ -86,13 +76,7 @@ def __init__(
                 % (metric, tuple(_SUPPORTED_METRICS.keys()))
             )
 
-        super().__init__(
-            embeddings_field=embeddings_field,
-            model=model,
-            patches_field=patches_field,
-            supports_prompts=supports_prompts,
-            **kwargs,
-        )
+        super().__init__(**kwargs)
 
         self.index_name = index_name
         self.metric = metric