Skip to content

Commit

Permalink
add compute_near_duplicates() method
Browse files Browse the repository at this point in the history
  • Loading branch information
brimoor committed Dec 2, 2024
1 parent e0678a9 commit d4f80bc
Show file tree
Hide file tree
Showing 14 changed files with 288 additions and 169 deletions.
153 changes: 146 additions & 7 deletions fiftyone/brain/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,7 @@ def compute_visualization(
def compute_similarity(
samples,
patches_field=None,
roi_field=None,
embeddings=None,
brain_key=None,
model=None,
Expand Down Expand Up @@ -592,6 +593,11 @@ def compute_similarity(
:class:`fiftyone.core.labels.Detections`,
:class:`fiftyone.core.labels.Polyline`, or
:class:`fiftyone.core.labels.Polylines`
roi_field (None): an optional :class:`fiftyone.core.labels.Detection`,
:class:`fiftyone.core.labels.Detections`,
:class:`fiftyone.core.labels.Polyline`, or
:class:`fiftyone.core.labels.Polylines` field defining a region of
interest within each image to use to compute embeddings
embeddings (None): embeddings to feed the index. This argument's
behavior depends on whether a ``model`` is provided, as described
below.
Expand All @@ -600,8 +606,9 @@ def compute_similarity(
embeddings to use:
- a ``num_samples x num_dims`` array of embeddings
- if ``patches_field`` is specified, a dict mapping sample IDs
to ``num_patches x num_dims`` arrays of patch embeddings
- if ``patches_field``/``roi_field`` is specified, a dict
mapping sample IDs to ``num_patches x num_dims`` arrays of
patch embeddings
- the name of a dataset field from which to load embeddings
- ``None``: use the default model to compute embeddings
- ``False``: **do not** compute embeddings right now
Expand All @@ -614,7 +621,7 @@ def compute_similarity(
In either case, when working with patch embeddings, you can provide
either the fully-qualified path to the patch embeddings or just the
name of the label attribute in ``patches_field``
name of the label attribute in ``patches_field``/``roi_field``
brain_key (None): a brain key under which to store the results of this
method
model (None): a :class:`fiftyone.core.models.Model` or the name of a
Expand All @@ -626,14 +633,14 @@ def compute_similarity(
to the model's ``Config`` when a model name is provided
force_square (False): whether to minimally manipulate the patch
bounding boxes into squares prior to extraction. Only applicable
when a ``model`` and ``patches_field`` are specified
when a ``model`` and ``patches_field``/``roi_field`` are specified
alpha (None): an optional expansion/contraction to apply to the patches
before extracting them, in ``[-1, inf)``. If provided, the length
and width of the box are expanded (or contracted, when
``alpha < 0``) by ``(100 * alpha)%``. For example, set
``alpha = 0.1`` to expand the boxes by 10%, and set
``alpha = -0.1`` to contract the boxes by 10%. Only applicable when
a ``model`` and ``patches_field`` are specified
a ``model`` and ``patches_field``/``roi_field`` are specified
batch_size (None): an optional batch size to use when computing
embeddings. Only applicable when a ``model`` is provided
num_workers (None): the number of workers to use when loading images.
Expand All @@ -660,6 +667,7 @@ def compute_similarity(
return fbs.compute_similarity(
samples,
patches_field,
roi_field,
embeddings,
brain_key,
model,
Expand All @@ -675,6 +683,111 @@ def compute_similarity(
)


def compute_near_duplicates(
    samples,
    threshold=0.2,
    roi_field=None,
    embeddings=None,
    similarity_index=None,
    model=None,
    model_kwargs=None,
    force_square=False,
    alpha=None,
    batch_size=None,
    num_workers=None,
    skip_failures=True,
    progress=None,
):
    """Detects potential duplicates in the given sample collection.

    This method builds a similarity index for the collection (or uses the
    ``similarity_index`` that you provide) and then queries it for
    near-duplicates at the given ``threshold``. You can then use the following
    members of the returned object to inspect the results:

    -   :meth:`duplicate_ids <fiftyone.brain.similarity.DuplicatesMixin.duplicate_ids>`:
        A list of duplicate IDs

    -   :meth:`neighbors_map <fiftyone.brain.similarity.DuplicatesMixin.neighbors_map>`:
        A dictionary mapping IDs to lists of ``(dup_id, dist)`` tuples

    -   :meth:`duplicates_view() <fiftyone.brain.similarity.DuplicatesMixin.duplicates_view>`:
        Returns a view of all duplicates in the input collection

    Args:
        samples: a :class:`fiftyone.core.collections.SampleCollection`
        threshold (0.2): the similarity distance threshold to use when
            detecting duplicates. Values in ``[0.1, 0.25]`` work well for the
            default setup
        roi_field (None): an optional :class:`fiftyone.core.labels.Detection`,
            :class:`fiftyone.core.labels.Detections`,
            :class:`fiftyone.core.labels.Polyline`, or
            :class:`fiftyone.core.labels.Polylines` field defining a region of
            interest within each image to use to compute embeddings
        embeddings (None): if no ``model`` is provided, this argument specifies
            pre-computed embeddings to use, which can be any of the following:

            -   a ``num_samples x num_dims`` array of embeddings
            -   if ``roi_field`` is specified, a dict mapping sample IDs to
                ``num_patches x num_dims`` arrays of patch embeddings
            -   the name of a dataset field containing the embeddings to use

            If a ``model`` is provided, this argument specifies the name of a
            field in which to store the computed embeddings. In either case,
            when working with patch embeddings, you can provide either the
            fully-qualified path to the patch embeddings or just the name of
            the label attribute in ``roi_field``
        similarity_index (None): a
            :class:`fiftyone.brain.similarity.SimilarityIndex` or the brain key
            of a similarity index to use to load pre-computed embeddings
        model (None): a :class:`fiftyone.core.models.Model` or the name of a
            model from the
            `FiftyOne Model Zoo <https://docs.voxel51.com/user_guide/model_zoo/models.html>`_
            to use to generate embeddings. The model must expose embeddings
            (``model.has_embeddings = True``)
        model_kwargs (None): a dictionary of optional keyword arguments to pass
            to the model's ``Config`` when a model name is provided
        force_square (False): whether to minimally manipulate the patch
            bounding boxes into squares prior to extraction. Only applicable
            when a ``model`` and ``roi_field`` are specified
        alpha (None): an optional expansion/contraction to apply to the patches
            before extracting them, in ``[-1, inf)``. If provided, the length
            and width of the box are expanded (or contracted, when
            ``alpha < 0``) by ``(100 * alpha)%``. For example, set
            ``alpha = 0.1`` to expand the boxes by 10%, and set
            ``alpha = -0.1`` to contract the boxes by 10%. Only applicable when
            a ``model`` and ``roi_field`` are specified
        batch_size (None): a batch size to use when computing embeddings. Only
            applicable when a ``model`` is provided
        num_workers (None): the number of workers to use when loading images.
            Only applicable when a Torch-based model is being used to compute
            embeddings
        skip_failures (True): whether to gracefully continue without raising an
            error if embeddings cannot be generated for a sample
        progress (None): whether to render a progress bar (True/False), use the
            default value ``fiftyone.config.show_progress_bars`` (None), or a
            progress callback function to invoke instead

    Returns:
        a :class:`fiftyone.brain.similarity.SimilarityIndex`
    """
    # Imported lazily inside the function rather than at module import time
    import fiftyone.brain.internal.core.duplicates as fbd

    return fbd.compute_near_duplicates(
        samples,
        threshold=threshold,
        roi_field=roi_field,
        embeddings=embeddings,
        similarity_index=similarity_index,
        model=model,
        model_kwargs=model_kwargs,
        force_square=force_square,
        alpha=alpha,
        batch_size=batch_size,
        num_workers=num_workers,
        skip_failures=skip_failures,
        progress=progress,
    )


def compute_exact_duplicates(
samples,
num_workers=None,
Expand All @@ -684,7 +797,7 @@ def compute_exact_duplicates(
"""Detects duplicate media in a sample collection.
This method detects exact duplicates with the same filehash. Use
:meth:`compute_similarity` to detect near-duplicate images.
:meth:`compute_near_duplicates` to detect near-duplicates.
If duplicates are found, the first instance in ``samples`` will be the key
in the returned dictionary, while the subsequent duplicates will be the
Expand Down Expand Up @@ -714,10 +827,13 @@ def compute_leaky_splits(
samples,
splits,
threshold=0.2,
roi_field=None,
embeddings=None,
similarity_index=None,
model=None,
model_kwargs=None,
force_square=False,
alpha=None,
batch_size=None,
num_workers=None,
skip_failures=True,
Expand Down Expand Up @@ -752,14 +868,24 @@ def compute_leaky_splits(
threshold (0.2): the similarity distance threshold to use when
detecting leaks. Values in ``[0.1, 0.25]`` work well for the
default setup
roi_field (None): an optional :class:`fiftyone.core.labels.Detection`,
:class:`fiftyone.core.labels.Detections`,
:class:`fiftyone.core.labels.Polyline`, or
:class:`fiftyone.core.labels.Polylines` field defining a region of
interest within each image to use to compute leaks
embeddings (None): if no ``model`` is provided, this argument specifies
pre-computed embeddings to use, which can be any of the following:
- a ``num_samples x num_dims`` array of embeddings
- if ``roi_field`` is specified, a dict mapping sample IDs to
``num_patches x num_dims`` arrays of patch embeddings
- the name of a dataset field containing the embeddings to use
If a ``model`` is provided, this argument specifies the name of a
field in which to store the computed embeddings
field in which to store the computed embeddings. In either case,
when working with patch embeddings, you can provide either the
fully-qualified path to the patch embeddings or just the name of
the label attribute in ``roi_field``
similarity_index (None): a
:class:`fiftyone.brain.similarity.SimilarityIndex` or the brain key
of a similarity index to use to load pre-computed embeddings
Expand All @@ -770,6 +896,16 @@ def compute_leaky_splits(
(``model.has_embeddings = True``)
model_kwargs (None): a dictionary of optional keyword arguments to pass
to the model's ``Config`` when a model name is provided
force_square (False): whether to minimally manipulate the patch
bounding boxes into squares prior to extraction. Only applicable
when a ``model`` and ``roi_field`` are specified
alpha (None): an optional expansion/contraction to apply to the patches
before extracting them, in ``[-1, inf)``. If provided, the length
and width of the box are expanded (or contracted, when
``alpha < 0``) by ``(100 * alpha)%``. For example, set
``alpha = 0.1`` to expand the boxes by 10%, and set
``alpha = -0.1`` to contract the boxes by 10%. Only applicable when
a ``model`` and ``roi_field`` are specified
batch_size (None): a batch size to use when computing embeddings. Only
applicable when a ``model`` is provided
num_workers (None): the number of workers to use when loading images.
Expand All @@ -790,10 +926,13 @@ def compute_leaky_splits(
samples,
splits,
threshold=threshold,
roi_field=roi_field,
embeddings=embeddings,
similarity_index=similarity_index,
model=model,
model_kwargs=model_kwargs,
force_square=force_square,
alpha=alpha,
batch_size=batch_size,
num_workers=num_workers,
skip_failures=skip_failures,
Expand Down
74 changes: 74 additions & 0 deletions fiftyone/brain/internal/core/duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,87 @@
import logging
import multiprocessing

import eta.core.utils as etau

import fiftyone.core.media as fom
import fiftyone.core.utils as fou
import fiftyone.core.validation as fov

import fiftyone.brain as fb
import fiftyone.brain.similarity as fbs
import fiftyone.brain.internal.core.utils as fbu


logger = logging.getLogger(__name__)

_DEFAULT_MODEL = "resnet18-imagenet-torch"


def compute_near_duplicates(
    samples,
    threshold=None,
    roi_field=None,
    embeddings=None,
    similarity_index=None,
    model=None,
    model_kwargs=None,
    force_square=False,
    alpha=None,
    batch_size=None,
    num_workers=None,
    skip_failures=True,
    progress=None,
):
    """See ``fiftyone/brain/__init__.py``."""
    fov.validate_collection(samples)

    # A string ``embeddings`` value names a dataset field rather than holding
    # in-memory vectors, so resolve it and clear the in-memory slot
    embeddings_field = None
    embeddings_exist = None
    if etau.is_str(embeddings):
        embeddings_field, embeddings_exist = fbu.parse_embeddings_field(
            samples, embeddings
        )
        embeddings = None

    # A string ``similarity_index`` value is a brain key to load
    if etau.is_str(similarity_index):
        similarity_index = samples.load_brain_results(similarity_index)

    # Fall back to the default model only when the caller supplied no model,
    # no embeddings, no index, and no already-populated embeddings field
    nothing_provided = (
        model is None and embeddings is None and similarity_index is None
    )
    if nothing_provided and not embeddings_exist:
        model = _DEFAULT_MODEL

    if similarity_index is not None:
        # A provided index must be able to answer duplicate queries
        if not isinstance(similarity_index, fbs.DuplicatesMixin):
            raise ValueError(
                "This method only supports similarity indexes that implement the "
                "%s mixin" % fbs.DuplicatesMixin
            )
    else:
        # No index was provided, so build one with the sklearn backend
        similarity_index = fb.compute_similarity(
            samples,
            backend="sklearn",
            roi_field=roi_field,
            embeddings=embeddings_field or embeddings,
            model=model,
            model_kwargs=model_kwargs,
            force_square=force_square,
            alpha=alpha,
            batch_size=batch_size,
            num_workers=num_workers,
            skip_failures=skip_failures,
            progress=progress,
        )

    similarity_index.find_duplicates(thresh=threshold)

    return similarity_index


def compute_exact_duplicates(samples, num_workers, skip_failures, progress):
"""See ``fiftyone/brain/__init__.py``."""
Expand Down
18 changes: 1 addition & 17 deletions fiftyone/brain/internal/core/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,6 @@ class ElasticsearchSimilarityConfig(SimilarityConfig):
"""Configuration for an Elasticsearch similarity instance.
Args:
embeddings_field (None): the sample field containing the embeddings
model (None): the :class:`fiftyone.core.models.Model` or name of the
zoo model that was used to compute embeddings, if known
patches_field (None): the sample field defining the patches being
analyzed, if any
supports_prompts (None): whether this run supports prompt queries
index_name (None): the name of the Elasticsearch index to use or
create. If none is provided, a new index will be created
metric ("cosine"): the embedding distance metric to use when creating a
Expand All @@ -63,10 +57,6 @@ class ElasticsearchSimilarityConfig(SimilarityConfig):

def __init__(
self,
embeddings_field=None,
model=None,
patches_field=None,
supports_prompts=None,
index_name=None,
metric="cosine",
hosts=None,
Expand All @@ -86,13 +76,7 @@ def __init__(
% (metric, tuple(_SUPPORTED_METRICS.keys()))
)

super().__init__(
embeddings_field=embeddings_field,
model=model,
patches_field=patches_field,
supports_prompts=supports_prompts,
**kwargs,
)
super().__init__(**kwargs)

self.index_name = index_name
self.metric = metric
Expand Down
Loading

0 comments on commit d4f80bc

Please sign in to comment.