Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/leaky splits #203

Merged
merged 60 commits into from
Nov 25, 2024
Merged
Changes from 8 commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
c8207ce
initial commit
Oct 15, 2024
d8c85e2
after much deliberation, quick implementation, and shell of lengthier…
Oct 17, 2024
0a5412b
small fixes
Oct 21, 2024
d4d2b99
added basic filepath hash functionality
Oct 21, 2024
07d06a7
refactor - very wip
Oct 21, 2024
a2b9594
some fixes
Oct 22, 2024
d13a494
to_views implemented
Oct 22, 2024
d87d769
implemented leaks for hash
Oct 22, 2024
e42aaf5
sklearn backend basic functionality implemented and integrated
Oct 22, 2024
110a1c0
made the hash backend give out an ordered view
Oct 22, 2024
8f630bb
cache leak view after first time it's computed
Oct 23, 2024
6b4eaec
cache leak view after first time it's computed
Oct 23, 2024
4733226
some documentation and cleanup
Oct 24, 2024
276489b
far better caching mechanism
Oct 24, 2024
a923c6a
filter res so it's actually leaks and not just sim
Oct 25, 2024
80fe828
bugfix
Oct 29, 2024
855c9ac
more bugfixes
Oct 29, 2024
f6a6652
added model kwargs to leaky splits sklearn backend
Nov 1, 2024
fe28ce7
wrote main function
Nov 5, 2024
7d4a552
removed remove_leaks, replaced it with view_without_leaks
Nov 5, 2024
d55c9c7
cleanup and documentation
Nov 13, 2024
09b5e51
cleanup and documentation
Nov 13, 2024
7449545
removed patches
Nov 19, 2024
2d86e79
added checks for non empty support and no overlap when providing spli…
Nov 19, 2024
5b56182
refactor + bugfix sometimes a sample would be kept even when it had n…
Nov 20, 2024
a607ff5
fixed accessing previous brain runs
Nov 20, 2024
7c741ee
updated main function and fixed serialization bug
Nov 20, 2024
12b0975
typo
Nov 20, 2024
a390118
added cleanup
Nov 20, 2024
41190c4
another probably redundant optimization check
Nov 20, 2024
fe89ea7
a lot of thinking and not a lot of writing code
Nov 20, 2024
ee20404
optimized leak finding
Nov 20, 2024
07d5bd3
removed old code
Nov 21, 2024
2a63a37
updated docs
Nov 21, 2024
a5ad99c
moved compute function to __init__
Nov 21, 2024
24b3a29
updated docs
Nov 21, 2024
f53022c
removed more old code
Nov 21, 2024
4047459
moved similarity registration out of class, doesn't make sense for it…
Nov 21, 2024
3b54720
documentation fixes
Nov 21, 2024
644b8d5
cleaned up imports
Nov 21, 2024
eb34ca0
dealt with leaks by sample edge case
Nov 21, 2024
3e3ddb8
assume loading of brain run happens correctly
Nov 21, 2024
d2bdbd6
changed variable name
Nov 21, 2024
13364f7
made the method name lowercase
Nov 21, 2024
080e8d7
renamed leaks_by_sample to leaks_for_sample
Nov 21, 2024
54ecb5a
renamed view_without_leaks to no_leak_view
Nov 21, 2024
ff08fd3
updated docs
Nov 21, 2024
a666b58
compute embeddings on the fly
Nov 21, 2024
e1a7b4f
changed method type property
Nov 21, 2024
a352898
fixed passing tags
Nov 21, 2024
1cf9b5f
fixed order of precedence for defaults, similarity conf dict, and arg…
Nov 21, 2024
c445865
made id2split internal
Nov 22, 2024
9a189e0
throw warning when a considered sample is not in any of the splits
Nov 22, 2024
49059c9
added warnings for view matching heuristics
Nov 22, 2024
992767e
updated docs to reflect importance of arguments
Nov 22, 2024
27be7fa
changed variable for clarity
Nov 22, 2024
4f6f4a0
removed unused variable
Nov 22, 2024
7d9e418
changed leaks to leaks_view
Nov 25, 2024
a12c349
made tag leaks use tag_samples
Nov 25, 2024
9c51d46
changed _to_views docs
Nov 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
373 changes: 373 additions & 0 deletions fiftyone/brain/internal/core/leaky_splits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,373 @@
"""
Finds leaks between splits.
"""

from collections import defaultdict
from copy import copy

import fiftyone as fo
from fiftyone import ViewField as F

# pylint: disable=no-member
import cv2

import fiftyone.core.brain as fob
import fiftyone.brain.similarity as sim
import fiftyone.brain.internal.core.sklearn as skl_sim
import fiftyone.brain.internal.core.duplicates as dups
import fiftyone.core.utils as fou


def compute_leaky_splits(
    samples,
    split_tags,
    method="similarity",
    similarity_backend=None,
    similarity_backend_kwargs=None,
    **kwargs,
):
    """Entry point for finding leaks between splits.

    NOTE(review): this is still a placeholder stub -- it only emits a debug
    print and computes nothing. Presumably it will dispatch to one of the
    backends defined below; confirm before merging.

    Args:
        samples: a sample collection to scan for leaks
        split_tags: list of tags identifying the splits to compare
        method ("similarity"): the leak-detection method to use
        similarity_backend (None): name of the similarity backend
        similarity_backend_kwargs (None): kwargs for the similarity backend
        **kwargs: additional backend-specific arguments
    """
    print("bar")


# Names of the supported basic leak-detection methods
_BASIC_METHODS = ["filepath", "image_hash", "neural"]

### GENERAL


class LeakySplitsConfigInterface(object):
    """Configuration for Leaky Splits

    Exactly one of the three split specifications is expected by the
    helpers that consume this config (see ``_to_views``).

    Args:
        split_views (None): list of views corresponding to different splits
        split_field (None): field name that contains the split that the sample belongs to
        split_tags (None): list of tags that correspond to different splits
    """

    def __init__(
        self, split_views=None, split_field=None, split_tags=None, **kwargs
    ):
        (self.split_views, self.split_field, self.split_tags) = (
            split_views,
            split_field,
            split_tags,
        )
        # forward any remaining kwargs along the MRO (supports mixin use)
        super().__init__(**kwargs)


class LeakySplitIndexInterface(object):
    """Interface for the results of a leaky-splits analysis.

    Subclasses must implement the ``leaks`` property and
    ``leaks_by_sample``; ``num_leaks`` and ``tag_leaks`` are derived
    from ``leaks``.
    """

    def __init__(self) -> None:
        pass

    @property
    def num_leaks(self):
        """The number of potential leaks found (an int)."""
        # ``count`` is a method on FiftyOne collections; the previous code
        # returned the bound method object itself instead of invoking it
        return self.leaks.count()

    @property
    def leaks(self):
        """
        Returns view with all potential leaks.
        """
        pass

    def leaks_by_sample(self, sample):
        """
        Return view with all leaks related to a certain sample.
        """
        pass

    def remove_leaks(self, remove_from):
        """Remove leaks from dataset

        Args:
            remove_from: tag/field value/view to remove from (e.g. remove the leak from 'test')
        """
        pass

    def tag_leaks(self, tag="leak"):
        """Tags all leak samples with ``tag`` and saves them."""
        for s in self.leaks.iter_samples():
            s.tags.append(tag)
            s.save()


def _to_views(samples, split_views=None, split_field=None, split_tags=None):
"""Helper function so that we can always work with views"""

arithmetic_true = lambda x: int(x is not None)
jacobsela marked this conversation as resolved.
Show resolved Hide resolved
num_given = (
arithmetic_true(split_views)
+ arithmetic_true(split_field)
+ arithmetic_true(split_tags)
)

if num_given == 0:
raise ValueError(f"One of the split arguments must be given.")
if num_given > 1:
raise ValueError(f"Only one of the split arguments must be given.")

if split_views:
return split_views

if split_field:
return _field_to_views(samples, split_field)

if split_tags:
return _tags_to_views(samples, split_tags)
jacobsela marked this conversation as resolved.
Show resolved Hide resolved


def _field_to_views(samples, field):
    """Builds one view per distinct value of ``field``."""
    distinct_values = samples.distinct(field)

    # A single value (or none) cannot define a train/test-style split
    if len(distinct_values) < 2:
        raise ValueError(
            f"Field {field} has less than 2 distinct values,"
            f"can't be used to create splits"
        )

    return [samples.match(F(field) == value) for value in distinct_values]


def _tags_to_views(samples, tags):
jacobsela marked this conversation as resolved.
Show resolved Hide resolved
if len(tags) < 2:
raise ValueError("Must provide at least two tags.")

views = []
for tag in tags:
view = samples.match_tags([tag])
views.append(view)
return views


###

### SKL BACKEND
class LeakySplitsSKLConfig(skl_sim.SklearnSimilarityConfig):
    """Configuration for Leaky Splits with the SKLearn backend

    Args:
        split_views (None): list of views corresponding to different splits
        split_field (None): field name that contains the split that the sample belongs to
        split_tags (None): list of tags that correspond to different splits
        method ('filepath'): method to determine leaks
    """

    def __init__(
        self,
        split_views=None,
        split_field=None,
        split_tags=None,
        method="filepath",
        **kwargs,
    ):
        # Previously only ``split_tags`` was stored; ``split_views`` and
        # ``split_field`` were accepted and documented but silently dropped
        self.split_views = split_views
        self.split_field = split_field
        self.split_tags = split_tags
        self._method = method
        super().__init__(**kwargs)

    @property
    def method(self):
        # Name of the leak-detection method for this config
        return self._method


class LeakySplitsSKL(skl_sim.SklearnSimilarity):
    """SKLearn-backed leaky-splits brain method."""

    def initialize(self, samples, brain_key):
        """Creates the index object for this run."""
        index = LeakySplitsSKLIndex(
            samples, self.config, brain_key, backend=self
        )
        return index


class LeakySplitsSKLIndex(skl_sim.SklearnSimilarityIndex):
    """Similarity index specialized for finding leaks between splits."""

    def __init__(self, samples, config, brain_key, **kwargs):
        super().__init__(samples, config, brain_key, **kwargs)

        # hash index is only needed for the "filepath" method
        self._hash_index = None
        if self.config.method == "filepath":
            self._initialize_hash_index(samples)

    def _initialize_hash_index(self, samples):
        # Maps file hash -> list of sample ids sharing that hash
        neighbors_map = dups.compute_exact_duplicates(
            samples, None, False, True
        )
        self._hash_index = neighbors_map

    def _sort_by_hash_leak(self, sample):
        """Returns samples from *other* splits that share ``sample``'s hash."""
        conflicting_ids = []
        # the hash keys themselves are not needed, only the id groups
        for ids in self._hash_index.values():
            if sample["id"] in ids:
                conflicting_ids = copy(ids)
                break

        sample_split = self._sample_split(sample)
        tags_to_search = self._tags_to_search(sample_split)

        conflicting_samples = self._dataset.select(conflicting_ids)

        return conflicting_samples.match_tags(tags_to_search)

    def sort_by_leak_potential(self, sample, k=None, dist_field=None):
        """Ranks samples from other splits by leak potential w.r.t. ``sample``."""
        if self.config.method == "filepath":
            return self._sort_by_hash_leak(sample)

        # using neural method

        # isolate view to search through: only the other splits
        sample_split = self._sample_split(sample)
        tags_to_search = self._tags_to_search(sample_split)
        self.use_view(self._dataset.match_tags(tags_to_search))

        # run similarity
        return self.sort_by_similarity(sample["id"], k, dist_field=dist_field)

    def _sample_split(self, sample):
        """Returns the single split tag that ``sample`` belongs to."""
        sample_split = set(self.config.split_tags) & set(sample.tags)
        if len(sample_split) > 1:
            raise ValueError("sample belongs to multiple splits.")
        if len(sample_split) == 0:
            raise ValueError("sample is not part of any split!")
        return sample_split.pop()

    def _tags_to_search(self, sample_split):
        """Returns all split tags except ``sample_split``."""
        tags_to_search = copy(self.config.split_tags)
        tags_to_search.remove(sample_split)
        return tags_to_search


###

### HASH BACKEND

# Hash methods supported by the hash backend
_HASH_METHODS = ["filepath", "image"]


class LeakySplitsHashConfig(fob.BrainMethodConfig, LeakySplitsConfigInterface):
    """Configuration for Leaky Splits with the hash backend.

    Args:
        split_views (None): list of views corresponding to different splits
        split_field (None): field name that contains the split that the sample belongs to
        split_tags (None): list of tags that correspond to different splits
        method ("filepath"): the hash method to use; see ``_HASH_METHODS``
        hash_field (None): string, field to write hashes into
    """

    def __init__(
        self,
        split_views=None,
        split_field=None,
        split_tags=None,
        method="filepath",
        hash_field=None,
        **kwargs,
    ):
        self._method = method
        self.hash_field = hash_field
        # Both bases are initialized explicitly: the interface stores the
        # split specification, BrainMethodConfig consumes remaining kwargs
        LeakySplitsConfigInterface.__init__(
            self,
            split_views=split_views,
            split_field=split_field,
            split_tags=split_tags,
        )
        fob.BrainMethodConfig.__init__(self, **kwargs)

    @property
    def method(self):
        # Name of the hash method ("filepath" or "image")
        return self._method


class LeakySplitsHash(fob.BrainMethod):
    """Hash-based leaky-splits brain method."""

    def initialize(self, samples, brain_key):
        """Creates the index object for this run."""
        index = LeakySplitsHashIndex(
            samples, self.config, brain_key, backend=self
        )
        return index


class LeakySplitsHashIndex(fob.BrainResults, LeakySplitIndexInterface):
    """Index of leaks computed by hashing sample media or filepaths."""

    def __init__(self, samples, config, brain_key, backend):
        fob.BrainResults.__init__(
            self, samples, config, brain_key, backend=backend
        )
        LeakySplitIndexInterface.__init__(self)
        # Maps hash value -> list of sample ids sharing that hash
        self._hash2ids = defaultdict(list)
        self.split_views = _to_views(
            samples,
            self.config.split_views,
            self.config.split_field,
            self.config.split_tags,
        )
        self._dataset = samples._dataset
        self._compute_hashes(samples)

    @property
    def _hash_function(self):
        """Returns the hashing callable for the configured method."""
        if self.config.method == "filepath":
            return fou.compute_filehash

        if self.config.method == "image":
            return LeakySplitsHashIndex._image_hash

        # previously an unknown method silently returned None, which only
        # failed later with an opaque TypeError during hashing
        raise ValueError(
            f"Unknown hash method '{self.config.method}'; supported "
            f"methods are {_HASH_METHODS}"
        )

    def _compute_hashes(self, samples):
        """Hashes every sample and populates ``self._hash2ids``."""
        for s in samples.iter_samples():
            # local name avoids shadowing the ``hash`` builtin
            hash_value = str(self._hash_function(s["filepath"]))
            self._hash2ids[hash_value].append(s["id"])
            if self.config.hash_field:
                s[self.config.hash_field] = hash_value
                s.save()

    @staticmethod
    def _image_hash(image, hash_size=24):
        """
        Compute the dHash for the input image.

        :param image: image filepath
        :param hash_size: size of the hash grid (default 24x24)
        :return: the dHash value of the image as a hexadecimal string
        """
        # cv2.imread silently returns None on failure (missing file,
        # unreadable format); fail loudly instead. The previous text-mode
        # ``open()`` existence check was unnecessary.
        data = cv2.imread(image)
        if data is None:
            raise ValueError(f"Could not read image '{image}'")

        # Convert the image to grayscale
        gray = cv2.cvtColor(data, cv2.COLOR_BGR2GRAY)

        # Resize to (hash_size + 1, hash_size) so that adjacent-pixel
        # differences yield a hash_size x hash_size boolean grid
        resized = cv2.resize(gray, (hash_size + 1, hash_size))

        # Compute the differences between adjacent pixels
        diff = resized[:, 1:] > resized[:, :-1]

        # Pack the boolean grid into a hexadecimal string
        binary_string = "".join("1" if v else "0" for v in diff.flatten())
        hex_hash = f"{int(binary_string, 2):0{hash_size * hash_size // 4}x}"

        return hex_hash

    @property
    def leaks(self):
        """
        Returns view with all potential leaks.
        """
        leak_ids = []
        for id_list in self._hash2ids.values():
            if len(id_list) > 1:
                # extend() instead of repeated list concatenation (O(n))
                leak_ids.extend(id_list)

        return self._dataset.select(leak_ids)

    def leaks_by_sample(self, sample):
        """
        Return view with all leaks related to a certain sample.

        ``sample`` may be a sample object or a sample id string.
        """
        sample_id = sample if isinstance(sample, str) else sample["id"]
        for id_list in self._hash2ids.values():
            if sample_id in id_list:
                return self._dataset.select(id_list)

        # sample was never hashed; return an empty view rather than the
        # implicit None the old code produced
        return self._dataset.select([])


###
Loading