[WIP] Synthetic examples #82

Draft · wants to merge 24 commits into base: main
59 changes: 59 additions & 0 deletions DEPLOY.md
@@ -0,0 +1,59 @@
## How to deploy voxelgpt to FiftyOne Teams

> **You must complete the [contributor steps](README.md#contributing) before running the commands below**

**Bump the Version**

Bump the version of the plugin by running:

```
# bumps the patch version (for bug fixes only)
yarn bump

# sets the version
yarn bump -- 1.2.3
```

**Commit All Files**

Only files committed locally will be included in the plugin archive.

This is also a good time to tag the new version.

```
VERSION=1.2.3  # use the version printed by the bump command above
git checkout -b release/$VERSION
git add . # files you want included
git commit -m "release version $VERSION"
git tag $VERSION
git push origin --follow-tags # push the commit and tags
```

**Create the Plugin Archive**

```
yarn archive
```

**Upload Archive to Teams**

Go to [https://MY_FIFTYONE_TEAMS/settings/plugins](https://MY_FIFTYONE_TEAMS/settings/plugins).

To install a new plugin, click "Install plugin". To upgrade an existing plugin, find it in the list, click the three-dot menu, and choose "Upgrade plugin".

Upload the newly created archive.

**Set Your Permissions**

Find the plugin in the list and click "X operators" (where X is the number of operators the plugin provides). Select the appropriate permissions for your plugin.

**That's it!**

At this point you should have a newly installed/upgraded plugin. Users will see this change immediately.

**Troubleshooting Tips**

If a plugin does not appear to update:

- check the logs for any additional information
- restart the appropriate pods: if you have `teams-plugins` pods, restart only those; otherwise restart the `fiftyone-app` pods (see the example below)
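
If your Teams deployment runs on Kubernetes, the restart might look like the following sketch (the deployment names and namespace here are assumptions; adjust them to match your install):

```
# restart only the plugins pods, if your deployment has them
kubectl rollout restart deployment/teams-plugins -n fiftyone

# otherwise, restart the app pods
kubectl rollout restart deployment/fiftyone-app -n fiftyone
```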
25 changes: 25 additions & 0 deletions README.md
@@ -358,6 +358,27 @@ You can manually lint a file if necessary like so:
pre-commit run --files <file>
```

**Developing and Building the Plugin JS Bundle**

To build the FiftyOne plugin you must have:

- `fiftyone` installed from source, including the App dependencies ([see here](https://github.com/voxel51/fiftyone/blob/develop/CONTRIBUTING.md) for details)
- the environment variable `FIFTYONE_DIR` set to the source directory of `fiftyone`
- `[email protected]` installed
- the `voxelgpt` dependencies installed by running `yarn install` in the `voxelgpt` directory

To create a build, run:

```sh
# for a production build of the plugin js bundle
yarn build

# for rebuilding the bundle automatically during development
yarn dev
```

> NOTE: When developing locally, you must set `FIFTYONE_PLUGINS_DIR` to a directory containing the `voxelgpt` directory.
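
For example, a minimal local setup might look like this (the paths are placeholders):

```sh
export FIFTYONE_DIR=/path/to/fiftyone        # your fiftyone source checkout
export FIFTYONE_PLUGINS_DIR=/path/to/plugins # a directory that contains voxelgpt/
yarn dev                                     # rebuild the bundle as you edit
```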

## How does it work?

VoxelGPT uses:
@@ -394,6 +415,10 @@ The current implementation supports most FiftyOne
certain stages like `concat()`, `mongo()`, and `geo_within()` are not yet
supported. We're working on it!

### Deploying on FiftyOne Teams

Instructions for deploying the plugin to FiftyOne Teams are [here](DEPLOY.md).

## About FiftyOne

If you've made it this far, we'd greatly appreciate if you'd take a moment to
Binary file modified examples/viewstage_embeddings.pkl
Binary file not shown.
24 changes: 18 additions & 6 deletions examples/viewstage_examples.csv
@@ -29,7 +29,7 @@ first 100 samples,[limit(100)],Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
Limit the view to 35,[limit(35)],Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
Only include samples that contain predictions with > 99% confidence,"[match_labels(filter=F('confidence') > 0.99, fields='predictions')]",Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
Only include samples that contain labels with ids hofwihuf or abxjhbvcie,"[match_labels(ids=[hofwihuf, abxjhbvcie])]",Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
get samples with labels with the 'test tag,"[match_labels(tags='test')]
",Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
Only include samples that have the 'mistake' tag,[match_tags('mistake')],Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
Only include samples that do not have the 'validation' tag,"[match_tags('validation', bool=False)]",Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
@@ -107,7 +107,7 @@ display the whole dataset,[],Jacob,all,FALSE,FALSE,FALSE,FALSE,FALSE,all
Exclude frame with id 'kbdskajdvfef',[exclude_frames(['kbdskajdvfef'])],Jacob,video,FALSE,FALSE,FALSE,FALSE,FALSE,all
clip view with one clip per meeting,"[filter_labels('events', F('label') == 'meeting'), to_clips('events')]",Jacob,video,FALSE,FALSE,FALSE,FALSE,FALSE,all
Create a clips view that contains one clip for each contiguous segment that contains at least one road sign in every frame,"[filter_labels('frames.detections', F('label') == 'road sign'), to_clips('frames.detections')]",Jacob,video,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Create a trajectories view for the vehicles in the dataset,"[filter_labels('frames.detections', F('label') == 'vehicle'),
to_trajectories('frames.detections')]",Jacob,video,FALSE,FALSE,FALSE,FALSE,FALSE,detection
show 'vehicle' detections in the 'detections' field,"[filter_labels('frames.detections', F('label') == 'vehicle')]",Jacob,video,FALSE,FALSE,FALSE,FALSE,FALSE,detection
"Create a frames view that only contains frames with at least 10 objects, sampled at a maximum frame rate of 1fps","[match_frames(F('detections.detections').length() > 10), to_frames(max_fps=1)]",Jacob,video,FALSE,FALSE,FALSE,FALSE,FALSE,detection
@@ -150,8 +150,8 @@ Discard all predictions with confidence below 0.3,"[filter_labels('predictions',
Only include classifications in the `predictions` field whose label is 'frog' or 'turtle',"[filter_labels('predictions', F('label').is_in(['frog', 'turtle']))]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,all
Only include polylines in the `faster-rcnn` field whose `label` is 'lane',"[filter_labels('faster-rcnn', F('label') == 'lane')]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,all
Only contains predictions whose bounding boxes' upper left corner is a Manhattan distance of at least 1 from the origin,"[filter_labels('predictions, F('bounding_box')[0] + F('bounding_box')[1] > 1)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
"Create a view that only contains predictions whose bounding boxes
have area < 0.2 with confidence > 0.9, and only include samples with
"Create a view that only contains predictions whose bounding boxes
have area < 0.2 with confidence > 0.9, and only include samples with
at least 15 such objects","[filter_labels('predictions', (bbox_area < 0.2) & (F('confidence') > 0.9)), match(F('predictions.detections').length() > 15)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Only include detections in the `predictions` field whose bounding box is smaller than 0.2,"[filter_labels('predictions', F('bounding_box')[2] * F('bounding_box')[3] < 0.2)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Only include polylines in the `predictions` field that are filled,"[filter_labels('predictions', F('filled') == True)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,polyline
@@ -168,7 +168,7 @@ the first 30 samples with a plant,"[match(F('ground_truth.detections.label').con
10 random images with tables,"[match(F('ground_truth.detections.label').contains('table')), take(10)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Contains a rabbit and a tortoise prediction,"[match(F('predictions.detections.label').contains(['rabbit', 'tortoise'], all=True))]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Contains a cat or mouse but not both,"[match(F('predictions.detections.label').contains(['cat', 'mouse']) & ~F('predictions.detections.label').contains(['cat', 'mouse'], all=True))]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Only contains samples whose first and last prediction have the same label,"[match(F('predictions.detections')[0].apply(F('label')) == F('predictions.detections').reverse()[0].apply(F('label')))]
",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
unique and wrong,"[match(F('predictions.label') != F('ground_truth.label')), sort_by('uniqueness', reverse=True)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,classification
fewer than 4 ground truth detections,[match(F('ground_truth.detections').length() < 10)],Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
@@ -359,4 +359,16 @@ give me all the samples without a motorcycle prediction,"[match(!F(""pred.detect
cars or pedestrians,"[filter_labels(""gt"", F(""label"").is_in([""car"", ""pedestrian""]))]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,all
show me all the samples that were misclassified,"[match(F(""EVAL_KEY"") == False)]",Jacob,image,FALSE,FALSE,FALSE,TRUE,FALSE,classification
I want any images that were correctly classified,"[match(F(""EVAL_KEY"") == True)]",Jacob,image,FALSE,FALSE,FALSE,TRUE,FALSE,classification
show me the first 10 incorrectly classified predictions,"[match(F(""EVAL_KEY"") == False), limit(10)]",Jacob,image,FALSE,FALSE,FALSE,TRUE,FALSE,classification
show me object patches for airplanes in the predictions field,"[filter_labels(""predictions"", F(""label"") == ""airplane""), to_patches(""predictions"")]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
all of the model1 detection patches in the first image,"[limit(1), to_patches(""model1"")]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
I only want to see the high confidence object detections predicted by resnet,"[limit(10), filter_labels(""resnet"", F(""confidence"")>0.9), to_patches(""resnet"")]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
display all the carrot objects in the wandb_05_09 field,"[filter_labels(""wandb_05_09"", F(""label"")==""carrot""), to_patches(""wandb_05_09"")]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
show me all the non-dog detections,"[filter_labels(""ground_truth"", F(""label"") != ""dog"")]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
only show samples that don't have a hat prediction,"[match(~F(""prediction.detections.label"").contains(""hat""))]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
animals that aren't horses,"[filter_labels(""ground_truth"", F(""label"") != ""horses""), sort_by_similarity(""animal"", brain_key = ""TEXT_SIM_KEY"", k = 25)]",Jacob,image,FALSE,TRUE,FALSE,FALSE,FALSE,all
samples classified as anything but a snail,"[match(~(F(""cls.label"") == ""snail""))]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,classification
samples with 2 or more Cats,"[match(F(""gt.detections"").filter(F(""label"") == ""Cat"").length() >= 2)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
all images with at least 6 plates,"[match(F(""detections.detections"").filter(F(""label"") == ""plate"").length() >= 6)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
images with no more than four lamps or lanterns,"[match(F(""ground_truth.detections"").filter(F(""label"").is_in([""lamp"", ""lantern""])).length() <= 4)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
samples with exactly one prediction of a bed,"[match(F(""model.detections"").filter(F(""label"") == ""bed"").length() == 1)]",Jacob,image,FALSE,FALSE,FALSE,FALSE,FALSE,detection
Binary file added fiftyone_api_embeddings.pkl
Binary file not shown.
Binary file modified fiftyone_docs_embeddings.pkl
Binary file not shown.
115 changes: 111 additions & 4 deletions links/dataset_view_generator.py
@@ -94,6 +94,10 @@
template=TEXT_SIMILARITY_PROMPT_TEMPLATE,
)

SIMILARITY_QUERY_PROMPT_PATH = os.path.join(
PROMPTS_DIR, "similarity_query_extractor_prompt.txt"
)

DETECTION_KEYWORDS = (
"_fp",
"_fn",
@@ -105,6 +109,17 @@

CLASSIFICATION_KEYWORDS = ("False", "True")

TEXT_SIM_KEYWORDS = (
"show",
"display",
"find me",
"images",
"pictures",
"photos",
"videos",
"samples",
)


def generate_evaluation_prompt(sample_collection, eval_key):
schema = sample_collection.get_field_schema()
@@ -671,14 +686,21 @@ def _get_confidence_subfield(field):
def _get_ground_truth_field():
return label_fields[0]

def _check_non_none_field(field, sample_collection):

# return True if field (fully-qualified) exists and contains non-None value
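# e.g. (illustrative field name): _check_non_none_field("predictions.detections.confidence", coll)
# returns True when that field's first value on the first sample is not None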
v = sample_collection.limit(1).values(field)[0]
if isinstance(v, list):
v = v[0]
return v is not None

def _get_predictions_field():
if len(label_fields) == 1:
return label_fields[0]

for field in label_fields:
conf_field = _get_confidence_subfield(field)

if sample_collection.first()[conf_field]:
if _check_non_none_field(conf_field, sample_collection):
return field
return label_fields[0]

@@ -743,6 +765,8 @@ def get_label_field(contents, used_classes):
classes_str = f'F("label").is_in({class_strs})'

contents = f"filter = {classes_str}{field_names_str}"
if ".label" in contents:
contents = contents.replace(".label", "")
return f"match_labels({contents})"
elif "is_in" in contents:
is_in = contents.split("is_in([")[1].split("])")[0]
@@ -752,8 +776,12 @@
class_strs = [f"{class_name}" for class_name in elems]
classes_str = f'F("label").is_in({class_strs})'
contents = f"filter = {classes_str}{field_names_str}"
if ".label" in contents:
contents = contents.replace(".label", "")
return f"match_labels({contents})"
elif "filter=" in contents:
if ".label" in contents:
contents = contents.replace(".label", "")
for field in label_classes.keys():
if f'F("{field}")' in contents:
contents = contents.replace(f'F("{field}")', f'F("label")')
@@ -801,9 +829,27 @@ def _validate_filter_labels(stage, label_classes):
if label_field not in label_classes.keys():
for field in label_classes.keys():
if field in label_field and field != label_field:
contents = contents.replace(label_field, field)
contents = contents.replace(args[0], f'"{field}"')
break

##### correct three-argument hallucination of form 'filter_labels("label_field", "==", "class_name")'
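# e.g. (illustrative): 'filter_labels("ground_truth", "==", "cat")' becomes
# 'filter_labels("ground_truth", F("label") == "cat")'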
eq_pattern = r'"([^"]+)",\s*"==",\s*"([^"]+)"'
eq_matches = re.findall(eq_pattern, contents)
if eq_matches:
match = eq_matches[0]
label_field = match[0]
class_name = match[1]
return f'filter_labels("{label_field}", F("label") == "{class_name}")'

##### correct three-argument hallucination of form 'filter_labels("label_field", "!=", "class_name")'
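# e.g. (illustrative): 'filter_labels("ground_truth", "!=", "dog")' becomes
# 'filter_labels("ground_truth", F("label") != "dog")'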
neq_pattern = r'"([^"]+)",\s*"!=",\s*"([^"]+)"'
neq_matches = re.findall(neq_pattern, contents)
if neq_matches:
match = neq_matches[0]
label_field = match[0]
class_name = match[1]
return f'filter_labels("{label_field}", F("label") != "{class_name}")'

##### correct second argument if needed
if len(args) == 2:
arg1 = args[1].strip()
@@ -900,6 +946,65 @@ def _validate_negation_operator(stage):
return stage


def _get_false_patterns(stage):
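# match_labels, match_tags, and exists take a genuine bool argument, so (presumably by design)
# 'bool=False' is only treated as a hallucinated argument for the remaining stages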
false_patterns = [
r",\s*False",
r",\s*invert\s*=\s*True",
]

if stage.startswith("match_labels"):
return false_patterns
elif stage.startswith("match_tags"):
return false_patterns
elif stage.startswith("exists"):
return false_patterns
else:
return false_patterns + [r",\s*bool\s*=\s*False"]


def _validate_bool_condition(stage):
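# strip hallucinated boolean args and negate the stage's condition instead, e.g. (illustrative):
# 'match(F("label") == "cat", invert=True)' -> 'match(~(F("label") == "cat"))'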
false_patterns = _get_false_patterns(stage)

for pattern in false_patterns:
false_matches = re.findall(pattern, stage)
if false_matches:
stage = re.sub(pattern, "", stage)
opening_paren_index = stage.index("(")
# Extract the function name
stage_name = stage[:opening_paren_index]

# Extract the contents
contents = stage[opening_paren_index + 1 : -1]
return f"{stage_name}(~({contents}))"
return stage


def load_similarity_query_prompt():
cache = get_cache()
key = "similarity_query_prefix"
if key not in cache:
with open(SIMILARITY_QUERY_PROMPT_PATH, "r") as f:
cache[key] = f.read()
return cache[key]


def extract_similarity_query(stage):
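# ask the LLM to strip chatty phrasing from a similarity query, e.g. (illustrative, LLM-dependent):
# 'sort_by_similarity("show me pictures of dogs", k=25)' -> 'sort_by_similarity("dogs", k=25)'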
pattern = r'sort_by_similarity\("([^"]+)"'
query = re.search(pattern, stage).group(1)
sim_query_prompt = load_similarity_query_prompt().replace("QUERY", query)
new_query = get_llm().call_as_llm(sim_query_prompt).strip()
return stage.replace(query, new_query)


def _validate_text_similarity(stage):
if "sort_by_similarity" not in stage:
return stage
if any(keyword in stage for keyword in TEXT_SIM_KEYWORDS):
return extract_similarity_query(stage)
else:
return stage


def _postprocess_stages(
stages,
sample_collection,
@@ -926,8 +1031,10 @@
_stage = _validate_filter_labels(_stage, label_classes)
if "match_labels" in _stage:
_stage = _validate_match_labels(_stage, label_classes)

_stage = _validate_negation_operator(_stage)
_stage = _validate_bool_condition(_stage)
_stage = _validate_text_similarity(_stage)
new_stages.append(_stage)

return new_stages