From add1d9cff89d8c355f852a92168ef067ac6ebd83 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com" <han.hou@alleninstitute.org>
Date: Fri, 22 Nov 2024 20:52:00 +0000
Subject: [PATCH 1/4] query "behavior" instead of "fib"

---
 code/util/fetch_data_docDB.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index eb323cd..3009ddb 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -110,10 +110,11 @@ def strip_dict_for_id(co_asset_id_dict_list):
     return result_list                   
 
 def fetch_fip_data(client):
-    # search for records that have the "fib" (for fiber photometry) modality in data_description
-    logger.warning("fetching 'fib' records...")
+    # To compare the FIP pipeline with my temporary pipeline for all behavior sessions, we should 
+    # query "behavior" from the data_description.modality.abbreviation field.
+    logger.warning("fetching 'behavior' records...")
     modality_results = client.retrieve_docdb_records(
-        filter_query={"data_description.modality.abbreviation": "fib"},
+        filter_query={"data_description.modality.abbreviation": "behavior"},
         paginate_batch_size=500
     )              
     logger.warning(f"found {len(modality_results)} results")

From 1ccb0b5fa6eed4a0be670a5453a8730d6883d5e1 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com" <han.hou@alleninstitute.org>
Date: Fri, 22 Nov 2024 21:45:35 +0000
Subject: [PATCH 2/4] fix a bug where procedure doesn't have "procedure_type"
 field

---
 code/util/fetch_data_docDB.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index 3009ddb..7c29047 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -43,7 +43,7 @@ def fetch_individual_procedures(r):
 def fetch_fiber_probes(r):
     probes = []
     for sp in fetch_individual_procedures(r):
-        if sp['procedure_type'] == 'Fiber implant':
+        if sp.get('procedure_type') == 'Fiber implant':
             probes += sp['probes']
     return probes
                 

From d97a2159da745f335c6fb275a1420e373ec15bd8 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com" <han.hou@alleninstitute.org>
Date: Fri, 22 Nov 2024 23:40:15 +0000
Subject: [PATCH 3/4] use "software.name": "dynamic-foraging-task"

---
 code/util/fetch_data_docDB.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index 7c29047..4cfb665 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -17,7 +17,7 @@
 @st.cache_data(ttl=3600*12) # Cache the df_docDB up to 12 hours
 def load_data_from_docDB():
     client = load_client()
-    df = fetch_fip_data(client)
+    df = fetch_dynamic_foraging_data(client)
     return df
 
 @st.cache_resource
@@ -109,33 +109,32 @@ def strip_dict_for_id(co_asset_id_dict_list):
             
     return result_list                   
 
-def fetch_fip_data(client):
-    # To compare the FIP pipeline with my temporary pipeline for all behavior sessions, we should 
-    # query "behavior" from the data_description.modality.abbreviation field.
-    logger.warning("fetching 'behavior' records...")
-    modality_results = client.retrieve_docdb_records(
-        filter_query={"data_description.modality.abbreviation": "behavior"},
+def fetch_dynamic_foraging_data(client):
+    # To compare the new FIP pipeline with my temporary pipeline for all dynamic foraging sessions,
+    # let's directly query the software name
+    logger.warning("fetching 'dynamic foraging' in software name...")
+    software_name_results = client.retrieve_docdb_records(
+        filter_query={"session.data_streams.software.name": "dynamic-foraging-task",
+                      "name": {"$not": {"$regex": ".*processed.*"}}, # only raw data
+                      },
         paginate_batch_size=500
     )              
-    logger.warning(f"found {len(modality_results)} results")
+    logger.warning(f"found {len(software_name_results)} results")
 
     # there are more from the past that didn't specify modality correctly. 
     # until this is fixed, need to guess by asset name 
     logger.warning("fetching FIP records by name...")
-    name_results = client.retrieve_docdb_records(
+    name_FIP_results = client.retrieve_docdb_records(
         filter_query={"name": {"$regex": "^FIP.*"}},
         paginate_batch_size=500
     )
-    logger.warning(f"found {len(name_results)} results")
+    logger.warning(f"found {len(name_FIP_results)} results")
 
     # in case there is overlap between these two queries, filter down to a single list with unique IDs
-    unique_results_by_id = {**{ r['_id']: r for r in modality_results }, **{ r['_id']: r for r in name_results }}
+    unique_results_by_id = {**{ r['_id']: r for r in software_name_results }, **{ r['_id']: r for r in name_FIP_results }}
     results = list(unique_results_by_id.values())
     logger.warning(f"found {len(results)} unique results")
-    
-    # filter out results with 'processed' in the name because I can't rely on data_description.data_level :(
-    results = [ r for r in results if not 'processed' in r['name'] ]
-    
+        
     # make a dataframe
     records_df = pd.DataFrame.from_records([map_record_to_dict(d) for d in results ])
     

From 5ffa67b782fcae3bcbd6819ba5c2df07b9db7e16 Mon Sep 17 00:00:00 2001
From: "houhan@gmail.com" <han.hou@alleninstitute.org>
Date: Sat, 23 Nov 2024 02:11:27 +0000
Subject: [PATCH 4/4] add has_FIB_in_data_streams

---
 code/util/fetch_data_docDB.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/code/util/fetch_data_docDB.py b/code/util/fetch_data_docDB.py
index 4cfb665..811f64e 100644
--- a/code/util/fetch_data_docDB.py
+++ b/code/util/fetch_data_docDB.py
@@ -39,14 +39,14 @@ def fetch_individual_procedures(r):
     else: # pre Surgery
         sub_procs = (r.get('procedures') or {}).get('subject_procedures') or {}
         yield from sub_procs
-        
+
 def fetch_fiber_probes(r):
     probes = []
     for sp in fetch_individual_procedures(r):
         if sp.get('procedure_type') == 'Fiber implant':
             probes += sp['probes']
     return probes
-                
+
 def fetch_injections(r):    
     injections=[]
     for sp in fetch_individual_procedures(r):        
@@ -60,7 +60,7 @@ def fetch_injections(r):
             })            
        
     return injections
-    
+
 
 def get_viruses(injections):
     virus_names = []
@@ -86,7 +86,7 @@ def get_viruses(injections):
                     else:
                         NM_recorded.append(NM)
     return virus_names, NM_recorded   
-    
+
 
 def strip_dict_for_id(co_asset_id_dict_list):
     result_list = []
@@ -166,8 +166,6 @@ def fetch_dynamic_foraging_data(client):
     return records_df
 
 
-
-
 def map_record_to_dict(record):
     """ function to map a metadata dictionary to a simpler dictionary with the fields we care about """
     dd = record.get('data_description', {}) or {}
@@ -175,16 +173,28 @@ def map_record_to_dict(record):
     creation_time = dd.get('creation_time', '') or ''
     subject = record.get('subject', {}) or {}
     subject_id = subject.get('subject_id') or ''
-    subject_genotype = subject.get('genotype') or ''
-    session = record.get('session') or {}
-    task_type = session.get('session_type') or ''
+    subject_genotype = subject.get("genotype") or ""
+    session = record.get("session") or {}
+    task_type = session.get("session_type") or ""
+
+    # -- Check whether FIP exists
+    # per this issue, https://github.com/AllenNeuralDynamics/dynamic-foraging-task/issues/1056
+    # here I check whether fib exists in the data streams
+    data_streams = session.get("data_streams") or []
+    has_fib_in_data_streams = any(
+        [
+            "fib" in (mod.get("abbreviation") or "")
+            for ds in (session.get("data_streams") or [])
+            for mod in (ds.get("stream_modalities") or [])
+        ]
+    )
 
     try:
         injections = fetch_injections(record)
         virus_names, NM_recorded = get_viruses(injections)
     except:
         injections, virus_names, NM_recorded = [], [], []
-        
+
     return {
         'session_loc': record['location'],
         'session_name': record['name'],
@@ -196,6 +206,7 @@ def map_record_to_dict(record):
         'injections': str(injections),
         'task_type': task_type, 
         'virus':virus_names, 
+        'has_fib_in_data_streams': has_fib_in_data_streams,
         # 'NM_recorded':NM_recorded
         
     }
@@ -206,5 +217,3 @@ def find_result(x, lookup):
         if result_name.startswith(x):
             return result
     return {}
-    
-