support adding specific columns to geometry based hash

smnorris committed Sep 27, 2024
1 parent 1f7b77c · commit 279ba3c
Showing 4 changed files with 103 additions and 25 deletions.
10 changes: 10 additions & 0 deletions source_schema.json
@@ -67,6 +67,16 @@
       ]
     }
   },
+  "hash_fields": {
+    "description": "When no primary key is available, append these fields to geometry when creating a hash based primary key",
+    "type": "array",
+    "items": {
+      "type": [
+        "string",
+        "null"
+      ]
+    }
+  },
   "metadata_url": {
     "description": "Link to source metadata, where available",
     "type": [
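A hypothetical layer config using the new key, modelled on the test fixtures added below (the source path and field names are placeholders): with no primary_key configured, the DESCRIPTION values are appended to the geometry when deriving the hash-based key.

    source = {
        "out_layer": "parks",
        "source": "tests/data/dups.geojson",
        "protocol": "http",
        "fields": ["SOURCE_DATA_ID", "DESCRIPTION"],
        "hash_fields": ["DESCRIPTION"],
        "schedule": "Q",
    }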
12 changes: 8 additions & 4 deletions src/fit_opendatadownloader/cli.py
@@ -151,10 +151,14 @@ def process(
     # download data from source to a geodataframe (df)
     df = layer.download()
 
-    # clean the data and warn if duplicates are found
-    # Duplicates processing for geometries is based on provided precision value
-    # This fails if primary key + geometry is non-unique
-    df = fdl.clean(df, layer.fields, layer.primary_key, precision=0.1)
+    # clean the data slightly
+    df = fdl.clean(
+        df,
+        fields=layer.fields,
+        primary_key=layer.primary_key,
+        hash_fields=layer.hash_fields,
+        precision=0.1,
+    )
 
     # process and dump to file if "validate" option is not set
     if not validate:
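The new tests below expect fdl.clean to raise ValueError when duplicates remain under the chosen key, so a caller wanting a clearer failure message could wrap the call. A minimal sketch assuming that behavior (LOG here stands for the module logger):

    try:
        df = fdl.clean(
            df,
            fields=layer.fields,
            primary_key=layer.primary_key,
            hash_fields=layer.hash_fields,
            precision=0.1,
        )
    except ValueError:
        # duplicate keys found in source data; assumed failure mode per the tests
        LOG.error(f"duplicate key values found in {layer.out_layer}")
        raise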
34 changes: 15 additions & 19 deletions src/fit_opendatadownloader/data_source.py
@@ -82,7 +82,8 @@ def download(self):
 def clean(
     df,
     fields,
-    primary_key=None,
+    primary_key=[],
+    hash_fields=[],
     precision=0.01,
     fdl_primary_key="fdl_load_id",
     drop_geom_duplicates=False,
@@ -108,7 +109,7 @@ def clean(
     if df.geometry.name != "geometry":
         df = df.rename_geometry("geometry")
     cleaned_column_map = {}
-    for column in fields:
+    for column in fields + hash_fields:
         cleaned_column_map[column] = re.sub(
             r"\W+", "", column.lower().strip().replace(" ", "_")
         )
@@ -117,6 +118,8 @@ def clean(
     # assign cleaned column names to fields list
     fields = list(cleaned_column_map.values())
 
+    hash_fields = [cleaned_column_map[k] for k in hash_fields]
+
     # drop any columns not listed in config (minus geometry)
     df = df[fields + ["geometry"]]
 
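The column-name cleaning above lowercases, strips, swaps spaces for underscores, then removes any remaining non-word characters; hash_fields names are run through the same map. A quick illustration with a made-up column name:

    import re

    column = " Park Name? "
    re.sub(r"\W+", "", column.lower().strip().replace(" ", "_"))  # -> "park_name"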
@@ -139,30 +142,23 @@
         LOG.info(
             f"Adding hashed key {fdl_primary_key}, based on hash of provided primary_key {','.join(pks)}"
         )
-        df = fcd.add_hash_key(df, fdl_primary_key, fields=pks, hash_geometry=False)
+        df = fcd.add_hash_key(
+            df, new_field=fdl_primary_key, fields=pks, hash_geometry=False
+        )
         pks = [fdl_primary_key]
 
-    # if no primary key provided, just use the geometry
+    # if no primary key provided, use the geometry (and additional hash fields if provided)
     else:
-        LOG.info(
-            f"Adding hashed key {fdl_primary_key}, based on hash of geometry {','.join(pks)}"
-        )
+        LOG.info(f"Adding hashed key {fdl_primary_key}, based on hash of geometry")
         df = fcd.add_hash_key(
-            df, fdl_primary_key, hash_geometry=True, precision=precision
+            df,
+            new_field=fdl_primary_key,
+            fields=hash_fields,
+            hash_geometry=True,
+            precision=precision,
         )
         pks = [fdl_primary_key]
 
-    # duplicates could be present if using geometry hash as pk
-    # report on duplicates and drop (if specified)
-    n_duplicates = len(df.drop_duplicates(subset=pks))
-    if n_duplicates > 0:
-        LOG.warning(f"{n_duplicates} duplicates are present in data")
-        if drop_geom_duplicates:
-            df = df.drop_duplicates(subset=pks)
-            LOG.warning(
-                f"Dropped {n_duplicates} duplicate rows (equivalent geometries)"
-            )
-
     return df
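The hashing itself is delegated to fcd.add_hash_key (fit_changedetector), whose internals are not part of this diff; the 40-character key asserted in test_hash_pk below suggests SHA-1 over the geometry. A minimal sketch of the general technique, with hash_fields values appended to the geometry before hashing (an illustration only, not the actual fit_changedetector implementation; sketch_hash_key is an invented name):

    import hashlib

    import shapely

    def sketch_hash_key(row, fields, precision=0.01):
        # snap coordinates to the given precision so near-identical
        # geometries hash to the same value
        geom = shapely.set_precision(row.geometry, grid_size=precision)
        # append the configured hash_fields values to the geometry bytes
        values = [str(row[f]) for f in fields] + [geom.wkb.hex()]
        return hashlib.sha1("|".join(values).encode("utf-8")).hexdigest()

    # usage sketch:
    # df["fdl_load_id"] = df.apply(
    #     lambda row: sketch_hash_key(row, hash_fields, precision), axis=1
    # )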
72 changes: 70 additions & 2 deletions tests/test_download.py
@@ -167,7 +167,7 @@ def test_hash_pk():
     ]
     layer = fdl.parse_config(sources)[0]
     df = layer.download()
-    df = fdl.clean(df, layer.fields, precision=0.1)
+    df = fdl.clean(df, fields=layer.fields, precision=0.1)
     assert df["fdl_load_id"].iloc[0] == "597b8d8bef757cb12fec15ce027fb2c6f84775d7"
 
 
@@ -185,5 +185,73 @@ def test_mixed_types():
     ]
     layer = fdl.parse_config(sources)[0]
     df = layer.download()
-    df = fdl.clean(df, layer.fields, layer.primary_key, precision=0.1)
+    df = fdl.clean(
+        df, fields=layer.fields, primary_key=layer.primary_key, precision=0.1
+    )
     assert [t.upper() for t in df.geometry.geom_type.unique()] == ["MULTIPOINT"]
+
+
+def test_duplicate_pk():
+    sources = [
+        {
+            "out_layer": "parks",
+            "source": "tests/data/dups.geojson",
+            "protocol": "http",
+            "fields": [
+                "SOURCE_DATA_ID",
+            ],
+            "primary_key": [
+                "SOURCE_DATA_ID",
+            ],
+            "schedule": "Q",
+        }
+    ]
+    layer = fdl.parse_config(sources)[0]
+    df = layer.download()
+    with pytest.raises(ValueError):
+        df = fdl.clean(
+            df, fields=layer.fields, primary_key=layer.primary_key, precision=0.1
+        )
+
+
+def test_duplicate_geom():
+    sources = [
+        {
+            "out_layer": "parks",
+            "source": "tests/data/dups.geojson",
+            "protocol": "http",
+            "fields": [
+                "SOURCE_DATA_ID",
+            ],
+            "schedule": "Q",
+        }
+    ]
+    layer = fdl.parse_config(sources)[0]
+    df = layer.download()
+    df.at[1, "geometry"] = df.at[0, "geometry"]
+    with pytest.raises(ValueError):
+        df = fdl.clean(df, fields=layer.fields, precision=0.1)
+
+
+def test_hash_fields():
+    sources = [
+        {
+            "out_layer": "parks",
+            "source": "tests/data/dups.geojson",
+            "protocol": "http",
+            "fields": ["SOURCE_DATA_ID", "DESCRIPTION"],
+            "hash_fields": ["DESCRIPTION"],
+            "schedule": "Q",
+        }
+    ]
+    layer = fdl.parse_config(sources)[0]
+    df = layer.download()
+    df.at[1, "geometry"] = df.at[0, "geometry"]
+    assert (
+        len(
+            fdl.clean(
+                df, fields=layer.fields, hash_fields=layer.hash_fields, precision=0.1
+            )
+        )
+        == 2
+    )
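The point of test_hash_fields: the two rows share a geometry, but because DESCRIPTION participates in the hash, each row still receives a distinct fdl_load_id, so clean() returns both rows instead of raising.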
