support adding specific columns to geometry based hash

smnorris committed Sep 27, 2024
1 parent 1f7b77c · commit 279ba3c
Showing 4 changed files with 103 additions and 25 deletions.
10 changes: 10 additions & 0 deletions source_schema.json
@@ -67,6 +67,16 @@
       ]
     }
   },
+  "hash_fields": {
+    "description": "When no primary key is available, append these fields to geometry when creating a hash based primary key",
+    "type": "array",
+    "items": {
+      "type": [
+        "string",
+        "null"
+      ]
+    }
+  },
   "metadata_url": {
     "description": "Link to source metadata, where available",
     "type": [
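A hypothetical layer config using the new key, modelled on the test fixtures added below (the source path and field names are placeholders): with no primary_key configured, the DESCRIPTION values are appended to the geometry when deriving the hash-based key.

    source = {
        "out_layer": "parks",
        "source": "tests/data/dups.geojson",
        "protocol": "http",
        "fields": ["SOURCE_DATA_ID", "DESCRIPTION"],
        "hash_fields": ["DESCRIPTION"],
        "schedule": "Q",
    }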
12 changes: 8 additions & 4 deletions src/fit_opendatadownloader/cli.py
@@ -151,10 +151,14 @@ def process(
     # download data from source to a geodataframe (df)
     df = layer.download()
 
-    # clean the data and warn if duplicates are found
-    # Duplicates processing for geometries is based on provided precision value
-    # This fails if primary key + geometry is non-unique
-    df = fdl.clean(df, layer.fields, layer.primary_key, precision=0.1)
+    # clean the data slightly
+    df = fdl.clean(
+        df,
+        fields=layer.fields,
+        primary_key=layer.primary_key,
+        hash_fields=layer.hash_fields,
+        precision=0.1,
+    )
 
     # process and dump to file if "validate" option is not set
     if not validate:
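The new tests below expect fdl.clean to raise ValueError when duplicates remain under the chosen key, so a caller wanting a clearer failure message could wrap the call. A minimal sketch assuming that behavior (LOG here stands for the module logger):

    try:
        df = fdl.clean(
            df,
            fields=layer.fields,
            primary_key=layer.primary_key,
            hash_fields=layer.hash_fields,
            precision=0.1,
        )
    except ValueError:
        # duplicate keys found in source data; assumed failure mode per the tests
        LOG.error(f"duplicate key values found in {layer.out_layer}")
        raise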
34 changes: 15 additions & 19 deletions src/fit_opendatadownloader/data_source.py
@@ -82,7 +82,8 @@ def download(self):
 def clean(
     df,
     fields,
-    primary_key=None,
+    primary_key=[],
+    hash_fields=[],
     precision=0.01,
     fdl_primary_key="fdl_load_id",
     drop_geom_duplicates=False,
@@ -108,7 +109,7 @@ def clean(
     if df.geometry.name != "geometry":
         df = df.rename_geometry("geometry")
     cleaned_column_map = {}
-    for column in fields:
+    for column in fields + hash_fields:
         cleaned_column_map[column] = re.sub(
             r"\W+", "", column.lower().strip().replace(" ", "_")
         )
@@ -117,6 +118,8 @@ def clean(
     # assign cleaned column names to fields list
     fields = list(cleaned_column_map.values())
 
+    hash_fields = [cleaned_column_map[k] for k in hash_fields]
+
     # drop any columns not listed in config (minus geometry)
     df = df[fields + ["geometry"]]
 
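The column-name cleaning above lowercases, strips, swaps spaces for underscores, then removes any remaining non-word characters; hash_fields names are run through the same map. A quick illustration with a made-up column name:

    import re

    column = " Park Name? "
    re.sub(r"\W+", "", column.lower().strip().replace(" ", "_"))  # -> "park_name"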
@@ -139,30 +142,23 @@
         LOG.info(
             f"Adding hashed key {fdl_primary_key}, based on hash of provided primary_key {','.join(pks)}"
         )
-        df = fcd.add_hash_key(df, fdl_primary_key, fields=pks, hash_geometry=False)
+        df = fcd.add_hash_key(
+            df, new_field=fdl_primary_key, fields=pks, hash_geometry=False
+        )
         pks = [fdl_primary_key]
 
-    # if no primary key provided, just use the geometry
+    # if no primary key provided, use the geometry (and additional hash fields if provided)
     else:
-        LOG.info(
-            f"Adding hashed key {fdl_primary_key}, based on hash of geometry {','.join(pks)}"
-        )
+        LOG.info(f"Adding hashed key {fdl_primary_key}, based on hash of geometry")
         df = fcd.add_hash_key(
-            df, fdl_primary_key, hash_geometry=True, precision=precision
+            df,
+            new_field=fdl_primary_key,
+            fields=hash_fields,
+            hash_geometry=True,
+            precision=precision,
         )
         pks = [fdl_primary_key]
 
-    # duplicates could be present if using geometry hash as pk
-    # report on duplicates and drop (if specified)
-    n_duplicates = len(df.drop_duplicates(subset=pks))
-    if n_duplicates > 0:
-        LOG.warning(f"{n_duplicates} duplicates are present in data")
-        if drop_geom_duplicates:
-            df = df.drop_duplicates(subset=pks)
-            LOG.warning(
-                f"Dropped {n_duplicates} duplicate rows (equivalent geometries)"
-            )
-
     return df
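The hashing itself is delegated to fcd.add_hash_key (fit_changedetector), whose internals are not part of this diff; the 40-character key asserted in test_hash_pk below suggests SHA-1 over the geometry. A minimal sketch of the general technique, with hash_fields values appended to the geometry before hashing (an illustration only, not the actual fit_changedetector implementation; sketch_hash_key is an invented name):

    import hashlib

    import shapely

    def sketch_hash_key(row, fields, precision=0.01):
        # snap coordinates to the given precision so near-identical
        # geometries hash to the same value
        geom = shapely.set_precision(row.geometry, grid_size=precision)
        # append the configured hash_fields values to the geometry bytes
        values = [str(row[f]) for f in fields] + [geom.wkb.hex()]
        return hashlib.sha1("|".join(values).encode("utf-8")).hexdigest()

    # usage sketch:
    # df["fdl_load_id"] = df.apply(
    #     lambda row: sketch_hash_key(row, hash_fields, precision), axis=1
    # )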
72 changes: 70 additions & 2 deletions tests/test_download.py
@@ -167,7 +167,7 @@ def test_hash_pk():
     ]
     layer = fdl.parse_config(sources)[0]
     df = layer.download()
-    df = fdl.clean(df, layer.fields, precision=0.1)
+    df = fdl.clean(df, fields=layer.fields, precision=0.1)
     assert df["fdl_load_id"].iloc[0] == "597b8d8bef757cb12fec15ce027fb2c6f84775d7"
 
 
@@ -185,5 +185,73 @@ def test_mixed_types():
     ]
     layer = fdl.parse_config(sources)[0]
     df = layer.download()
-    df = fdl.clean(df, layer.fields, layer.primary_key, precision=0.1)
+    df = fdl.clean(
+        df, fields=layer.fields, primary_key=layer.primary_key, precision=0.1
+    )
     assert [t.upper() for t in df.geometry.geom_type.unique()] == ["MULTIPOINT"]
+
+
+def test_duplicate_pk():
+    sources = [
+        {
+            "out_layer": "parks",
+            "source": "tests/data/dups.geojson",
+            "protocol": "http",
+            "fields": [
+                "SOURCE_DATA_ID",
+            ],
+            "primary_key": [
+                "SOURCE_DATA_ID",
+            ],
+            "schedule": "Q",
+        }
+    ]
+    layer = fdl.parse_config(sources)[0]
+    df = layer.download()
+    with pytest.raises(ValueError):
+        df = fdl.clean(
+            df, fields=layer.fields, primary_key=layer.primary_key, precision=0.1
+        )
+
+
+def test_duplicate_geom():
+    sources = [
+        {
+            "out_layer": "parks",
+            "source": "tests/data/dups.geojson",
+            "protocol": "http",
+            "fields": [
+                "SOURCE_DATA_ID",
+            ],
+            "schedule": "Q",
+        }
+    ]
+    layer = fdl.parse_config(sources)[0]
+    df = layer.download()
+    df.at[1, "geometry"] = df.at[0, "geometry"]
+    with pytest.raises(ValueError):
+        df = fdl.clean(df, fields=layer.fields, precision=0.1)
+
+
+def test_hash_fields():
+    sources = [
+        {
+            "out_layer": "parks",
+            "source": "tests/data/dups.geojson",
+            "protocol": "http",
+            "fields": ["SOURCE_DATA_ID", "DESCRIPTION"],
+            "hash_fields": ["DESCRIPTION"],
+            "schedule": "Q",
+        }
+    ]
+    layer = fdl.parse_config(sources)[0]
+    df = layer.download()
+    df.at[1, "geometry"] = df.at[0, "geometry"]
+    assert (
+        len(
+            fdl.clean(
+                df, fields=layer.fields, hash_fields=layer.hash_fields, precision=0.1
+            )
+        )
+        == 2
+    )
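The point of test_hash_fields: the two rows share a geometry, but because DESCRIPTION participates in the hash, each row still receives a distinct fdl_load_id, so clean() returns both rows instead of raising.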
