Commit
* Added test case for output metadata
* Added JSON-formatted source string to attributes
* Added title string to attributes
* Updated ID string to include level
Showing 6 changed files with 315 additions and 14 deletions.
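The metadata changes listed in the commit message can be summarised as a short sketch of the global attributes that the new test case (the last file below) expects on the L2 output datasets. The values shown are illustrative, pieced together from the test assertions; in particular the pypromice version string is an example value, not taken from the repository:

```python
# Illustrative only: the shape of the global attributes asserted by the new test case.
# The "id" now encodes the processing level and time resolution, "title" is a
# human-readable string, and "source" is a JSON-encoded provenance mapping.
import json

expected_attrs = {
    "id": "dk.geus.promice.station.TEST1.L2.hourly",
    "title": "AWS measurements from TEST1 processed to level 2. Hourly average.",
    "source": json.dumps(
        {
            "pypromice": "1.3.0",  # installed package version (example value)
            "l0_config_file": "test_config1_raw.toml:<commit hash>",
            "l0_data_root": "data:<commit hash>",
        }
    ),
}
```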
@@ -0,0 +1,68 @@
import subprocess
import os
from pathlib import Path

import logging
from typing import Optional

logger = logging.getLogger(__name__)


def get_commit_hash_and_check_dirty(file_path) -> str:
    repo_path = Path(file_path).parent

    try:
        # Ensure the file path is relative to the repository
        relative_file_path = os.path.relpath(file_path, repo_path)

        # Get the latest commit hash for the file
        commit_hash = (
            subprocess.check_output(
                [
                    "git",
                    "-C",
                    repo_path,
                    "log",
                    "-n",
                    "1",
                    "--pretty=format:%H",
                    #"--",
                    #relative_file_path,
                ],
                stderr=subprocess.STDOUT,
            )
            .strip()
            .decode("utf-8")
        )

        # Check if the file is dirty (has uncommitted changes)
        diff_output = (
            subprocess.check_output(
                ["git", "-C", repo_path, "diff", "--", relative_file_path],
                stderr=subprocess.STDOUT,
            )
            .strip()
            .decode("utf-8")
        )

        # If diff_output is not empty, the file has uncommitted changes
        is_dirty = len(diff_output) > 0

        if is_dirty:
            logger.warning(f"Warning: The file {file_path} is dirty compared to the last commit. {commit_hash}")
            return 'unknown'
        if commit_hash == "":
            logger.warning(f"Warning: The file {file_path} is not under version control.")
            return 'unknown'

        print(f"Commit hash: {commit_hash}")
        return commit_hash
    except subprocess.CalledProcessError as e:
        logger.warning(f"Error: {e.output.decode('utf-8')}")
        return 'unknown'


# %%

get_commit_hash_and_check_dirty("/Users/maclu/data/aws-l0/README.md")
get_commit_hash_and_check_dirty("/Users/maclu/data/aws-l0/dirty.txt")
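The helper above returns the latest commit hash, or 'unknown' when the file is dirty, untracked, or the git call fails. Judging by the rsplit(":", 1) calls in the new test case, the hash is presumably combined with a file or directory name into a single "name:hash" token before being embedded in the JSON-formatted source attribute. A minimal sketch of that assumed composition (source_token is a hypothetical helper, not part of the commit):

```python
# Hypothetical composition of the "name:hash" tokens that the test later splits
# with rsplit(":", 1). Uses get_commit_hash_and_check_dirty defined above; this
# is an assumed sketch, not the repository's actual implementation.
from pathlib import Path

def source_token(path: str) -> str:
    """Return '<basename>:<commit hash or unknown>' for the given path."""
    return f"{Path(path).name}:{get_commit_hash_and_check_dirty(path)}"

# Example: source_token("/Users/maclu/data/aws-l0/README.md") -> "README.md:<commit hash>"
```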
@@ -0,0 +1,49 @@
station_id = 'TEST1'
logger_type = 'CR1000X'
nodata = ['-999', 'NAN'] # if one is a string, all must be strings
number_of_booms = 1 # 1-boom = promice, 2-boom = gc-net
latitude = 79.91
longitude = 24.09

['test_raw1.txt']
format = 'raw'
skiprows = 4
hygroclip_t_offset = 40
dsr_eng_coef = 14.01
usr_eng_coef = 12.72
dlr_eng_coef = 11.08
ulr_eng_coef = 11.42
pt_z_coef = 0.39571
pt_z_p_coef = 1022.5
pt_z_factor = 2.5
pt_antifreeze = 50
boom_azimuth = 0
columns = ['time','rec','SKIP_3','p_u','t_u','SKIP_6','rh_u','wspd_u','wdir_u','wdir_std_u',
           'dsr','usr','dlr','ulr','t_rad','z_boom_u','z_boom_q_u','z_stake','z_stake_q','z_pt',
           't_i_1','t_i_2','t_i_3','t_i_4','t_i_5','t_i_6','t_i_7',
           't_i_8','tilt_x','tilt_y','gps_time','gps_lat','gps_lon',
           'gps_alt','gps_geoid','SKIP_36','gps_q','gps_numsat','gps_hdop',
           't_log','fan_dc_u','batt_v_ini','batt_v']

['test_raw_SlimTableMem1.txt']
format = 'STM'
skiprows = 4
latitude = 79.83
longitude = 25.17
hygroclip_t_offset = 40
dsr_eng_coef = 12.55
usr_eng_coef = 12.52
dlr_eng_coef = 13.73
ulr_eng_coef = 10.43
pt_z_coef = 0.4369
pt_z_p_coef = 1005.9
pt_z_factor = 2.5
pt_antifreeze = 50
boom_azimuth = 0
columns = ['time','rec','min_y','p_u','t_u','SKIP_6','rh_u','wspd_u','wdir_u',
           'wd_std_u','dsr','usr','dlr','ulr','t_rad','z_boom_u',
           'z_boom_q_u','z_stake','z_stake_q','z_pt','t_i_1','t_i_2',
           't_i_3','t_i_4','t_i_5','t_i_6','t_i_7','t_i_8','tilt_x',
           'tilt_y','gps_time','gps_lat','gps_lon','gps_alt',
           'gps_geoid','SKIP_36','gps_q','gps_numsats','gps_hdop',
           't_log','fan_dc_u','batt_v_ss','batt_v']
@@ -0,0 +1,25 @@
station_id = 'TEST1'
logger_type = 'CR1000X'
nodata = ['-999', 'NAN'] # if one is a string, all must be strings
number_of_booms = 1 # 1-boom = promice, 2-boom = gc-net
latitude = 79.91
longitude = 24.09

['test_raw_transmitted1.txt']
format = 'TX'
skiprows = 0
hygroclip_t_offset = 0
dsr_eng_coef = 12.2
usr_eng_coef = 11.15
dlr_eng_coef = 10.28
ulr_eng_coef = 8.24
pt_z_coef = 0.3999
pt_z_p_coef = 982.4
pt_z_factor = 2.5
pt_antifreeze = 50
boom_azimuth = 0
columns = ['time','rec','p_u','t_u','SKIP_5','rh_u','wspd_u','wdir_u','dsr',
           'usr','dlr','ulr','t_rad','z_boom_u','z_stake','z_pt',
           't_i_1','t_i_2','t_i_3','t_i_4','t_i_5','t_i_6','t_i_7',
           't_i_8','tilt_x','tilt_y','gps_time','gps_lat','gps_lon',
           'gps_alt','gps_hdop','fan_dc_u','batt_v']
@@ -0,0 +1,131 @@
import datetime
import json
import tempfile
import unittest
from importlib import metadata
from pathlib import Path

import pandas as pd
import xarray as xr

from pypromice.process.get_l2 import get_l2

TEST_ROOT = Path(__file__).parent.parent
TEST_DATA_ROOT_PATH = TEST_ROOT / "data"


class GetL2TestCase(unittest.TestCase):
    def test_get_l2_tx(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            output_path = Path(tmpdirname) / "output"
            config_file = TEST_DATA_ROOT_PATH / "test_config1_tx.toml"

            aws = get_l2(
                config_file=config_file.as_posix(),
                inpath=TEST_DATA_ROOT_PATH.as_posix(),
                outpath=output_path,
                variables=None,
                metadata=None,
            )

            station_id = "TEST1"
            expected_dir = output_path / station_id
            expected_dataset_paths = {
                "nc_hour": expected_dir / f"{station_id}_hour.nc",
                "csv_hour": expected_dir / f"{station_id}_hour.csv",
            }
            self.assertSetEqual({expected_dir}, set(output_path.iterdir()))
            self.assertSetEqual(
                set(expected_dataset_paths.values()), set(expected_dir.iterdir())
            )

    def test_get_l2_raw(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            output_path = Path(tmpdirname) / "output"
            config_file = TEST_DATA_ROOT_PATH / "test_config1_raw.toml"

            aws = get_l2(
                config_file=config_file.as_posix(),
                inpath=TEST_DATA_ROOT_PATH.as_posix(),
                outpath=output_path,
                variables=None,
                metadata=None,
            )

            station_id = "TEST1"
            expected_dir = output_path / station_id
            expected_dataset_paths = {
                "nc_hour": expected_dir / f"{station_id}_hour.nc",
                "csv_hour": expected_dir / f"{station_id}_hour.csv",
                "nc_10min": expected_dir / f"{station_id}_10min.nc",
                "csv_10min": expected_dir / f"{station_id}_10min.csv",
            }
            self.assertSetEqual({expected_dir}, set(output_path.iterdir()))
            self.assertSetEqual(
                set(expected_dataset_paths.values()), set(expected_dir.iterdir())
            )
            # Test output file format
            dataset_hour = xr.open_dataset(expected_dataset_paths["nc_hour"])
            dataset_10min = xr.open_dataset(expected_dataset_paths["nc_10min"])

            self.assertEqual(
                dataset_10min.attrs["id"],
                f"dk.geus.promice.station.{station_id}.L2.10min",
            )
            self.assertEqual(
                dataset_hour.attrs["id"],
                f"dk.geus.promice.station.{station_id}.L2.hourly",
            )
            self.assertEqual(
                dataset_10min.attrs["title"],
                f"AWS measurements from {station_id} processed to level 2. 10min average.",
            )
            self.assertEqual(
                dataset_hour.attrs["title"],
                f"AWS measurements from {station_id} processed to level 2. Hourly average.",
            )

            t0 = datetime.datetime.utcnow()
            for dataset in [dataset_hour, dataset_10min]:
                self.assertEqual(dataset.attrs["format"], "raw")
                self.assertEqual(dataset.attrs["station_id"], station_id)
                self.assertIsInstance(dataset.attrs["date_created"], str)
                date_created = pd.to_datetime(dataset.attrs["date_created"])
                self.assertLess(t0 - date_created, datetime.timedelta(seconds=1))
                self.assertEqual(
                    dataset.attrs["date_issued"], dataset.attrs["date_created"]
                )
                self.assertEqual(
                    dataset.attrs["date_modified"], dataset.attrs["date_created"]
                )
                self.assertEqual(
                    dataset.attrs["processing_level"],
                    "Level 2",
                )
                self.assertEqual(
                    dataset.attrs["institution"],
                    "Geological Survey of Denmark and Greenland (GEUS)",
                )
                source_decoded = json.loads(dataset.attrs["source"])
                self.assertSetEqual(
                    {"pypromice", "l0_config_file", "l0_data_root"},
                    set(source_decoded.keys()),
                )
                self.assertEqual(
                    source_decoded["pypromice"],
                    metadata.version("pypromice"),
                )
                config_file_name, config_hash = source_decoded["l0_config_file"].rsplit(
                    ":", 1
                )
                self.assertEqual(
                    config_file_name,
                    config_file.name,
                )
                data_root_name, data_root_hash = source_decoded["l0_data_root"].rsplit(":", 1)
                self.assertEqual(
                    data_root_name,
                    TEST_DATA_ROOT_PATH.name,
                )
                self.assertNotEqual(config_hash, 'unknown', 'This test will fail while the commit is dirty')
                self.assertNotEqual(data_root_hash, 'unknown', 'This test will fail while the commit is dirty')
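The two closing assertions make the hash check strict: the test fails while the L0 config file or data root contains uncommitted changes. The test case itself can be run with the standard unittest discovery mechanism; a hypothetical invocation (the start directory and pattern depend on the repository's actual test layout):

```python
# Hypothetical runner: discover and run the get_l2 tests with the standard library.
# Adjust start_dir and pattern to match where the test module actually lives.
import unittest

suite = unittest.defaultTestLoader.discover(start_dir="tests", pattern="*get_l2*.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```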