diff --git a/src/pypromice/process/aws.py b/src/pypromice/process/aws.py
index 68aa151e..dc290cb3 100644
--- a/src/pypromice/process/aws.py
+++ b/src/pypromice/process/aws.py
@@ -19,7 +19,7 @@
 from pypromice.process.L1toL2 import toL2
 from pypromice.process.L2toL3 import toL3
 from pypromice.process import write, load, utilities
-from pypromice.process.resample import resample_dataset
+from pypromice.utilities.git import get_commit_hash_and_check_dirty
 
 pd.set_option("display.precision", 2)
 xr.set_options(keep_attrs=True)
@@ -60,6 +60,19 @@ def __init__(
         self.vars = pypromice.resources.load_variables(var_file)
         self.meta = pypromice.resources.load_metadata(meta_file)
 
+        config_hash = get_commit_hash_and_check_dirty(Path(config_file))
+        config_source_string = f"{Path(config_file).name}:{config_hash}"
+        inpath_hash = get_commit_hash_and_check_dirty(Path(inpath))
+        inpath_source_string = f"{Path(inpath).name}:{inpath_hash}"
+
+        source_dict = dict(
+            pypromice = metadata.version("pypromice"),
+            l0_config_file = config_source_string,
+            l0_data_root = inpath_source_string,
+        )
+        self.meta["source"] = json.dumps(source_dict)
+
+
         # Load config file
         L0 = self.loadL0()
         self.L0=[]
diff --git a/src/pypromice/process/write.py b/src/pypromice/process/write.py
--- a/src/pypromice/process/write.py
+++ b/src/pypromice/process/write.py
@@ -6,7 +6,6 @@
 import datetime
 import logging
 import os
-from importlib import metadata
 
 import numpy as np
 import pandas as pd
@@ -310,20 +309,36 @@ def addMeta(ds, meta):
         sample_rate = "monthly"
 
     if "station_id" in ds.attrs.keys():
-        ds.attrs["id"] = (
-            "dk.geus.promice.station." + ds.attrs["station_id"] + "." + sample_rate
-        )
+        id_kind = "station"
+        id_value = ds.attrs["station_id"]
     else:
-        ds.attrs["id"] = (
-            "dk.geus.promice.site." + ds.attrs["site_id"] + "." + sample_rate
-        )
+        id_kind = "site"
+        id_value = ds.attrs["site_id"]
+    id_components = [
+        "dk",
+        "geus",
+        "promice",
+        id_kind,
+        id_value,
+        ds.attrs["level"],
+        sample_rate,
+    ]
+    ds.attrs["id"] = ".".join(id_components)
 
     ds.attrs["history"] = "Generated on " + datetime.datetime.utcnow().isoformat()
     ds.attrs["date_created"] = str(datetime.datetime.now().isoformat())
     ds.attrs["date_modified"] = ds.attrs["date_created"]
     ds.attrs["date_issued"] = ds.attrs["date_created"]
     ds.attrs["date_metadata_modified"] = ds.attrs["date_created"]
-    ds.attrs["processing_level"] = ds.attrs["level"].replace("L", "level ")
+    ds.attrs["processing_level"] = ds.attrs["level"].replace("L", "Level ")
+
+    # Site datasets have no station_id attribute, so reuse the id resolved above
+    title_string_format = "AWS measurements from {station_id} processed to {processing_level}. {sample_rate} average."
+    ds.attrs["title"] = title_string_format.format(
+        station_id=id_value,
+        processing_level=ds.attrs["processing_level"].lower(),
+        sample_rate=sample_rate.capitalize(),
+    )
 
     if "lat" in ds.keys():
         lat_min = ds["lat"].min().values
@@ -384,11 +399,6 @@ def addMeta(ds, meta):
     ds.attrs["time_coverage_start"] = str(ds["time"][0].values)
     ds.attrs["time_coverage_end"] = str(ds["time"][-1].values)
 
-    try:
-        ds.attrs["source"] = "pypromice v" + str(metadata.version("pypromice"))
-    except:
-        ds.attrs["source"] = "pypromice"
-
     # https://www.digi.com/resources/documentation/digidocs/90001437-13/reference/r_iso_8601_duration_format.htm
     try:
         ds.attrs["time_coverage_duration"] = str(
diff --git a/src/pypromice/utilities/git.py b/src/pypromice/utilities/git.py
new file mode 100644
--- /dev/null
+++ b/src/pypromice/utilities/git.py
@@ -0,0 +1,71 @@
+import logging
+import os
+import subprocess
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def get_commit_hash_and_check_dirty(file_path) -> str:
+    """Return the git commit hash describing the state of ``file_path``.
+
+    git is run in the parent directory of ``file_path``. The returned hash is
+    the latest commit of the repository containing that directory.
+
+    Parameters
+    ----------
+    file_path : str or pathlib.Path
+        File or directory expected to live inside a git work tree.
+
+    Returns
+    -------
+    str
+        The commit hash, or ``'unknown'`` when the path has uncommitted
+        changes, no commit is found, or git cannot be run.
+    """
+    repo_path = Path(file_path).parent
+
+    try:
+        # git is invoked with -C repo_path, so address the file relative to it
+        relative_file_path = os.path.relpath(file_path, repo_path)
+
+        # NOTE(review): the log is not limited to relative_file_path, so this
+        # is the repository HEAD rather than the last commit touching the file
+        # - confirm that is the intended provenance.
+        commit_hash = (
+            subprocess.check_output(
+                ["git", "-C", str(repo_path), "log", "-n", "1", "--pretty=format:%H"],
+                stderr=subprocess.STDOUT,
+            )
+            .strip()
+            .decode("utf-8")
+        )
+
+        # Diff against HEAD so staged changes count as uncommitted as well
+        diff_output = (
+            subprocess.check_output(
+                ["git", "-C", str(repo_path), "diff", "HEAD", "--", relative_file_path],
+                stderr=subprocess.STDOUT,
+            )
+            .strip()
+            .decode("utf-8")
+        )
+    except subprocess.CalledProcessError as e:
+        logger.warning(f"Error: {e.output.decode('utf-8')}")
+        return "unknown"
+    except OSError as e:
+        # e.g. git is not installed or the directory does not exist
+        logger.warning(f"Error: unable to run git for {file_path}: {e}")
+        return "unknown"
+
+    if diff_output:
+        logger.warning(
+            f"Warning: The file {file_path} is dirty compared to the last commit. {commit_hash}"
+        )
+        return "unknown"
+    if commit_hash == "":
+        logger.warning(f"Warning: The file {file_path} is not under version control.")
+        return "unknown"
+
+    logger.debug(f"Commit hash: {commit_hash}")
+    return commit_hash
diff --git a/tests/data/test_config1_raw.toml b/tests/data/test_config1_raw.toml
new file mode 100644
index 00000000..0e397868
--- /dev/null
+++ b/tests/data/test_config1_raw.toml
@@ -0,0 +1,49 @@
+station_id = 'TEST1'
+logger_type = 'CR1000X'
+nodata = ['-999', 'NAN'] # if one is a string, all must be strings
+number_of_booms = 1 #1-boom = promice, 2-boom = gc-net
+latitude = 79.91
+longitude = 24.09
+
+['test_raw1.txt']
+format = 'raw'
+skiprows = 4
+hygroclip_t_offset = 40
+dsr_eng_coef = 14.01
+usr_eng_coef = 12.72
+dlr_eng_coef = 11.08
+ulr_eng_coef = 11.42
+pt_z_coef = 0.39571
+pt_z_p_coef = 1022.5
+pt_z_factor = 2.5
+pt_antifreeze = 50
+boom_azimuth = 0
+columns = ['time','rec','SKIP_3','p_u','t_u','SKIP_6','rh_u','wspd_u','wdir_u','wdir_std_u',
+           'dsr','usr','dlr','ulr','t_rad','z_boom_u','z_boom_q_u','z_stake','z_stake_q','z_pt',
+           't_i_1','t_i_2','t_i_3','t_i_4','t_i_5','t_i_6','t_i_7',
+           't_i_8','tilt_x','tilt_y','gps_time','gps_lat','gps_lon',
+           'gps_alt','gps_geoid','SKIP_36','gps_q','gps_numsat','gps_hdop',
+           't_log','fan_dc_u','batt_v_ini','batt_v']
+
+['test_raw_SlimTableMem1.txt']
+format = 'STM'
+skiprows = 4
+latitude = 79.83
+longitude = 25.17
+hygroclip_t_offset = 40
+dsr_eng_coef = 12.55
+usr_eng_coef = 12.52
+dlr_eng_coef = 13.73
+ulr_eng_coef = 10.43
+pt_z_coef = 0.4369
+pt_z_p_coef = 1005.9
+pt_z_factor = 2.5
+pt_antifreeze = 50
+boom_azimuth = 0
+columns = ['time','rec','min_y','p_u','t_u','SKIP_6','rh_u','wspd_u','wdir_u',
+           'wd_std_u','dsr','usr','dlr','ulr','t_rad','z_boom_u',
+           'z_boom_q_u','z_stake','z_stake_q','z_pt','t_i_1','t_i_2',
+           't_i_3','t_i_4','t_i_5','t_i_6','t_i_7','t_i_8','tilt_x',
+           'tilt_y','gps_time','gps_lat','gps_lon','gps_alt',
+           'gps_geoid','SKIP_36','gps_q','gps_numsats','gps_hdop',
+           't_log','fan_dc_u','batt_v_ss','batt_v']
diff --git a/tests/data/test_config1_tx.toml b/tests/data/test_config1_tx.toml
new file mode 100644
index 00000000..0f2dc0f4
--- /dev/null
+++ b/tests/data/test_config1_tx.toml
@@ -0,0 +1,25 @@
+station_id = 'TEST1'
+logger_type = 'CR1000X'
+nodata = ['-999', 'NAN'] # if one is a string, all must be strings
+number_of_booms = 1 #1-boom = promice, 2-boom = gc-net
+latitude = 79.91
+longitude = 24.09
+
+['test_raw_transmitted1.txt']
+format = 'TX'
+skiprows = 0
+hygroclip_t_offset = 0
+dsr_eng_coef = 12.2
+usr_eng_coef = 11.15
+dlr_eng_coef = 10.28
+ulr_eng_coef = 8.24
+pt_z_coef = 0.3999
+pt_z_p_coef = 982.4
+pt_z_factor = 2.5
+pt_antifreeze = 50
+boom_azimuth = 0
+columns = ['time','rec','p_u','t_u','SKIP_5','rh_u','wspd_u','wdir_u','dsr',
+           'usr','dlr','ulr','t_rad','z_boom_u','z_stake','z_pt',
+           't_i_1','t_i_2','t_i_3','t_i_4','t_i_5','t_i_6','t_i_7',
+           't_i_8','tilt_x','tilt_y','gps_time','gps_lat','gps_lon',
+           'gps_alt','gps_hdop','fan_dc_u','batt_v']
diff --git a/tests/e2e/test_get_l2.py b/tests/e2e/test_get_l2.py
new file mode 100644
--- /dev/null
+++ b/tests/e2e/test_get_l2.py
@@ -0,0 +1,135 @@
+import datetime
+import json
+import tempfile
+import unittest
+from importlib import metadata
+from pathlib import Path
+
+import pandas as pd
+import xarray as xr
+
+from pypromice.process.get_l2 import get_l2
+
+TEST_ROOT = Path(__file__).parent.parent
+TEST_DATA_ROOT_PATH = TEST_ROOT / "data"
+
+
+class GetL2TestCase(unittest.TestCase):
+    def test_get_l2_tx(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            output_path = Path(tmpdirname) / "output"
+            config_file = TEST_DATA_ROOT_PATH / "test_config1_tx.toml"
+
+            aws = get_l2(
+                config_file=config_file.as_posix(),
+                inpath=TEST_DATA_ROOT_PATH.as_posix(),
+                outpath=output_path,
+                variables=None,
+                metadata=None,
+            )
+
+            station_id = "TEST1"
+            expected_dir = output_path / station_id
+            expected_dataset_paths = {
+                "nc_hour": expected_dir / f"{station_id}_hour.nc",
+                "csv_hour": expected_dir / f"{station_id}_hour.csv",
+            }
+            self.assertSetEqual({expected_dir}, set(output_path.iterdir()))
+            self.assertSetEqual(
+                set(expected_dataset_paths.values()), set(expected_dir.iterdir())
+            )
+
+    def test_get_l2_raw(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            output_path = Path(tmpdirname) / "output"
+            config_file = TEST_DATA_ROOT_PATH / "test_config1_raw.toml"
+
+            # date_created is written as naive local time, so bracket the
+            # processing with naive local timestamps
+            t_start = datetime.datetime.now()
+            aws = get_l2(
+                config_file=config_file.as_posix(),
+                inpath=TEST_DATA_ROOT_PATH.as_posix(),
+                outpath=output_path,
+                variables=None,
+                metadata=None,
+            )
+            t_end = datetime.datetime.now()
+
+            station_id = "TEST1"
+            expected_dir = output_path / station_id
+            expected_dataset_paths = {
+                "nc_hour": expected_dir / f"{station_id}_hour.nc",
+                "csv_hour": expected_dir / f"{station_id}_hour.csv",
+                "nc_10min": expected_dir / f"{station_id}_10min.nc",
+                "csv_10min": expected_dir / f"{station_id}_10min.csv",
+            }
+            self.assertSetEqual({expected_dir}, set(output_path.iterdir()))
+            self.assertSetEqual(
+                set(expected_dataset_paths.values()), set(expected_dir.iterdir())
+            )
+            # Test output file format. Only the attributes are needed; close
+            # the files before the temporary directory is removed.
+            with xr.open_dataset(expected_dataset_paths["nc_hour"]) as dataset:
+                attrs_hour = dict(dataset.attrs)
+            with xr.open_dataset(expected_dataset_paths["nc_10min"]) as dataset:
+                attrs_10min = dict(dataset.attrs)
+
+        self.assertEqual(
+            attrs_10min["id"],
+            f"dk.geus.promice.station.{station_id}.L2.10min",
+        )
+        self.assertEqual(
+            attrs_hour["id"],
+            f"dk.geus.promice.station.{station_id}.L2.hourly",
+        )
+        self.assertEqual(
+            attrs_10min["title"],
+            f"AWS measurements from {station_id} processed to level 2. 10min average.",
+        )
+        self.assertEqual(
+            attrs_hour["title"],
+            f"AWS measurements from {station_id} processed to level 2. Hourly average.",
+        )
+
+        for attrs in [attrs_hour, attrs_10min]:
+            self.assertEqual(attrs["format"], "raw")
+            self.assertEqual(attrs["station_id"], station_id)
+            self.assertIsInstance(attrs["date_created"], str)
+            date_created = pd.to_datetime(attrs["date_created"])
+            self.assertLessEqual(t_start, date_created)
+            self.assertLessEqual(date_created, t_end)
+            self.assertEqual(attrs["date_issued"], attrs["date_created"])
+            self.assertEqual(attrs["date_modified"], attrs["date_created"])
+            self.assertEqual(attrs["processing_level"], "Level 2")
+            self.assertEqual(
+                attrs["institution"],
+                "Geological Survey of Denmark and Greenland (GEUS)",
+            )
+            source_decoded = json.loads(attrs["source"])
+            self.assertSetEqual(
+                {"pypromice", "l0_config_file", "l0_data_root"},
+                set(source_decoded.keys()),
+            )
+            self.assertEqual(
+                source_decoded["pypromice"],
+                metadata.version("pypromice"),
+            )
+            config_file_name, config_hash = source_decoded["l0_config_file"].rsplit(
+                ":", 1
+            )
+            self.assertEqual(config_file_name, config_file.name)
+            data_root_name, data_root_hash = source_decoded["l0_data_root"].rsplit(
+                ":", 1
+            )
+            self.assertEqual(data_root_name, TEST_DATA_ROOT_PATH.name)
+            self.assertNotEqual(
+                config_hash,
+                "unknown",
+                "This test will fail while the commit is dirty",
+            )
+            self.assertNotEqual(
+                data_root_hash,
+                "unknown",
+                "This test will fail while the commit is dirty",
+            )