Updated attribute metadata
* Added test case for output metadata
* Added JSON-formatted source string to attributes
* Added title string to attributes
* Updated ID string to include level
ladsmund committed Aug 16, 2024
1 parent 92c4184 commit 083bffe
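
Taken together, the changes produce dataset attributes along these lines (a sketch using the TEST1 station from the new test case; the hash and version values are placeholders):

# Illustrative attribute values after this commit:
ds.attrs["id"]      # "dk.geus.promice.station.TEST1.L2.hourly"
ds.attrs["title"]   # "AWS measurements from TEST1 processed to level 2. Hourly average."
ds.attrs["source"]  # '{"pypromice": "...", "l0_config_file": "test_config1_tx.toml:<hash>", "l0_data_root": "data:<hash>"}'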
Showing 6 changed files with 315 additions and 14 deletions.
15 changes: 14 additions & 1 deletion src/pypromice/process/aws.py
@@ -19,7 +19,7 @@
from pypromice.process.L1toL2 import toL2
from pypromice.process.L2toL3 import toL3
from pypromice.process import write, load, utilities
from pypromice.process.resample import resample_dataset
from pypromice.utilities.git import get_commit_hash_and_check_dirty

pd.set_option("display.precision", 2)
xr.set_options(keep_attrs=True)
@@ -60,6 +60,19 @@ def __init__(
self.vars = pypromice.resources.load_variables(var_file)
self.meta = pypromice.resources.load_metadata(meta_file)

config_hash = get_commit_hash_and_check_dirty(Path(config_file))
config_source_string = f"{Path(config_file).name}:{config_hash}"
inpath_hash = get_commit_hash_and_check_dirty(Path(inpath))
inpath_source_string = f"{Path(inpath).name}:{inpath_hash}"

source_dict = dict(
pypromice = metadata.version("pypromice"),
l0_config_file = config_source_string,
l0_data_root = inpath_source_string,
)
self.meta["source"] = json.dumps(source_dict)


# Load config file
L0 = self.loadL0()
self.L0=[]
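The new source attribute is a JSON document, so downstream consumers can recover its components; a minimal round-trip sketch (the version string and file names are illustrative):

import json

source = json.loads('{"pypromice": "1.3.0", "l0_config_file": "config.toml:unknown", "l0_data_root": "aws-l0:unknown"}')
file_name, commit = source["l0_config_file"].rsplit(":", 1)
assert file_name == "config.toml"
assert commit == "unknown"  # 'unknown' is the fallback for dirty or untracked inputs
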
41 changes: 28 additions & 13 deletions src/pypromice/process/write.py
@@ -6,7 +6,6 @@
import datetime
import logging
import os
from importlib import metadata

import numpy as np
import pandas as pd
@@ -310,20 +309,41 @@ def addMeta(ds, meta):
sample_rate = "monthly"

if "station_id" in ds.attrs.keys():
ds.attrs["id"] = (
"dk.geus.promice.station." + ds.attrs["station_id"] + "." + sample_rate
)
id_components = [
"dk",
"geus",
"promice",
"station",
ds.attrs["station_id"],
ds.attrs["level"],
sample_rate,
]
ds.attrs["id"] = ".".join(id_components)
else:
ds.attrs["id"] = (
"dk.geus.promice.site." + ds.attrs["site_id"] + "." + sample_rate
)
id_components = [
"dk",
"geus",
"promice",
"site",
ds.attrs["site_id"],
ds.attrs["level"],
sample_rate,
]
ds.attrs["id"] = ".".join(id_components)

ds.attrs["history"] = "Generated on " + datetime.datetime.utcnow().isoformat()
ds.attrs["date_created"] = str(datetime.datetime.now().isoformat())
ds.attrs["date_modified"] = ds.attrs["date_created"]
ds.attrs["date_issued"] = ds.attrs["date_created"]
ds.attrs["date_metadata_modified"] = ds.attrs["date_created"]
ds.attrs["processing_level"] = ds.attrs["level"].replace("L", "level ")
ds.attrs["processing_level"] = ds.attrs["level"].replace("L", "Level ")

title_string_format = "AWS measurements from {station_id} processed to {processing_level}. {sample_rate} average."
ds.attrs["title"] = title_string_format.format(
station_id=ds.attrs["station_id"],
processing_level=ds.attrs["processing_level"].lower(),
sample_rate=sample_rate.capitalize(),
)

if "lat" in ds.keys():
lat_min = ds["lat"].min().values
@@ -384,11 +404,6 @@ def addMeta(ds, meta):
ds.attrs["time_coverage_start"] = str(ds["time"][0].values)
ds.attrs["time_coverage_end"] = str(ds["time"][-1].values)

try:
ds.attrs["source"] = "pypromice v" + str(metadata.version("pypromice"))
except:
ds.attrs["source"] = "pypromice"

# https://www.digi.com/resources/documentation/digidocs/90001437-13/reference/r_iso_8601_duration_format.htm
try:
ds.attrs["time_coverage_duration"] = str(
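The truncated hunk above sets time_coverage_duration in the ISO 8601 duration format linked in the comment; a minimal sketch of producing such a string, assuming pandas' Timedelta.isoformat() is the mechanism used:

import pandas as pd

# ISO 8601 duration between two coverage bounds
start = pd.Timestamp("2024-06-01")
end = pd.Timestamp("2024-07-01")
print(pd.Timedelta(end - start).isoformat())  # "P30DT0H0M0S"
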
68 changes: 68 additions & 0 deletions src/pypromice/utilities/git.py
@@ -0,0 +1,68 @@
import subprocess
import os
from pathlib import Path

import logging

logger = logging.getLogger(__name__)


def get_commit_hash_and_check_dirty(file_path) -> str:
    """Return the latest commit hash of the repository containing file_path.

    Returns 'unknown' if the file has uncommitted changes, is not under
    version control, or if the git call fails.
    """
    repo_path = Path(file_path).parent

try:
# Ensure the file path is relative to the repository
relative_file_path = os.path.relpath(file_path, repo_path)

        # Get the latest commit hash of the repository containing the file
        commit_hash = (
            subprocess.check_output(
                [
                    "git",
                    "-C",
                    repo_path,
                    "log",
                    "-n",
                    "1",
                    "--pretty=format:%H",
                ],
                stderr=subprocess.STDOUT,
            )
            .strip()
            .decode("utf-8")
        )

# Check if the file is dirty (has uncommitted changes)
diff_output = (
subprocess.check_output(
["git", "-C", repo_path, "diff", "--", relative_file_path],
stderr=subprocess.STDOUT,
)
.strip()
.decode("utf-8")
)

# If diff_output is not empty, the file has uncommitted changes
is_dirty = len(diff_output) > 0

if is_dirty:
logger.warning(f"Warning: The file {file_path} is dirty compared to the last commit. {commit_hash}")
return 'unknown'
if commit_hash == "":
logger.warning(f"Warning: The file {file_path} is not under version control.")
return 'unknown'

print(f"Commit hash: {commit_hash}")
return commit_hash
except subprocess.CalledProcessError as e:
logger.warning(f"Error: {e.output.decode('utf-8')}")
return 'unknown'


if __name__ == "__main__":
    # Ad-hoc manual checks, guarded so importing the module has no side effects;
    # the paths are machine-specific examples.
    get_commit_hash_and_check_dirty("/Users/maclu/data/aws-l0/README.md")
    get_commit_hash_and_check_dirty("/Users/maclu/data/aws-l0/dirty.txt")
49 changes: 49 additions & 0 deletions tests/data/test_config1_raw.toml
@@ -0,0 +1,49 @@
station_id = 'TEST1'
logger_type = 'CR1000X'
nodata = ['-999', 'NAN'] # if one is a string, all must be strings
number_of_booms = 1 #1-boom = promice, 2-boom = gc-net
latitude = 79.91
longitude = 24.09

['test_raw1.txt']
format = 'raw'
skiprows = 4
hygroclip_t_offset = 40
dsr_eng_coef = 14.01
usr_eng_coef = 12.72
dlr_eng_coef = 11.08
ulr_eng_coef = 11.42
pt_z_coef = 0.39571
pt_z_p_coef = 1022.5
pt_z_factor = 2.5
pt_antifreeze = 50
boom_azimuth = 0
columns = ['time','rec','SKIP_3','p_u','t_u','SKIP_6','rh_u','wspd_u','wdir_u','wdir_std_u',
'dsr','usr','dlr','ulr','t_rad','z_boom_u','z_boom_q_u','z_stake','z_stake_q','z_pt',
't_i_1','t_i_2','t_i_3','t_i_4','t_i_5','t_i_6','t_i_7',
't_i_8','tilt_x','tilt_y','gps_time','gps_lat','gps_lon',
'gps_alt','gps_geoid','SKIP_36','gps_q','gps_numsat','gps_hdop',
't_log','fan_dc_u','batt_v_ini','batt_v']

['test_raw_SlimTableMem1.txt']
format = 'STM'
skiprows = 4
latitude = 79.83
longitude = 25.17
hygroclip_t_offset = 40
dsr_eng_coef = 12.55
usr_eng_coef = 12.52
dlr_eng_coef = 13.73
ulr_eng_coef = 10.43
pt_z_coef = 0.4369
pt_z_p_coef = 1005.9
pt_z_factor = 2.5
pt_antifreeze = 50
boom_azimuth = 0
columns = ['time','rec','min_y','p_u','t_u','SKIP_6','rh_u','wspd_u','wdir_u',
'wd_std_u','dsr','usr','dlr','ulr','t_rad','z_boom_u',
'z_boom_q_u','z_stake','z_stake_q','z_pt','t_i_1','t_i_2',
't_i_3','t_i_4','t_i_5','t_i_6','t_i_7','t_i_8','tilt_x',
'tilt_y','gps_time','gps_lat','gps_lon','gps_alt',
'gps_geoid','SKIP_36','gps_q','gps_numsats','gps_hdop',
't_log','fan_dc_u','batt_v_ss','batt_v']
25 changes: 25 additions & 0 deletions tests/data/test_config1_tx.toml
@@ -0,0 +1,25 @@
station_id = 'TEST1'
logger_type = 'CR1000X'
nodata = ['-999', 'NAN'] # if one is a string, all must be strings
number_of_booms = 1 #1-boom = promice, 2-boom = gc-net
latitude = 79.91
longitude = 24.09

['test_raw_transmitted1.txt']
format = 'TX'
skiprows = 0
hygroclip_t_offset = 0
dsr_eng_coef = 12.2
usr_eng_coef = 11.15
dlr_eng_coef = 10.28
ulr_eng_coef = 8.24
pt_z_coef = 0.3999
pt_z_p_coef = 982.4
pt_z_factor = 2.5
pt_antifreeze = 50
boom_azimuth = 0
columns = ['time','rec','p_u','t_u','SKIP_5','rh_u','wspd_u','wdir_u','dsr',
'usr','dlr','ulr','t_rad','z_boom_u','z_stake','z_pt',
't_i_1','t_i_2','t_i_3','t_i_4','t_i_5','t_i_6','t_i_7',
't_i_8','tilt_x','tilt_y','gps_time','gps_lat','gps_lon',
'gps_alt','gps_hdop','fan_dc_u','batt_v']
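
Both TOML configs follow the same layout: station-level keys at the top, then one table per L0 input file. A sketch of reading one with the standard library (tomllib needs Python 3.11+; the path assumes the repository root as working directory):

import tomllib

with open("tests/data/test_config1_raw.toml", "rb") as f:
    config = tomllib.load(f)

print(config["station_id"])                     # TEST1
print(config["test_raw1.txt"]["format"])        # raw
print(len(config["test_raw1.txt"]["columns"]))  # 43 column names
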
131 changes: 131 additions & 0 deletions tests/e2e/test_get_l2.py
@@ -0,0 +1,131 @@
import datetime
import json
import tempfile
import unittest
from importlib import metadata
from pathlib import Path

import pandas as pd
import xarray as xr

from pypromice.process.get_l2 import get_l2

TEST_ROOT = Path(__file__).parent.parent
TEST_DATA_ROOT_PATH = TEST_ROOT / "data"


class GetL2TestCase(unittest.TestCase):
def test_get_l2_tx(self):
with tempfile.TemporaryDirectory() as tmpdirname:
output_path = Path(tmpdirname) / "output"
config_file = TEST_DATA_ROOT_PATH / "test_config1_tx.toml"

aws = get_l2(
config_file=config_file.as_posix(),
inpath=TEST_DATA_ROOT_PATH.as_posix(),
outpath=output_path,
variables=None,
metadata=None,
)

station_id = "TEST1"
expected_dir = output_path / station_id
expected_dataset_paths = {
"nc_hour": expected_dir / f"{station_id}_hour.nc",
"csv_hour": expected_dir / f"{station_id}_hour.csv",
}
self.assertSetEqual({expected_dir}, set(output_path.iterdir()))
self.assertSetEqual(
set(expected_dataset_paths.values()), set(expected_dir.iterdir())
)

def test_get_l2_raw(self):
with tempfile.TemporaryDirectory() as tmpdirname:
output_path = Path(tmpdirname) / "output"
config_file = TEST_DATA_ROOT_PATH / "test_config1_raw.toml"

aws = get_l2(
config_file=config_file.as_posix(),
inpath=TEST_DATA_ROOT_PATH.as_posix(),
outpath=output_path,
variables=None,
metadata=None,
)

station_id = "TEST1"
expected_dir = output_path / station_id
expected_dataset_paths = {
"nc_hour": expected_dir / f"{station_id}_hour.nc",
"csv_hour": expected_dir / f"{station_id}_hour.csv",
"nc_10min": expected_dir / f"{station_id}_10min.nc",
"csv_10min": expected_dir / f"{station_id}_10min.csv",
}
self.assertSetEqual({expected_dir}, set(output_path.iterdir()))
self.assertSetEqual(
set(expected_dataset_paths.values()), set(expected_dir.iterdir())
)
# Test output file format
dataset_hour = xr.open_dataset(expected_dataset_paths["nc_hour"])
dataset_10min = xr.open_dataset(expected_dataset_paths["nc_10min"])

self.assertEqual(
dataset_10min.attrs["id"],
f"dk.geus.promice.station.{station_id}.L2.10min",
)
self.assertEqual(
dataset_hour.attrs["id"],
f"dk.geus.promice.station.{station_id}.L2.hourly",
)
self.assertEqual(
dataset_10min.attrs["title"],
f"AWS measurements from {station_id} processed to level 2. 10min average.",
)
self.assertEqual(
dataset_hour.attrs["title"],
f"AWS measurements from {station_id} processed to level 2. Hourly average.",
)

            # date_created is generated with datetime.now(), so compare in local time
            t0 = datetime.datetime.now()
for dataset in [dataset_hour, dataset_10min]:
self.assertEqual(dataset.attrs["format"], "raw")
self.assertEqual(dataset.attrs["station_id"], station_id)
self.assertIsInstance(dataset.attrs["date_created"], str)
date_created = pd.to_datetime(dataset.attrs["date_created"])
self.assertLess(t0 - date_created, datetime.timedelta(seconds=1))
self.assertEqual(
dataset.attrs["date_issued"], dataset.attrs["date_created"]
)
self.assertEqual(
dataset.attrs["date_modified"], dataset.attrs["date_created"]
)
self.assertEqual(
dataset.attrs["processing_level"],
"Level 2",
)
self.assertEqual(
dataset.attrs["institution"],
"Geological Survey of Denmark and Greenland (GEUS)",
)
source_decoded = json.loads(dataset.attrs["source"])
self.assertSetEqual(
{"pypromice", "l0_config_file", "l0_data_root"},
set(source_decoded.keys()),
)
self.assertEqual(
source_decoded["pypromice"],
metadata.version("pypromice"),
)
config_file_name, config_hash = source_decoded["l0_config_file"].rsplit(
":", 1
)
self.assertEqual(
config_file_name,
config_file.name,
)
data_root_name, data_root_hash = source_decoded["l0_data_root"].rsplit(":", 1)
self.assertEqual(
data_root_name,
TEST_DATA_ROOT_PATH.name,
)
                self.assertNotEqual(config_hash, "unknown", "This test fails while the repository has uncommitted changes")
                self.assertNotEqual(data_root_hash, "unknown", "This test fails while the repository has uncommitted changes")
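
The new e2e tests run under plain unittest discovery; a minimal sketch (assumes the repository root as working directory and a clean checkout, since dirty inputs hash to 'unknown'):

import unittest

suite = unittest.defaultTestLoader.discover("tests/e2e")
unittest.TextTestRunner(verbosity=2).run(suite)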
