Merge branch 'feature/surface-heights-and-thermistor-depths' of https://github.com/GEUS-Glaciology-and-Climate/pypromice into feature/surface-heights-and-thermistor-depths
BaptisteVandecrux committed Aug 14, 2024
2 parents f0d7ec9 + 49ca5b5 commit b156dde
Showing 5 changed files with 108 additions and 47 deletions.
18 changes: 5 additions & 13 deletions .github/workflows/process_test.yml
@@ -11,7 +11,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v4
with:
python-version: "3.8"
python-version: "3.10"
- name: Checkout repo
uses: actions/checkout@v3
with:
@@ -41,23 +41,15 @@ jobs:
for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do
python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2.py -c $GITHUB_WORKSPACE/aws-l0/tx/config/$i.toml -i $GITHUB_WORKSPACE/aws-l0/tx -o $GITHUB_WORKSPACE/out/L0toL2/
done
# - name: Run L0 to L2 processing
# env:
# TEST_STATION: KAN_U HUM
# shell: bash
# run: |
# mkdir $GITHUB_WORKSPACE/out/L2toL3/
# for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do
# python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2tol3.py -i $GITHUB_WORKSPACE/out/L0toL2/$i/$i_hour.nc -o $GITHUB_WORKSPACE/out/L2toL3 -t 60min
# done
- name: Run L0 to L3 processing
- name: Run L2 to L3 processing
env:
TEST_STATION: KAN_U HUM
shell: bash
run: |
mkdir $GITHUB_WORKSPACE/out/L0toL3/
mkdir $GITHUB_WORKSPACE/out/L2toL3/
for i in $(echo ${{ env.TEST_STATION }} | tr ' ' '\n'); do
python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2.py -c $GITHUB_WORKSPACE/aws-l0/tx/config/$i.toml -i $GITHUB_WORKSPACE/aws-l0/tx -o $GITHUB_WORKSPACE/out/L2/
echo ${i}_hour.nc
python3 $GITHUB_WORKSPACE/main/src/pypromice/process/get_l2tol3.py -c $GITHUB_WORKSPACE/aws-l0/metadata/station_configurations/ -i $GITHUB_WORKSPACE/out/L0toL2/${i}/${i}_hour.nc -o $GITHUB_WORKSPACE/out/L2toL3/
done
- name: Upload test output
uses: actions/upload-artifact@v3
2 changes: 1 addition & 1 deletion .github/workflows/unit_test.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python_version: ['3.8','3.9','3.10']
python_version: ['3.10', '3.11']
steps:
- name: Install Python
uses: actions/setup-python@v4
75 changes: 49 additions & 26 deletions src/pypromice/qc/persistence.py
@@ -9,20 +9,40 @@
"persistence_qc",
"find_persistent_regions",
"count_consecutive_persistent_values",
"count_consecutive_true",
"get_duration_consecutive_true",
]

logger = logging.getLogger(__name__)

# period is given in hours, 2 persistent 10 min values will be flagged if period < 0.333
DEFAULT_VARIABLE_THRESHOLDS = {
"t": {"max_diff": 0.0001, "period": 2},
"p": {"max_diff": 0.0001, "period": 2},
'gps_lat_lon':{"max_diff": 0.000001, "period": 6}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
'gps_alt':{"max_diff": 0.0001, "period": 6},
't_rad':{"max_diff": 0.0001, "period": 2},
"rh": {"max_diff": 0.0001, "period": 2}, # gets special handling to allow constant 100%
"wspd": {"max_diff": 0.0001, "period": 6},
"t_i": {"max_diff": 0.0001, "period": 2},
"t_u": {"max_diff": 0.0001, "period": 2},
"t_l": {"max_diff": 0.0001, "period": 2},
"p_i": {"max_diff": 0.0001, "period": 2},
# "p_u": {"max_diff": 0.0001, "period": 2},
# "p_l": {"max_diff": 0.0001, "period": 2},
"gps_lat_lon": {
"max_diff": 0.000001,
"period": 6,
}, # gets special handling to remove simultaneously constant gps_lat and gps_lon
"gps_alt": {"max_diff": 0.0001, "period": 6},
"t_rad": {"max_diff": 0.0001, "period": 2},
"rh_i": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"rh_u": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"rh_l": {
"max_diff": 0.0001,
"period": 2,
}, # gets special handling to allow constant 100%
"wspd_i": {"max_diff": 0.0001, "period": 6},
"wspd_u": {"max_diff": 0.0001, "period": 6},
"wspd_l": {"max_diff": 0.0001, "period": 6},
}
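
To make these thresholds concrete, here is a small usage sketch (illustrative only, not part of the committed changes). persistence_qc passes each entry's period (in hours) and max_diff on to find_persistent_regions, so a record that stays within max_diff of the same value for the period or longer should get masked; the station values below are invented, and min_repeats is the keyword name used in the unit tests:

import numpy as np
import pandas as pd

from pypromice.qc.persistence import find_persistent_regions

# Hourly air temperature with a flat-lined stretch at -20.0 degC
time = pd.date_range("2023-01-25", periods=10, freq="h")
t_u = pd.Series(
    [-19.5, -19.8, -20.0, -20.0, -20.0, -20.0, -20.0, -19.6, -19.1, -18.7],
    index=time,
)

# Default "t" entry: max_diff=0.0001, period=2 hours
mask = find_persistent_regions(t_u, min_repeats=2, max_diff=0.0001)
t_u[mask] = np.nan  # mirrors df.loc[mask, v] = np.nan in persistence_qc;
                    # only readings that have been constant for 2 h or more are removed
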


@@ -65,7 +85,7 @@ def persistence_qc(
logger.info(f"Running persistence_qc using {variable_thresholds}")

for k in variable_thresholds.keys():
if k in ['t','p','rh','wspd','wdir', 'z_boom']:
if k in ["t", "p", "rh", "wspd", "wdir", "z_boom"]:
var_all = [
k + "_u",
k + "_l",
@@ -79,29 +99,28 @@
for v in var_all:
if v in df:
mask = find_persistent_regions(df[v], period, max_diff)
if 'rh' in v:
mask = mask & (df[v]<99)
if "rh" in v:
mask = mask & (df[v] < 99)
n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, v] = np.nan
elif v == 'gps_lat_lon':
mask = (
find_persistent_regions(df['gps_lon'], period, max_diff)
& find_persistent_regions(df['gps_lat'], period, max_diff)
)
elif v == "gps_lat_lon":
mask = find_persistent_regions(
df["gps_lon"], period, max_diff
) & find_persistent_regions(df["gps_lat"], period, max_diff)

n_masked = mask.sum()
n_samples = len(mask)
logger.debug(
f"Applying persistent QC in {v}. Filtering {n_masked}/{n_samples} samples"
)
# setting outliers to NaN
df.loc[mask, 'gps_lon'] = np.nan
df.loc[mask, 'gps_lat'] = np.nan
df.loc[mask, "gps_lon"] = np.nan
df.loc[mask, "gps_lat"] = np.nan

# Back to xarray, and re-assign the original attrs
ds_out = df.to_xarray()
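
As a side note (again illustrative, not part of the committed changes), the rh special case above keeps plausible saturation readings: a humidity sensor pinned at 100 % is not treated as stuck, so the persistence mask is only retained where the reading is below 99 %:

import pandas as pd

from pypromice.qc.persistence import find_persistent_regions

time = pd.date_range("2023-01-25", periods=8, freq="h")
rh_u = pd.Series([100.0] * 4 + [85.0] * 4, index=time)

mask = find_persistent_regions(rh_u, min_repeats=2, max_diff=0.0001)
mask = mask & (rh_u < 99)  # the constant 100 % stretch is kept;
                           # a flat-lined 85 % stretch is still flagged
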
@@ -133,19 +152,21 @@ def count_consecutive_persistent_values(
) -> pd.Series:
diff = data.ffill().diff().abs() # forward filling all NaNs!
mask: pd.Series = diff < max_diff
return duration_consecutive_true(mask)
return get_duration_consecutive_true(mask)


def duration_consecutive_true(
def get_duration_consecutive_true(
series: pd.Series,
) -> pd.Series:
"""
From a boolean series, calculates the duration, in hours, of the periods with connective true values.
From a boolean series, calculates the duration, in hours, of the periods with consecutive true values.
The first value will be set to NaN, as it is not possible to calculate the duration of a single value.
Examples
--------
>>> duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
pd.Series([0, 1, 0, 0, 1, 2, 3, 0, 1])
>>> get_duration_consecutive_true(pd.Series([False, True, False, False, True, True, True, False, True]))
pd.Series([np.nan, 1, 0, 0, 1, 2, 3, 0, 1])
Parameters
----------
@@ -158,9 +179,11 @@ def duration_consecutive_true(
Pandas Series with values representing the duration, in hours, of consecutive true values.
"""
# assert series.dtype == bool
cumsum = ((series.index - series.index[0]).total_seconds()/3600).to_series(index=series.index)
is_first = series.astype("int").diff() == 1
offset = (is_first * cumsum).replace(0, np.nan).ffill().fillna(0)
delta_time = (series.index.diff().total_seconds() / 3600).to_series(
index=series.index
)
cumsum = delta_time.cumsum()
offset = (is_first * (cumsum - delta_time)).replace(0, np.nan).ffill().fillna(0)

return (cumsum - offset) * series
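
For reference, a minimal sketch (invented data, not part of the committed changes) of how the reworked get_duration_consecutive_true behaves on an irregularly sampled boolean series; the expected values follow from the implementation above:

import pandas as pd

from pypromice.qc.persistence import get_duration_consecutive_true

# Samples at 00:00, 01:00 and 03:00; the last two are True
index = pd.to_datetime(["2023-01-25 00:00", "2023-01-25 01:00", "2023-01-25 03:00"])
flags = pd.Series([False, True, True], index=index)

durations = get_duration_consecutive_true(flags)
# Expected: [nan, 1.0, 3.0]: the first sample has no preceding time step,
# and each True sample reports the hours elapsed since the last False sample
print(durations.tolist())
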
Empty file added tests/unit/qc/__init__.py
@@ -1,9 +1,9 @@
import unittest

import numpy as np
import numpy.testing
import pandas as pd

from pypromice.qc import persistence
from pypromice.qc.persistence import find_persistent_regions


@@ -32,7 +32,9 @@ def _test_1_hour_repeat(self, index: int):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_no_persistent_period(self):
time_range = pd.date_range(
@@ -46,7 +48,9 @@ def test_no_persistent_period(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_persistent_period_longer_than_period_threshold(self):
time_range = pd.date_range(
@@ -66,7 +70,9 @@ def test_persistent_period_longer_than_period_threshold(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_period_threshold_longer_than_persistent_period(self):
time_range = pd.date_range(
@@ -83,7 +89,9 @@ def test_period_threshold_longer_than_persistent_period(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_persistent_period_at_the_end(self):
time_range = pd.date_range(
@@ -101,7 +109,9 @@ def test_persistent_period_at_the_end(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_dont_filter_nan_values(self):
time_range = pd.date_range(
@@ -123,7 +133,9 @@ def test_dont_filter_nan_values(self):
input_series, min_repeats=min_repeats, max_diff=0.001
)

pd.testing.assert_series_equal(expected_output, persistent_mask, check_names=False)
pd.testing.assert_series_equal(
expected_output, persistent_mask, check_names=False
)

def test_series_with_nan_values_between_persistent_values(self):
time_range = pd.date_range(
@@ -145,6 +157,40 @@ def test_series_with_nan_values_between_persistent_values(self):

np.testing.assert_equal(expected_mask, output_mask)

def test_get_duration_consecutive_true(self):
delta_time_hours = np.random.random(24) * 2
time_range = pd.to_datetime("2023-01-25") + pd.to_timedelta(
delta_time_hours.cumsum(), unit="h"
)
values = time_range == False
values[0:2] = True
values[6] = True
values[10:14] = True
values[-3:] = True
series = pd.Series(index=time_range, data=values)

duration_consecutive_true = persistence.get_duration_consecutive_true(series)

self.assertTrue(
np.isnan(duration_consecutive_true[0]), "The first index should be ignored"
)
np.testing.assert_almost_equal(
duration_consecutive_true[1],
delta_time_hours[1],
)
np.testing.assert_almost_equal(
duration_consecutive_true[6],
delta_time_hours[6],
)
np.testing.assert_almost_equal(
duration_consecutive_true[10:14],
delta_time_hours[10:14].cumsum(),
)
np.testing.assert_almost_equal(
duration_consecutive_true[-3:],
delta_time_hours[-3:].cumsum(),
)


if __name__ == "__main__":
unittest.main()
