From c2aa111f1ad7f7a1d3a6d23b677803866f2bc0e1 Mon Sep 17 00:00:00 2001 From: Rasmus Handberg Date: Thu, 28 Jan 2021 13:31:40 +0100 Subject: [PATCH] Include CBVs and dependencies for Ensemble --- VERSION | 2 +- dataval/release.py | 150 ++++++++++++++++-- dataval/utilities.py | 17 +- run_package_release.py | 65 +++++++- ...tess-s0006-c0120-a114-v5-tasoc_cbv.fits.gz | 3 + ...tess-s0006-c1800-a114-v5-tasoc_cbv.fits.gz | 3 + tests/test_run_package_release.py | 40 ++++- 7 files changed, 251 insertions(+), 29 deletions(-) create mode 100644 tests/input/ready_for_release/cbv-prepare/tess-s0006-c0120-a114-v5-tasoc_cbv.fits.gz create mode 100644 tests/input/ready_for_release/cbv-prepare/tess-s0006-c1800-a114-v5-tasoc_cbv.fits.gz diff --git a/VERSION b/VERSION index a4dbc43..8f97f00 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -master-v1.3.2 \ No newline at end of file +master-v1.4 \ No newline at end of file diff --git a/dataval/release.py b/dataval/release.py index 4aff84d..46cb12c 100644 --- a/dataval/release.py +++ b/dataval/release.py @@ -6,15 +6,63 @@ .. codeauthor:: Rasmus Handberg """ +import numpy as np import logging import warnings import re import os import shutil +import tempfile +import contextlib from astropy.io import fits from astropy.wcs import WCS, FITSFixedWarning from .utilities import find_tpf_files, get_filehash +#-------------------------------------------------------------------------------------------------- +@contextlib.contextmanager +def temporary_filename(**kwargs): + """ + Context that introduces a temporary file. + + Creates a temporary file, yields its name, and upon context exit, deletes it. + (In contrast, tempfile.NamedTemporaryFile() provides a 'file' object and + deletes the file as soon as that file object is closed, so the temporary file + cannot be safely re-opened by another library or process.) + + Yields: + The name of the temporary file. + """ + if 'delete' in kwargs: + raise ValueError("DELETE keyword can not be used") + try: + f = tempfile.NamedTemporaryFile(delete=False, **kwargs) + tmp_name = f.name + f.close() + yield tmp_name + finally: + if os.path.exists(tmp_name): + os.remove(tmp_name) + +#-------------------------------------------------------------------------------------------------- +def atomic_copy(src, dst): + """ + Copy file (using shutil.copy2), but with higher likelihood of being an atomic operation. + + This is done by first copying to a temp file and then renaming this file to the final name. + This is only atmic on POSIX systems. + """ + if os.path.exists(dst): + raise FileExistsError(dst) + + with temporary_filename(dir=os.path.dirname(dst), suffix='.tmp') as tmp: + try: + shutil.copy2(src, tmp) + os.rename(tmp, dst) + except: # noqa: E722 + if os.path.exists(dst): + os.remove(dst) + raise + #-------------------------------------------------------------------------------------------------- def check_fits_changes(fname, fname_modified, allow_header_value_changes=None): """ @@ -170,27 +218,34 @@ def fix_file(row, input_folder=None, check_corrector=None, force_version=None, t raise Exception("CORRECTOR") # Do we really need to modify the FITS file? - modification_needed = True # FORCE modification check! + openfile_needed = True # FORCE modification check! fix_wcs = False if dataval > 0: - modification_needed = True + openfile_needed = True if cadence == 120 and version <= 5: - modification_needed = True + openfile_needed = True fix_wcs = True + # We need to open the ensemble files to find the lightcurve dependencies: + if corrector == 'ens': + openfile_needed = True + # Find the starid of the TPF which was used to create this lightcurve: if row['datasource'] == 'tpf': - dependency = row['starid'] + dependency_tpf = row['starid'] elif row['datasource'].startswith('tpf:'): - dependency = int(row['datasource'][4:]) + dependency_tpf = int(row['datasource'][4:]) else: - dependency = None + dependency_tpf = None + + # Placeholder for dependencies between lightcurves: + dependency_lc = None # Damn, it looks like a modification is needed: allow_change = [] - if modification_needed: + if openfile_needed: logger.debug("Opening FITS file: %s", fname) modification_needed = False @@ -198,18 +253,18 @@ def fix_file(row, input_folder=None, check_corrector=None, force_version=None, t if tpf_rootdir is None: raise Exception("You need to provide a TPF_ROOTDIR") # Find out what the - if dependency is None: + if dependency_tpf is None: raise Exception("We can't fix WCSs of FFI targets!") # Find the original TPF file and extract the WCS from its headers: - tpf_file = find_tpf_files(tpf_rootdir, starid=dependency, sector=sector, camera=camera, ccd=ccd, cadence=cadence) + tpf_file = find_tpf_files(tpf_rootdir, starid=dependency_tpf, sector=sector, camera=camera, ccd=ccd, cadence=cadence) if len(tpf_file) != 1: - raise Exception("Could not find TPF file: starid=%d, sector=%d" % (dependency, sector)) + raise Exception("Could not find TPF file: starid=%d, sector=%d" % (dependency_tpf, sector)) # Extract the FITS header with the correct WCS: with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=FITSFixedWarning) wcs_header = WCS(header=fits.getheader(tpf_file[0], extname='APERTURE'), relax=True).to_header(relax=True) - shutil.copy(fname, fname_original) + atomic_copy(fname, fname_original) with fits.open(fname_original, mode='readonly', memmap=True) as hdu: prihdr = hdu[0].header @@ -231,6 +286,10 @@ def fix_file(row, input_folder=None, check_corrector=None, force_version=None, t allow_change += ['TDISP2'] hdu['ENSEMBLE'].header['TDISP2'] = 'E26.17' + if corrector == 'ens': + # Pick out the list of TIC-IDs used to build ensemble: + dependency_lc = list(hdu['ENSEMBLE'].data['TIC']) + if fix_wcs: logger.info("%s: Changing WCS", fname) modification_needed = True @@ -269,6 +328,7 @@ def fix_file(row, input_folder=None, check_corrector=None, force_version=None, t 'sector': row['sector'], 'camera': row['camera'], 'ccd': row['ccd'], + 'cbv_area': row['cbv_area'], 'cadence': cadence, 'lightcurve': row['lightcurve'], 'dataval': dataval, @@ -276,5 +336,71 @@ def fix_file(row, input_folder=None, check_corrector=None, force_version=None, t 'version': version, 'filesize': filesize, 'filehash': filehash, - 'dependency': dependency + 'dependency_tpf': dependency_tpf, + 'dependency_lc': dependency_lc + } + +#-------------------------------------------------------------------------------------------------- +def process_cbv(fname, input_folder, force_version=None): + + m = re.match(r'^tess-s(\d{4})-c(\d{4})-a(\d{3})-v(\d+)-tasoc_cbv\.fits\.gz$', os.path.basename(fname)) + if m is None: + raise Exception("CBV file does not have the correct file name format!") + fname_sector = int(m.group(1)) + fname_cadence = int(m.group(2)) + fname_cbvarea = int(m.group(3)) + fname_camera = int(m.group(3)[0]) + fname_ccd = int(m.group(3)[1]) + fname_version = int(m.group(4)) + + # Open the FITS file and check the headers: + with fits.open(fname, mode='readonly', memmap=True) as hdu: + hdr = hdu[0].header + sector = hdr['SECTOR'] + camera = hdr['CAMERA'] + ccd = hdr['CCD'] + data_rel = hdr['DATA_REL'] + version = hdr['VERSION'] + cbv_area = hdr['CBV_AREA'] + cadence = hdr['CADENCE'] + + time = np.asarray(hdu[1].data['TIME']) + cadence_time = int(np.round(86400*np.median(np.diff(time)))) + + # Check that the filename and headers are consistent: + if sector != fname_sector: + raise Exception("SECTOR does not match filename.") + if camera != fname_camera: + raise Exception("CAMERA does not match filename.") + if ccd != fname_ccd: + raise Exception("CCD does not match filename.") + if cadence != fname_cadence: + raise Exception("CADENCE does not match filename.") + if cadence != cadence_time: + raise Exception("CADENCE does not match TIME.") + if cbv_area != fname_cbvarea: + raise Exception("CBV_AREA does not match filename.") + if version != fname_version: + raise Exception("VERSION does not match filename.") + + if force_version is not None and version != force_version: + raise Exception("Version mismatch!") + + path = os.path.relpath(fname, input_folder).replace('\\', '/') + + # Extract information from final file: + filesize = os.path.getsize(fname) + filehash = get_filehash(fname) + + return { + 'path': path, + 'sector': sector, + 'camera': camera, + 'ccd': ccd, + 'cbv_area': cbv_area, + 'cadence': cadence, + 'datarel': data_rel, + 'version': version, + 'filesize': filesize, + 'filehash': filehash } diff --git a/dataval/utilities.py b/dataval/utilities.py index db0dd91..e98f49c 100644 --- a/dataval/utilities.py +++ b/dataval/utilities.py @@ -68,21 +68,24 @@ def loadPickle(fname): return pickle.load(fid) #-------------------------------------------------------------------------------------------------- -def find_tpf_files(rootdir, starid=None, sector=None, camera=None, ccd=None, cadence=None, findmax=None): +def find_tpf_files(rootdir, starid=None, sector=None, camera=None, ccd=None, cadence=None, + findmax=None): """ Search directory recursively for TESS Target Pixel Files. Parameters: - rootdir (string): Directory to search recursively for TESS TPF files. - starid (integer or None, optional): Only return files from the given TIC number. + rootdir (str): Directory to search recursively for TESS TPF files. + starid (int, optional): Only return files from the given TIC number. If ``None``, files from all TIC numbers are returned. - sector (integer or None, optional): Only return files from the given sector. + sector (int, optional): Only return files from the given sector. If ``None``, files from all sectors are returned. - camera (integer or None, optional): Only return files from the given camera number (1-4). + camera (int or None, optional): Only return files from the given camera number (1-4). If ``None``, files from all cameras are returned. - ccd (integer or None, optional): Only return files from the given CCD number (1-4). + ccd (int, optional): Only return files from the given CCD number (1-4). If ``None``, files from all CCDs are returned. - findmax (integer or None, optional): Maximum number of files to return. + cadence (int, optional): Only return files from the given cadence (20 or 120). + If ``None``, files from all cadences are returned. + findmax (int, optional): Maximum number of files to return. If ``None``, return all files. Note: diff --git a/run_package_release.py b/run_package_release.py index bcecab5..ef8f5f3 100644 --- a/run_package_release.py +++ b/run_package_release.py @@ -11,6 +11,7 @@ import sys import logging import os +import glob import sqlite3 from contextlib import closing import functools @@ -18,7 +19,7 @@ from tqdm import tqdm from dataval import __version__ from dataval.utilities import TqdmLoggingHandler, CounterFilter -from dataval.release import fix_file, regex_fileend +from dataval.release import fix_file, regex_fileend, process_cbv #-------------------------------------------------------------------------------------------------- def main(): @@ -126,6 +127,7 @@ def main(): todolist.ccd, todolist.sector, todolist.datasource, + todolist.cbv_area, diagnostics_corr.lightcurve, datavalidation_corr.dataval FROM todolist @@ -187,8 +189,11 @@ def main(): filehash TEXT NOT NULL, datarel INTEGER NOT NULL, dataval INTEGER NOT NULL, - dependency INTEGER + cbv_area INTEGER NOT NULL, + dependency_tpf INTEGER, + dependency_lc TEXT );""") + # Create the settings table if it doesn't exist: cursor.execute("SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='settings';") if cursor.fetchone()[0] == 0: @@ -202,6 +207,22 @@ def main(): corrector, force_version ]) + + # For CBV corrected data, create a separate table for the CBV files: + if corrector == 'cbv': + cursor.execute("DROP TABLE IF EXISTS release_cbv;") + cursor.execute("""CREATE TABLE release_cbv ( + path TEXT NOT NULL PRIMARY KEY, + sector INTEGER NOT NULL, + camera INTEGER NOT NULL, + ccd INTEGER NOT NULL, + cbv_area INTEGER NOT NULL, + cadence INTEGER NOT NULL, + datarel INTEGER NOT NULL, + filesize INTEGER NOT NULL, + filehash TEXT NOT NULL + );""") + conn.commit() # Ensure that we are not running an existing file with new settings: @@ -214,6 +235,31 @@ def main(): logger.error("Inconsistent VERSION provided") return 2 + # For CBV corrected data, first we find all the CBV files, add these to their own + # release table, and keep a record of which ones were found, so we can check below + # if all the lightcurves can be associated with a single CBV file: + cbvs = [] + if corrector == 'cbv': + cbv_files = glob.glob(os.path.join(input_folder, 'cbv-prepare', '*-tasoc_cbv.fits.gz')) + for fname in cbv_files: + info = process_cbv(fname, input_folder, force_version=force_version) + logger.debug(info) + cursor.execute("INSERT INTO release_cbv (path, sector, camera, ccd, cbv_area, cadence, datarel, filesize, filehash) VALUES (?,?,?,?,?,?,?,?,?);", [ + info['path'], + info['sector'], + info['camera'], + info['ccd'], + info['cbv_area'], + info['cadence'], + info['datarel'], + info['filesize'], + info['filehash'] + ]) + cbvs.append((info['sector'], info['cadence'], info['cbv_area'])) + cbvs = set(cbvs) + print(cbvs) + + # Figure out which files needs to be processed: cursor.execute("SELECT priority FROM release;") already_processed = set([row[0] for row in cursor.fetchall()]) not_yet_processed = [] @@ -237,19 +283,30 @@ def main(): inserted = 0 for info in tqdm(m(fix_file_wrapper, not_yet_processed), total=numfiles, **tqdm_settings): logger.debug(info) - cursor.execute("INSERT INTO release (priority, lightcurve, starid, sector, camera, ccd, cadence, filesize, filehash, datarel, dataval, dependency) VALUES (?,?,?,?,?,?,?,?,?,?,?,?);", [ + + # For CBV corrected data, check that the corresponding CBV was also found: + if corrector == 'cbv' and (info['sector'], info['cadence'], info['cbv_area']) not in cbvs: + raise Exception("CBV not found") + + dependency_lc = info['dependency_lc'] + if dependency_lc is not None: + dependency_lc = ','.join([str(t) for t in info['dependency_lc']]) + + cursor.execute("INSERT INTO release (priority, lightcurve, starid, sector, camera, ccd, cbv_area, cadence, filesize, filehash, datarel, dataval, dependency_tpf, dependency_lc) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?);", [ info['priority'], info['lightcurve'], info['starid'], info['sector'], info['camera'], info['ccd'], + info['cbv_area'], info['cadence'], info['filesize'], info['filehash'], info['datarel'], info['dataval'], - info['dependency'] + info['dependency_tpf'], + dependency_lc ]) inserted += 1 diff --git a/tests/input/ready_for_release/cbv-prepare/tess-s0006-c0120-a114-v5-tasoc_cbv.fits.gz b/tests/input/ready_for_release/cbv-prepare/tess-s0006-c0120-a114-v5-tasoc_cbv.fits.gz new file mode 100644 index 0000000..e38cbf7 --- /dev/null +++ b/tests/input/ready_for_release/cbv-prepare/tess-s0006-c0120-a114-v5-tasoc_cbv.fits.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3d296595f9d5f059603cbff9e146114e7c3e16126290e087e077d10ec73cca6 +size 1182066 diff --git a/tests/input/ready_for_release/cbv-prepare/tess-s0006-c1800-a114-v5-tasoc_cbv.fits.gz b/tests/input/ready_for_release/cbv-prepare/tess-s0006-c1800-a114-v5-tasoc_cbv.fits.gz new file mode 100644 index 0000000..b098d11 --- /dev/null +++ b/tests/input/ready_for_release/cbv-prepare/tess-s0006-c1800-a114-v5-tasoc_cbv.fits.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd7b259fd19df32e6b233b482d2122522487f7b698cd748ff1359904258b73be +size 69352 diff --git a/tests/test_run_package_release.py b/tests/test_run_package_release.py index 8ead4a1..6e54dae 100644 --- a/tests/test_run_package_release.py +++ b/tests/test_run_package_release.py @@ -107,6 +107,28 @@ def test_run_release(PRIVATE_INPUT_DIR, jobs, corrector): else: assert antal == 12 + if corrector == 'cbv': + cursor.execute("SELECT * FROM release_cbv ORDER BY cadence DESC;") + cbvs = cursor.fetchall() + assert len(cbvs) == 2 + row = dict(cbvs[0]) + assert row['path'] == 'cbv-prepare/tess-s0006-c1800-a114-v5-tasoc_cbv.fits.gz' + assert row['sector'] == 6 + assert row['camera'] == 1 + assert row['ccd'] == 1 + assert row['cadence'] == 1800 + assert row['cbv_area'] == 114 + assert row['datarel'] == 8 + + row = dict(cbvs[1]) + assert row['path'] == 'cbv-prepare/tess-s0006-c0120-a114-v5-tasoc_cbv.fits.gz' + assert row['sector'] == 6 + assert row['camera'] == 1 + assert row['ccd'] == 1 + assert row['cadence'] == 120 + assert row['cbv_area'] == 114 + assert row['datarel'] == 8 + cursor.execute("SELECT * FROM release;") for row in cursor.fetchall(): fpath = os.path.join(PRIVATE_INPUT_DIR, 'ready_for_release', row['lightcurve']) @@ -119,13 +141,13 @@ def test_run_release(PRIVATE_INPUT_DIR, jobs, corrector): # Test the dependency: if row['cadence'] > 200: - assert row['dependency'] is None + assert row['dependency_tpf'] is None else: - assert row['dependency'] is not None + assert row['dependency_tpf'] is not None if row['starid'] == 4256961: # This is a secondary target - assert row['dependency'] == 4255638 + assert row['dependency_tpf'] == 4255638 else: # These are "main" targets: - assert row['dependency'] == row['starid'] + assert row['dependency_tpf'] == row['starid'] with fits.open(fpath, mode='readonly', memmap=True) as hdu: hdr = hdu[0].header @@ -140,12 +162,20 @@ def test_run_release(PRIVATE_INPUT_DIR, jobs, corrector): # Check the fix of invalid header in ENSEMBLE extension: if corrector == 'ensemble': + # Check the fix of invalid header in ENSEMBLE extension: assert hdu['ENSEMBLE'].header['TDISP2'] != 'E' + # Check that the stars used to build ensemble were stored: + dependency_lc = set([int(t) for t in row['dependency_lc'].split(',')]) + assert set(hdu['ENSEMBLE'].data['TIC']) == dependency_lc + + elif corrector == 'cbv': + assert hdu[1].header['CBV_AREA'] == row['cbv_area'] + # Check the modification of the WCS solution in 120s data: if row['cadence'] == 120: tpf_file = find_tpf_files(tpf_rootdir, - starid=row['dependency'], + starid=row['dependency_tpf'], sector=row['sector'], camera=row['camera'], ccd=row['ccd'],