Skip to content

Commit

Permalink
refactored zenodo into repository sub package
Browse files Browse the repository at this point in the history
  • Loading branch information
matthiasprobst committed Dec 13, 2023
1 parent 03b61a8 commit 1a60b3b
Show file tree
Hide file tree
Showing 19 changed files with 529 additions and 451 deletions.
38 changes: 19 additions & 19 deletions h5rdmtoolbox/conventions/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import sys
import warnings
import yaml
import zenodo_search as zsearch
from pydoc import locate
from typing import Union, List, Dict, Tuple

Expand All @@ -21,6 +20,9 @@
from .utils import json2yaml
from .._repr import make_italic, make_bold
from .._user import UserDir
from ..repository.zenodo.utils import recid_from_doi_or_redid
from ..repository import zenodo


CV_DIR = UserDir['conventions']

Expand Down Expand Up @@ -622,17 +624,17 @@ def from_json(filename: Union[str, pathlib.Path], overwrite: bool = False) -> Co
return Convention.from_json(filename, overwrite=overwrite)


def from_zenodo(doi,
def from_zenodo(doi_or_recid: str,
name: str = None,
overwrite: bool = False,
force_download: bool = False) -> Convention:
"""Download a YAML file from a zenodo repository
Parameters
----------
doi: str
doi_or_recid: str
DOI of the zenodo repository. Can be a short DOI or a full DOI or the URL (e.g. 10156750 or
10.5281/zenodo.10156750 or https://doi.org/10.5281/zenodo.10156750)
10.5281/zenodo.10156750 or https://doi.org/10.5281/zenodo.10156750 or only the record id, e.g. 10156750)
name: str=None
Name to be used for the filename. If None, the name is taken from the zenodo record.
overwrite: bool = False
Expand All @@ -646,29 +648,27 @@ def from_zenodo(doi,
The convention object
"""
# depending on the input, try to convert to a valid DOI:
# doi = zsearch.utils.parse_doi(doi)
doi = str(doi)
# parse record id:

rec_id = recid_from_doi_or_redid(doi_or_recid)

if name is None:
filename = UserDir['cache'] / f'{doi.replace("/", "_").replace(":", "_")}'
filename = UserDir['cache'] / f'{rec_id}'
else:
filename = UserDir['cache'] / f'{doi.replace("/", "_").replace(":", "_")}/{name}'
filename = UserDir['cache'] / f'{rec_id}/{name}'

if not filename.exists() or force_download:
record = zsearch.search_doi(doi, parse_doi=False)
record = zenodo.ZenodoRecord(rec_id)

filenames = record.get_filenames()
if name is None:
matches = [file for file in record.files if file['key'].rsplit('.', 1)[-1] == 'yaml']
if len(matches) == 0:
raise ValueError(f'No file with suffix ".yaml" found in record {doi}')
matches = [file for file in filenames if pathlib.Path(file).suffix == '.yaml']
else:
matches = [file for file in record.files if file['key'] == name]
matches = [file for file in filenames if file == name]
if len(matches) == 0:
raise ValueError(f'No file with name "{name}" found in record {doi}')

file0 = zsearch.ZenodoFile(matches[0])
if file0['key'].rsplit('.', 1)[-1] != 'yaml':
raise ValueError(f'The file with name "{name}" is not a YAML file')
raise ValueError(f'No file with name "{name}" found in record {doi_or_recid}')

_filename = file0.download(destination_dir=filename.parent)
_filename = record.download_file(matches[0], target_folder=filename.parent)
shutil.move(_filename, filename)

return from_yaml(filename, overwrite=overwrite)
58 changes: 28 additions & 30 deletions h5rdmtoolbox/conventions/standard_names/table.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
"""Standard name table module"""
import h5py
import json
import pathlib
import shutil
import warnings
from datetime import datetime, timezone
from typing import List, Union, Dict, Tuple

import h5py
import pint
import warnings
import yaml
from IPython.display import display, HTML
from datetime import datetime, timezone
from typing import List, Union, Dict, Tuple

from h5rdmtoolbox._user import UserDir
from h5rdmtoolbox.database import GroupDB
from h5rdmtoolbox.repository import zenodo
from h5rdmtoolbox.utils import generate_temporary_filename, download_file, is_xml_file
from . import cache
from . import consts
Expand All @@ -20,7 +20,6 @@
from .. import logger
from ..utils import dict2xml, get_similar_names_ratio
from ... import errors
from h5rdmtoolbox.database import GroupDB

__this_dir__ = pathlib.Path(__file__).parent

Expand Down Expand Up @@ -667,14 +666,15 @@ def from_gitlab(url: str,
return snt

@staticmethod
def from_zenodo(doi: str) -> "StandardNameTable":
def from_zenodo(doi_or_recid: str) -> "StandardNameTable":
"""Download a standard name table from Zenodo based on its DOI.
Parameters
----------
doi: str
The DOI. It can hav the following formats:
doi_or_recid: str
The DOI or record id. It can have the following formats:
- 8266929
- 10.5281/zenodo.8266929
- https://doi.org/10.5281/zenodo.8266929
- https://zenodo.org/record/8266929
Expand All @@ -693,29 +693,27 @@ def from_zenodo(doi: str) -> "StandardNameTable":
-----
Zenodo API: https://vlp-new.ur.de/developers/#using-access-tokens
"""
doi = str(doi)
if doi in cache.snt:
return cache.snt[doi]

if 'zenodo' in doi:
doi = doi.split('/')[-1]

yaml_filename = UserDir['standard_name_tables'] / f'{doi}.yaml'

if not yaml_filename.exists():
import zenodo_search as zsearch
zenrec = zsearch.search_doi(doi)
zenfile = zenrec.files[0]

yaml_name = zenrec.files[0]['key']
if not yaml_name.endswith('.yaml'):
raise ValueError(f'Expected yaml file, got {yaml_name}')
_yaml_filename = zenfile.download()
shutil.move(_yaml_filename, yaml_filename)
# parse input:
rec_id = zenodo.utils.recid_from_doi_or_redid(doi_or_recid)
if rec_id in cache.snt:
return cache.snt[rec_id]

z = zenodo.ZenodoRecord(rec_id)
assert z.exists()

filenames = z.download_files(target_folder=UserDir['standard_name_tables'])
assert len(filenames) == 1
filename = filenames[0]
assert filename.suffix == '.yaml'
new_filename = UserDir['standard_name_tables'] / f'{rec_id}.yaml'
if new_filename.exists():
new_filename.unlink()
yaml_filename = filename.rename(UserDir['standard_name_tables'] / f'{rec_id}.yaml')
snt = StandardNameTable.from_yaml(yaml_filename)
snt._meta.update(dict(zenodo_doi=doi))
snt._meta.update(dict(zenodo_doi=doi_or_recid))

cache.snt[doi] = snt
cache.snt[rec_id] = snt
return snt

@staticmethod
Expand Down
62 changes: 1 addition & 61 deletions h5rdmtoolbox/database/__init__.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,7 @@
from . import lazy
from ..utils import create_tbx_logger
from .hdfdb import FileDB, FilesDB, GroupDB
from ..utils import create_tbx_logger

logger = create_tbx_logger('database')

# class Folder:
# """Folder with HDF5 files as a database
#
# Parameters
# ----------
# folder : pathlib.Path
# folder with HDF5 files
# pattern : str, optional
# pattern to search for, by default '*.hdf'
# rec : bool, optional
# search recursively for hdf files within the given folder, by default True
# """
#
# def __init__(self, folder: pathlib.Path, pattern='*.hdf', rec: bool = True):
# folder = pathlib.Path(folder)
# if not folder.is_dir():
# raise ValueError(f'{folder} is not a directory')
# self.folder = folder
# if rec:
# self.filenames = list(self.folder.rglob(pattern))
# else:
# self.filenames = list(self.folder.glob(pattern))
# self.rec = rec
#
# def __repr__(self):
# return f'<{self.__class__.__name__} (root="{self.folder}", nfiles={len(self)}, recursive={self.rec})>'
#
# def __len__(self):
# return len(self.filenames)
#
# def __getitem__(self, item) -> File:
# return File(self.filenames[item])
#
# def find(self,
# flt: Union[Dict, str],
# objfilter=None, rec: bool = True,
# ignore_attribute_error: bool = False):
# """Find"""
# with Files(self.filenames, file_instance=File) as h5:
# return h5.find(flt, objfilter, rec, ignore_attribute_error)
#
# def find_one(self,
# flt: Union[Dict, str],
# objfilter=None,
# rec: bool = True,
# ignore_attribute_error: bool = False):
# """Find one occurrence"""
# with Files(self.filenames, file_instance=File) as h5:
# return h5.find_one(flt, objfilter, rec, ignore_attribute_error)
#
# def find_one_per_file(self,
# flt: Union[Dict, str],
# objfilter=None,
# rec: bool = True,
# ignore_attribute_error: bool = False):
# """Find one occurrence"""
# with Files(self.filenames, file_instance=File) as h5:
# return h5.find_one_per_file(flt, objfilter, rec, ignore_attribute_error)


__all__ = ['logger', 'lazy', 'FileDB', 'FilesDB', 'GroupDB']
Loading

0 comments on commit 1a60b3b

Please sign in to comment.