refactored zenodo into repository sub package

matthiasprobst · Dec 13, 2023 · 1a60b3b · 1a60b3b
1 parent 03b61a8
commit 1a60b3b
Show file tree

Hide file tree

Showing 19 changed files with 529 additions and 451 deletions.
diff --git a/h5rdmtoolbox/conventions/core.py b/h5rdmtoolbox/conventions/core.py
@@ -9,7 +9,6 @@
 import sys
 import warnings
 import yaml
-import zenodo_search as zsearch
 from pydoc import locate
 from typing import Union, List, Dict, Tuple
 
@@ -21,6 +20,9 @@
 from .utils import json2yaml
 from .._repr import make_italic, make_bold
 from .._user import UserDir
+from ..repository.zenodo.utils import recid_from_doi_or_redid
+from ..repository import zenodo
+
 
 CV_DIR = UserDir['conventions']
 
@@ -622,17 +624,17 @@ def from_json(filename: Union[str, pathlib.Path], overwrite: bool = False) -> Co
     return Convention.from_json(filename, overwrite=overwrite)
 
 
-def from_zenodo(doi,
+def from_zenodo(doi_or_recid: str,
                 name: str = None,
                 overwrite: bool = False,
                 force_download: bool = False) -> Convention:
     """Download a YAML file from a zenodo repository
 
     Parameters
     ----------
-    doi: str
+    doi_or_recid: str
         DOI of the zenodo repository. Can be a short DOI or a full DOI or the URL (e.g. 10156750 or
-        10.5281/zenodo.10156750 or https://doi.org/10.5281/zenodo.10156750)
+        10.5281/zenodo.10156750 or https://doi.org/10.5281/zenodo.10156750 or only the record id, e.g. 10156750)
     name: str=None
         Name to be used for the filename. If None, the name is taken from the zenodo record.
     overwrite: bool = False
@@ -646,29 +648,27 @@ def from_zenodo(doi,
         The convention object
     """
     # depending on the input, try to convert to a valid DOI:
-    # doi = zsearch.utils.parse_doi(doi)
-    doi = str(doi)
+    # resolve the Zenodo record id from the given DOI, URL or plain record id:
+
+    rec_id = recid_from_doi_or_redid(doi_or_recid)
+
     if name is None:
-        filename = UserDir['cache'] / f'{doi.replace("/", "_").replace(":", "_")}'
+        filename = UserDir['cache'] / f'{rec_id}'
     else:
-        filename = UserDir['cache'] / f'{doi.replace("/", "_").replace(":", "_")}/{name}'
+        filename = UserDir['cache'] / f'{rec_id}/{name}'
 
     if not filename.exists() or force_download:
-        record = zsearch.search_doi(doi, parse_doi=False)
+        record = zenodo.ZenodoRecord(rec_id)
+
+        filenames = record.get_filenames()
         if name is None:
-            matches = [file for file in record.files if file['key'].rsplit('.', 1)[-1] == 'yaml']
-            if len(matches) == 0:
-                raise ValueError(f'No file with suffix ".yaml" found in record {doi}')
+            matches = [file for file in filenames if pathlib.Path(file).suffix == '.yaml']
         else:
-            matches = [file for file in record.files if file['key'] == name]
+            matches = [file for file in filenames if file == name]
             if len(matches) == 0:
-                raise ValueError(f'No file with name "{name}" found in record {doi}')
-
-        file0 = zsearch.ZenodoFile(matches[0])
-        if file0['key'].rsplit('.', 1)[-1] != 'yaml':
-            raise ValueError(f'The file with name "{name}" is not a YAML file')
+                raise ValueError(f'No file with name "{name}" found in record {doi_or_recid}')
 
-        _filename = file0.download(destination_dir=filename.parent)
+        _filename = record.download_file(matches[0], target_folder=filename.parent)
         shutil.move(_filename, filename)
 
     return from_yaml(filename, overwrite=overwrite)
diff --git a/h5rdmtoolbox/conventions/standard_names/table.py b/h5rdmtoolbox/conventions/standard_names/table.py
@@ -1,17 +1,17 @@
 """Standard name table module"""
+import h5py
 import json
 import pathlib
-import shutil
-import warnings
-from datetime import datetime, timezone
-from typing import List, Union, Dict, Tuple
-
-import h5py
 import pint
+import warnings
 import yaml
 from IPython.display import display, HTML
+from datetime import datetime, timezone
+from typing import List, Union, Dict, Tuple
 
 from h5rdmtoolbox._user import UserDir
+from h5rdmtoolbox.database import GroupDB
+from h5rdmtoolbox.repository import zenodo
 from h5rdmtoolbox.utils import generate_temporary_filename, download_file, is_xml_file
 from . import cache
 from . import consts
@@ -20,7 +20,6 @@
 from .. import logger
 from ..utils import dict2xml, get_similar_names_ratio
 from ... import errors
-from h5rdmtoolbox.database import GroupDB
 
 __this_dir__ = pathlib.Path(__file__).parent
 
@@ -667,14 +666,15 @@ def from_gitlab(url: str,
         return snt
 
     @staticmethod
-    def from_zenodo(doi: str) -> "StandardNameTable":
+    def from_zenodo(doi_or_recid: str) -> "StandardNameTable":
         """Download a standard name table from Zenodo based on its DOI.
 
 
         Parameters
         ----------
-        doi: str
-            The DOI. It can hav the following formats:
+        doi_or_recid: str
+            The DOI or record id. It can have the following formats:
+            - 8266929
             - 10.5281/zenodo.8266929
             - https://doi.org/10.5281/zenodo.8266929
             - https://zenodo.org/record/8266929
@@ -693,29 +693,27 @@ def from_zenodo(doi: str) -> "StandardNameTable":
         -----
         Zenodo API: https://vlp-new.ur.de/developers/#using-access-tokens
         """
-        doi = str(doi)
-        if doi in cache.snt:
-            return cache.snt[doi]
-
-        if 'zenodo' in doi:
-            doi = doi.split('/')[-1]
-
-        yaml_filename = UserDir['standard_name_tables'] / f'{doi}.yaml'
-
-        if not yaml_filename.exists():
-            import zenodo_search as zsearch
-            zenrec = zsearch.search_doi(doi)
-            zenfile = zenrec.files[0]
 
-            yaml_name = zenrec.files[0]['key']
-            if not yaml_name.endswith('.yaml'):
-                raise ValueError(f'Expected yaml file, got {yaml_name}')
-            _yaml_filename = zenfile.download()
-            shutil.move(_yaml_filename, yaml_filename)
+        # parse input:
+        rec_id = zenodo.utils.recid_from_doi_or_redid(doi_or_recid)
+        if rec_id in cache.snt:
+            return cache.snt[rec_id]
+
+        z = zenodo.ZenodoRecord(rec_id)
+        assert z.exists()
+
+        filenames = z.download_files(target_folder=UserDir['standard_name_tables'])
+        assert len(filenames) == 1
+        filename = filenames[0]
+        assert filename.suffix == '.yaml'
+        new_filename = UserDir['standard_name_tables'] / f'{rec_id}.yaml'
+        if new_filename.exists():
+            new_filename.unlink()
+        yaml_filename = filename.rename(UserDir['standard_name_tables'] / f'{rec_id}.yaml')
         snt = StandardNameTable.from_yaml(yaml_filename)
-        snt._meta.update(dict(zenodo_doi=doi))
+        snt._meta.update(dict(zenodo_doi=doi_or_recid))
 
-        cache.snt[doi] = snt
+        cache.snt[rec_id] = snt
         return snt
 
     @staticmethod

diff --git a/h5rdmtoolbox/database/__init__.py b/h5rdmtoolbox/database/__init__.py
@@ -1,67 +1,7 @@
 from . import lazy
-from ..utils import create_tbx_logger
 from .hdfdb import FileDB, FilesDB, GroupDB
+from ..utils import create_tbx_logger
 
 logger = create_tbx_logger('database')
 
-# class Folder:
-#     """Folder with HDF5 files as a database
-#
-#     Parameters
-#     ----------
-#     folder : pathlib.Path
-#         folder with HDF5 files
-#     pattern : str, optional
-#         pattern to search for, by default '*.hdf'
-#     rec : bool, optional
-#         search recursively for hdf files within the given folder, by default True
-#     """
-#
-#     def __init__(self, folder: pathlib.Path, pattern='*.hdf', rec: bool = True):
-#         folder = pathlib.Path(folder)
-#         if not folder.is_dir():
-#             raise ValueError(f'{folder} is not a directory')
-#         self.folder = folder
-#         if rec:
-#             self.filenames = list(self.folder.rglob(pattern))
-#         else:
-#             self.filenames = list(self.folder.glob(pattern))
-#         self.rec = rec
-#
-#     def __repr__(self):
-#         return f'<{self.__class__.__name__} (root="{self.folder}", nfiles={len(self)}, recursive={self.rec})>'
-#
-#     def __len__(self):
-#         return len(self.filenames)
-#
-#     def __getitem__(self, item) -> File:
-#         return File(self.filenames[item])
-#
-#     def find(self,
-#              flt: Union[Dict, str],
-#              objfilter=None, rec: bool = True,
-#              ignore_attribute_error: bool = False):
-#         """Find"""
-#         with Files(self.filenames, file_instance=File) as h5:
-#             return h5.find(flt, objfilter, rec, ignore_attribute_error)
-#
-#     def find_one(self,
-#                  flt: Union[Dict, str],
-#                  objfilter=None,
-#                  rec: bool = True,
-#                  ignore_attribute_error: bool = False):
-#         """Find one occurrence"""
-#         with Files(self.filenames, file_instance=File) as h5:
-#             return h5.find_one(flt, objfilter, rec, ignore_attribute_error)
-#
-#     def find_one_per_file(self,
-#                           flt: Union[Dict, str],
-#                           objfilter=None,
-#                           rec: bool = True,
-#                           ignore_attribute_error: bool = False):
-#         """Find one occurrence"""
-#         with Files(self.filenames, file_instance=File) as h5:
-#             return h5.find_one_per_file(flt, objfilter, rec, ignore_attribute_error)
-
-
 __all__ = ['logger', 'lazy', 'FileDB', 'FilesDB', 'GroupDB']