From c1c09b4038144ce8e97377bcdb52a5114228d6d6 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 28 Oct 2024 16:33:00 -0400 Subject: [PATCH 1/2] Do specify filter="tar" when extracting tars The situation is complicated by the fact that shutil.unpack_archive would also pass the filter along when extracting .zip and would crash, so we need to pass it only when extracting tar. Also, that kwargs option was added in 3.12, started to be enforced in 3.13, and would be required in 3.14 --- heudiconv/parser.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/heudiconv/parser.py b/heudiconv/parser.py index e6ef72d1..3b510522 100644 --- a/heudiconv/parser.py +++ b/heudiconv/parser.py @@ -9,6 +9,7 @@ import os.path as op import re import shutil +import sys from types import ModuleType from typing import Optional @@ -22,7 +23,18 @@ _VCS_REGEX = r"%s\.(?:git|gitattributes|svn|bzr|hg)(?:%s|$)" % (op.sep, op.sep) -_UNPACK_FORMATS = tuple(sum((x[1] for x in shutil.get_unpack_formats()), [])) + +def _get_unpack_formats() -> dict[str, bool]: + """For each extension return if it is a tar""" + out = {} + for _, exts, d in shutil.get_unpack_formats(): + for e in exts: + out[e] = bool(re.search(r"\btar\b", d.lower())) + return out + + +_UNPACK_FORMATS = _get_unpack_formats() +_TAR_UNPACK_FORMATS = tuple(k for k, is_tar in _UNPACK_FORMATS.items() if is_tar) @docstring_parameter(_VCS_REGEX) @@ -114,7 +126,7 @@ def get_extracted_dicoms(fl: Iterable[str]) -> ItemsView[Optional[str], list[str # needs sorting to keep the generated "session" label deterministic for _, t in enumerate(sorted(fl)): - if not t.endswith(_UNPACK_FORMATS): + if not t.endswith(tuple(_UNPACK_FORMATS)): sessions[None].append(t) continue @@ -127,7 +139,14 @@ def get_extracted_dicoms(fl: Iterable[str]) -> ItemsView[Optional[str], list[str # check content and sanitize permission bits before extraction os.chmod(tmpdir, mode=0o700) - shutil.unpack_archive(t, extract_dir=tmpdir) + # For tar (only!) 
starting with 3.12 we should provide filter + # (enforced in 3.14) on how to filter/safe-guard filenames. + kws = {} + if sys.version_info >= (3, 12) and t.endswith(_TAR_UNPACK_FORMATS): + # Allow for a user-workaround if would be desired + # see e.g. https://docs.python.org/3.12/library/tarfile.html#extraction-filters + kws["filter"] = os.environ.get("HEUDICONV_TAR_FILTER", "tar") + shutil.unpack_archive(t, extract_dir=tmpdir, **kws) archive_content = list(find_files(regex=".*", topdir=tmpdir)) From bf404c88bfe8cf13ae91d3e8ded269bf968f594b Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 28 Oct 2024 17:52:04 -0400 Subject: [PATCH 2/2] Relax typing check for invocation of shutil there --- heudiconv/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heudiconv/parser.py b/heudiconv/parser.py index 3b510522..d21605b9 100644 --- a/heudiconv/parser.py +++ b/heudiconv/parser.py @@ -141,12 +141,12 @@ def get_extracted_dicoms(fl: Iterable[str]) -> ItemsView[Optional[str], list[str os.chmod(tmpdir, mode=0o700) # For tar (only!) starting with 3.12 we should provide filter # (enforced in 3.14) on how to filter/safe-guard filenames. - kws = {} + kws: dict[str, str] = {} if sys.version_info >= (3, 12) and t.endswith(_TAR_UNPACK_FORMATS): # Allow for a user-workaround if would be desired # see e.g. https://docs.python.org/3.12/library/tarfile.html#extraction-filters kws["filter"] = os.environ.get("HEUDICONV_TAR_FILTER", "tar") - shutil.unpack_archive(t, extract_dir=tmpdir, **kws) + shutil.unpack_archive(t, extract_dir=tmpdir, **kws) # type: ignore[arg-type] archive_content = list(find_files(regex=".*", topdir=tmpdir))