From e0620f9c87b174cf1a80b42f6f18bec4940c85a3 Mon Sep 17 00:00:00 2001 From: Edvard Rejthar Date: Thu, 14 Mar 2024 15:17:56 +0100 Subject: [PATCH] invert, output --- README.md | 18 ++++++++---- deduplidog/deduplidog.py | 59 ++++++++++++++++++++++++++++++++++------ deduplidog/utils.py | 12 ++++++++ tests.py | 36 ++++++++++++++++++------ 4 files changed, 102 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 17a2da1..e04662a 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ from deduplidog import Deduplidog Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True) ``` +This command produced the following output: + ``` Find files by size, ignoring: date, crc32 Duplicates from the work dir at 'home' would be (if execute were True) renamed (prefixed with ✓). @@ -59,16 +61,17 @@ Number of originals: 38 * /home/user/duplicates/foo.txt /media/disk/origs/foo.txt 🔨home: renamable - 📄media: DATE WARNING + a day -Affectable: 38/38 -Affected size: 59.9 kB + 📄media: DATE WARNING + a day 🛟skipped on warning +Affectable: 37/38 +Affected size: 56.9 kB Warnings: 1 ``` -We found out all the files in the *duplicates* folder seem to be useless but one. It's date is earlier than the original one. See with full log. +We found out all the files in the *duplicates* folder seem to be useless but one. It's date is earlier than the original one. The life buoy icon would prevent any action. To suppress this, let's turn on `set_both_to_older_date`. See with full log. ```python3 -Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO) +Deduplidog("/home/user/duplicates", "/media/disk/origs", + ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO) ``` ``` @@ -94,7 +97,8 @@ Affected size: 59.9 kB You see, the log is at the most brief, yet transparent form. The files to be affected at the work folder are prepended with the 🔨 icon whereas those affected at the original folder uses 📄 icon. We might add `execute=True` parameter to perform the actions. Or use `bashify=True` to inspect. ```python3 -Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True) +Deduplidog("/home/user/duplicates", "/media/disk/origs", + ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True) ``` The `bashify=True` just produces the commands we might use. @@ -146,6 +150,7 @@ Find the duplicates. Normally, the file must have the same size, date and name. | strip_end_counter | bool | False | When comparing files in work_dir, strip the counter. Ex: "00034(3).MTS" is compared as "00034.MTS" | | strip_suffix | str | False | When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" | | work_file_stem_shortened | int | None | Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened. | +| invert_selection | bool | False | Match only those files from work_dir that does not match the criterions. | | **Media** | | media_magic | bool | False | Nor the size or date is compared for files with media suffixes.
A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.
An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.
(This mode is considerably slower.) | | accepted_frame_delta | int | 1 | Used only when media_magic is True | @@ -153,6 +158,7 @@ Find the duplicates. Normally, the file must have the same size, date and name. | img_compare_date | bool | False | If True and `media_magic=True`, the work file date or the work file EXIF date must match the original file date (has to be no more than an hour around). | | **Helper** | | log_level | int | 30 (warning) | 10 debug .. 50 critical | +| output | bool | False | Stores the output log to a file in the current working directory. (Never overwrites an older file.) | ## Utils In the `deduplidog.utils` packages, you'll find a several handsome tools to help you. You will find parameters by using you IDE hints. diff --git a/deduplidog/deduplidog.py b/deduplidog/deduplidog.py index 6430bf8..d0a0957 100644 --- a/deduplidog/deduplidog.py +++ b/deduplidog/deduplidog.py @@ -1,3 +1,4 @@ +from contextlib import redirect_stdout import logging import os import re @@ -19,7 +20,7 @@ from tqdm.autonotebook import tqdm from .helpers import Field, FileMetadata, keydefaultdict -from .utils import _qp, crc, get_frame_count +from .utils import _qp, crc, get_frame_count, open_log_file VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv", ".hevc" IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif", ".avif", ".webp", ".heic", ".avif" @@ -122,6 +123,8 @@ class Deduplidog: """When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" """, False)] = False work_file_stem_shortened: Annotated[int, opt( "Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened.", None)] = None + invert_selection: Annotated[bool, flag( + "Match only those files from work_dir that does not match the criterions.")] = False # Media section media_magic: Annotated[bool, flag( @@ -139,8 +142,10 @@ class Deduplidog: # Helper section log_level: Annotated[int, opt("10 debug .. 50 critical", logging.WARNING, 1)] = logging.WARNING + output: Annotated[bool, flag( + "Stores the output log to a file in the current working directory. (Never overwrites an older file.)")] = False - # TODO output of log and of bashize should be outputtable to a file + # TODO bashize should be outputtable through output # Following parameters are undocumented: @@ -193,6 +198,10 @@ def __post_init__(self): " TODO deprecated" self.original_dir_name = self.work_dir_name = None "Shortened name, human readable" + self.same_superdir = False + """ Work_dir and original dir is the same """ + self._output = None + " Log buffer " self.check() self.perform() @@ -221,11 +230,17 @@ def perform(self): self._common_prefix_length = len(os.path.commonprefix([self.original_dir, self.work_dir])) \ if self.shorter_log else 0 + if self.output: + name = ",".join([self.original_dir_name, self.work_dir_name] + + [p for p, v in vars(self).items() if v is True])[:150] + self._output = open_log_file(name) try: self._loop_files() except: raise finally: + if self._output: + self._output.close() if self.bar: print(f"{'Affected' if self.execute else 'Affectable'}:" f" {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="") @@ -257,18 +272,23 @@ def check(self): if not self.work_dir: raise AssertionError("Missing work_dir") else: + self.same_superdir = False for a, b in zip(Path(self.work_dir).parts, Path(self.original_dir).parts): if a != b: self.work_dir_name = a self.original_dir_name = b break else: - self.work_dir_name = a - self.original_dir_name = "(same superdir)" + self.same_superdir = True + self.original_dir_name = self.work_dir_name = a if self.skip_bigger and not self.media_magic: raise AssertionError("The skip_bigger works only with media_magic") + if self.invert_selection and any((self.replace_with_original, self.treat_bigger_as_original, self.set_both_to_older_date)): + raise AssertionError( + "It does not make sense using invert_selection with this command. The work file has no file to compare to.") + match self.tolerate_hour: case True: self.tolerate_hour = -1, 1 @@ -295,7 +315,8 @@ def check(self): self.checksum and ("crc32", "") or ("", "crc32"))) print(f"Find files by {used}{f', ignoring: {ignored}' if ignored else ''}") - which = f"either the file from the work dir at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}' (whichever is bigger)" \ + dirs_ = "" if self.same_superdir else f" at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}'" + which = f"either the file from the work dir{dirs_} (whichever is bigger)" \ if self.treat_bigger_as_original \ else f"duplicates from the work dir at '{self.work_dir_name}'" small = " (only if smaller than the pair file)" if self.skip_bigger else "" @@ -321,7 +342,8 @@ def check(self): def _loop_files(self): work_dir, skip = self.work_dir, self.skip - work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files")] + work_files = [f for f in tqdm((p for p in Path(work_dir).rglob( + "*") if not p.is_dir()), desc="Caching working files")] if skip: if isinstance(work_files, list): work_files = work_files[skip:] @@ -398,8 +420,10 @@ def _process_file(self, work_file: Path, bar: tqdm): # original of the work_file has been found # one of them might be treated as a duplicate and thus affected - if original: + if original and not self.invert_selection: self._affect(work_file, original) + elif not original and self.invert_selection: + self._affect(work_file, Path("/dev/null")) elif len(candidates) > 1: # we did not find the object amongst multiple candidates self.having_multiple_candidates[work_file] = candidates logger.debug("Candidates %s %s", work_file, candidates) @@ -467,6 +491,9 @@ def _affect(self, work_file: Path, original: Path): if (warning and self.log_level <= logging.WARNING) or (self.log_level <= logging.INFO): self.bar.clear() # this looks the same from jupyter and much better from terminal (does not leave a trace of abandoned bars) self._print_change(change) + if self._output: + with redirect_stdout(self._output): + self._print_change(change) def _rename(self, change: Change, affected_file: Path): msg = "renamable" @@ -616,10 +643,24 @@ def print_changes(self): [self._print_change(change) for change in self.changes] def _print_change(self, change: Change): + """ We aim for the clearest representation to help the user orientate at a glance. + Because file paths can be long, we'll display them as succinctly as possible. + Sometimes we'll use, for example, the disk name, other times we'll use file names, + or the first or last differing part of the path. """ wicon, oicon = "🔨", "📄" wf, of = change + + # Nice paths + wn, on = self.work_dir_name, self.original_dir_name # meaningful dir representation + if self.same_superdir: + if wf.name == of.name: # full path that makes the difference + len_ = len(os.path.commonprefix((wf, of))) + wn, on = str(wf.parent)[len_:] or "(basedir)", str(of.parent)[len_:] or "(basedir)" + else: # the file name will make the meaningful difference + wn, on = wf.name, of.name + print("*", wf) print(" ", of) [print(text, *(str(s) for s in changes)) - for text, changes in zip((f" {wicon}{self.work_dir_name}:", - f" {oicon}{self.original_dir_name}:"), change.values()) if len(changes)] + for text, changes in zip((f" {wicon}{wn}:", + f" {oicon}{on}:"), change.values()) if len(changes)] diff --git a/deduplidog/utils.py b/deduplidog/utils.py index afc0399..4d48ba0 100644 --- a/deduplidog/utils.py +++ b/deduplidog/utils.py @@ -39,6 +39,18 @@ def _qp(path: Path): s = str(path) return f'"{s}"' if " " in s else s +def open_log_file(name): # undocumented functions + log_file_path = Path(f"{name}.log") + try: + return log_file_path.open("x") + except FileExistsError: + counter = 1 + while True: + new_file_path = Path(f"{name} ({counter}).log") + try: + return new_file_path.open("x") + except FileExistsError: + counter += 1 def images(urls: Iterable[str | Path]): """ Display a ribbon of images. """ diff --git a/tests.py b/tests.py index 7e23ee3..0b34a95 100644 --- a/tests.py +++ b/tests.py @@ -74,15 +74,15 @@ def check(self, prefixed: tuple[int] = None, suck: tuple[int] = None): class TestDeduplidog(TestCase): def prepare(self, testing_dir: str = None): - self.temp = TemporaryDirectory() - temp = Path(testing_dir) if testing_dir else self.temp.name + self.temp = mkdtemp() # TemporaryDirectory() TODO + # temp = Path(testing_dir) if testing_dir else self.temp.name TODO + temp = str(self.temp) originals = Path(temp, "originals") work_dir = Path(temp, "work_dir") if not testing_dir: originals.mkdir() work_dir.mkdir() - # c = FileRepresentationController(temp) original_files = {name: FileRepresentation(originals / name).write() for name in (f"file_{i}" for i in range(12))} work_files = {name: FileRepresentation(work_dir / name, *rest).write() for name, *rest in ( @@ -107,27 +107,47 @@ def test_simple_prefix(self): def test_date(self): state = self.prepare() - Deduplidog(*state, rename=True, execute=True, ignore_date=True) + Deduplidog(*state, rename=True, execute=True, ignore_date=True, neglect_warning=True) state.check(prefixed=(4, 5, 6, 7, 8, 9, 10, 11)) + state = self.prepare() + Deduplidog(*state, rename=True, execute=True, ignore_date=True) + state.check(prefixed=(4, 5, 6, 7, 11)) state = self.prepare() - Deduplidog(*state, rename=True, execute=True, tolerate_hour=1) + Deduplidog(*state, rename=True, execute=True, tolerate_hour=1, neglect_warning=True) state.check(prefixed=(4, 7, 8, 9, 11)) + state = self.prepare() + Deduplidog(*state, rename=True, execute=True, tolerate_hour=1) + state.check(prefixed=(4, 7, 11)) state = self.prepare() - Deduplidog(*state, rename=True, execute=True, tolerate_hour=2) + Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True) state.check(prefixed=(4, 5, 6, 7, 8, 9, 11)) + state = self.prepare() + Deduplidog(*state, rename=True, execute=True, tolerate_hour=2) + state.check(prefixed=(4, 5, 6, 7, 11)) def test_replace_with_original(self): state = self.prepare() - Deduplidog(*state, replace_with_original=True, execute=True) + Deduplidog(*state, replace_with_original=True, execute=True, neglect_warning=True) state.work_files["file_11"].suck(state.originals["file_11"]) state.check() state = self.prepare() - Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2) + Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2, neglect_warning=True) state.check(suck=(4, 5, 6, 7, 8, 9, 11)) + def test_invert_selection(self): + state = self.prepare() + self.assertRaises(AssertionError, Deduplidog, + *state, replace_with_original=True, execute=True, tolerate_hour=2, invert_selection=True) + Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True, invert_selection=False) + state.check(prefixed=(4, 5, 6, 7, 8, 9, 11)) + + state = self.prepare() + Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True, invert_selection=True) + state.check(prefixed=(1, 2, 10)) + # No media file in the test case. # def test_skip_bigger(self): # state = self.prepare()