diff --git a/README.md b/README.md
index 17a2da1..e04662a 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,8 @@ from deduplidog import Deduplidog
Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True)
```
+This command produced the following output:
+
```
Find files by size, ignoring: date, crc32
Duplicates from the work dir at 'home' would be (if execute were True) renamed (prefixed with ✓).
@@ -59,16 +61,17 @@ Number of originals: 38
* /home/user/duplicates/foo.txt
/media/disk/origs/foo.txt
🔨home: renamable
- 📄media: DATE WARNING + a day
-Affectable: 38/38
-Affected size: 59.9 kB
+ 📄media: DATE WARNING + a day 🛟skipped on warning
+Affectable: 37/38
+Affected size: 56.9 kB
Warnings: 1
```
-We found out all the files in the *duplicates* folder seem to be useless but one. It's date is earlier than the original one. See with full log.
+We found out that all the files in the *duplicates* folder seem to be useless but one. Its date is earlier than the original one. The life buoy icon (🛟) means the file is skipped because of the warning, so no action would be taken on it. To suppress this, let's turn on `set_both_to_older_date`. See the full log.
```python3
-Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO)
+Deduplidog("/home/user/duplicates", "/media/disk/origs",
+ ignore_date=True, rename=True, set_both_to_older_date=True, log_level=logging.INFO)
```
```
@@ -94,7 +97,8 @@ Affected size: 59.9 kB
You see, the log is at the most brief, yet transparent form. The files to be affected at the work folder are prepended with the 🔨 icon whereas those affected at the original folder uses 📄 icon. We might add `execute=True` parameter to perform the actions. Or use `bashify=True` to inspect.
```python3
-Deduplidog("/home/user/duplicates", "/media/disk/origs", ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True)
+Deduplidog("/home/user/duplicates", "/media/disk/origs",
+ ignore_date=True, rename=True, set_both_to_older_date=True, bashify=True)
```
The `bashify=True` just produces the commands we might use.
@@ -146,6 +150,7 @@ Find the duplicates. Normally, the file must have the same size, date and name.
| strip_end_counter | bool | False | When comparing files in work_dir, strip the counter. Ex: "00034(3).MTS" is compared as "00034.MTS" |
| strip_suffix | str | False | When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" |
| work_file_stem_shortened | int | None | Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened. |
+| invert_selection | bool | False | Match only those files from work_dir that do not match the criteria. |
| **Media** |
| media_magic | bool | False | Nor the size or date is compared for files with media suffixes.
A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.
An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.
(This mode is considerably slower.) |
| accepted_frame_delta | int | 1 | Used only when media_magic is True |
@@ -153,6 +158,7 @@ Find the duplicates. Normally, the file must have the same size, date and name.
| img_compare_date | bool | False | If True and `media_magic=True`, the work file date or the work file EXIF date must match the original file date (has to be no more than an hour around). |
| **Helper** |
| log_level | int | 30 (warning) | 10 debug .. 50 critical |
+| output | bool | False | Stores the output log to a file in the current working directory. (Never overwrites an older file.) |
## Utils
In the `deduplidog.utils` packages, you'll find a several handsome tools to help you. You will find parameters by using you IDE hints.
diff --git a/deduplidog/deduplidog.py b/deduplidog/deduplidog.py
index 6430bf8..d0a0957 100644
--- a/deduplidog/deduplidog.py
+++ b/deduplidog/deduplidog.py
@@ -1,3 +1,4 @@
+from contextlib import redirect_stdout
import logging
import os
import re
@@ -19,7 +20,7 @@
from tqdm.autonotebook import tqdm
from .helpers import Field, FileMetadata, keydefaultdict
-from .utils import _qp, crc, get_frame_count
+from .utils import _qp, crc, get_frame_count, open_log_file
VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv", ".hevc"
IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif", ".avif", ".webp", ".heic", ".avif"
@@ -122,6 +123,8 @@ class Deduplidog:
"""When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" """, False)] = False
work_file_stem_shortened: Annotated[int, opt(
"Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened.", None)] = None
+ invert_selection: Annotated[bool, flag(
+ "Match only those files from work_dir that does not match the criterions.")] = False
# Media section
media_magic: Annotated[bool, flag(
@@ -139,8 +142,10 @@ class Deduplidog:
# Helper section
log_level: Annotated[int, opt("10 debug .. 50 critical", logging.WARNING, 1)] = logging.WARNING
+ output: Annotated[bool, flag(
+ "Stores the output log to a file in the current working directory. (Never overwrites an older file.)")] = False
- # TODO output of log and of bashize should be outputtable to a file
+ # TODO bashize should be outputtable through output
# Following parameters are undocumented:
@@ -193,6 +198,10 @@ def __post_init__(self):
" TODO deprecated"
self.original_dir_name = self.work_dir_name = None
"Shortened name, human readable"
+ self.same_superdir = False
+ """ Work_dir and original dir is the same """
+ self._output = None
+ " Log buffer "
self.check()
self.perform()
@@ -221,11 +230,17 @@ def perform(self):
self._common_prefix_length = len(os.path.commonprefix([self.original_dir, self.work_dir])) \
if self.shorter_log else 0
+ if self.output:
+ name = ",".join([self.original_dir_name, self.work_dir_name] +
+ [p for p, v in vars(self).items() if v is True])[:150]
+ self._output = open_log_file(name)
try:
self._loop_files()
except:
raise
finally:
+ if self._output:
+ self._output.close()
if self.bar:
print(f"{'Affected' if self.execute else 'Affectable'}:"
f" {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="")
@@ -257,18 +272,23 @@ def check(self):
if not self.work_dir:
raise AssertionError("Missing work_dir")
else:
+ self.same_superdir = False
for a, b in zip(Path(self.work_dir).parts, Path(self.original_dir).parts):
if a != b:
self.work_dir_name = a
self.original_dir_name = b
break
else:
- self.work_dir_name = a
- self.original_dir_name = "(same superdir)"
+ self.same_superdir = True
+ self.original_dir_name = self.work_dir_name = a
if self.skip_bigger and not self.media_magic:
raise AssertionError("The skip_bigger works only with media_magic")
+ if self.invert_selection and any((self.replace_with_original, self.treat_bigger_as_original, self.set_both_to_older_date)):
+ raise AssertionError(
+ "It does not make sense using invert_selection with this command. The work file has no file to compare to.")
+
match self.tolerate_hour:
case True:
self.tolerate_hour = -1, 1
@@ -295,7 +315,8 @@ def check(self):
self.checksum and ("crc32", "") or ("", "crc32")))
print(f"Find files by {used}{f', ignoring: {ignored}' if ignored else ''}")
- which = f"either the file from the work dir at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}' (whichever is bigger)" \
+ dirs_ = "" if self.same_superdir else f" at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}'"
+ which = f"either the file from the work dir{dirs_} (whichever is bigger)" \
if self.treat_bigger_as_original \
else f"duplicates from the work dir at '{self.work_dir_name}'"
small = " (only if smaller than the pair file)" if self.skip_bigger else ""
@@ -321,7 +342,8 @@ def check(self):
def _loop_files(self):
work_dir, skip = self.work_dir, self.skip
- work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files")]
+ work_files = [f for f in tqdm((p for p in Path(work_dir).rglob(
+ "*") if not p.is_dir()), desc="Caching working files")]
if skip:
if isinstance(work_files, list):
work_files = work_files[skip:]
@@ -398,8 +420,10 @@ def _process_file(self, work_file: Path, bar: tqdm):
# original of the work_file has been found
# one of them might be treated as a duplicate and thus affected
- if original:
+ if original and not self.invert_selection:
self._affect(work_file, original)
+ elif not original and self.invert_selection:
+ self._affect(work_file, Path("/dev/null"))
elif len(candidates) > 1: # we did not find the object amongst multiple candidates
self.having_multiple_candidates[work_file] = candidates
logger.debug("Candidates %s %s", work_file, candidates)
@@ -467,6 +491,9 @@ def _affect(self, work_file: Path, original: Path):
if (warning and self.log_level <= logging.WARNING) or (self.log_level <= logging.INFO):
self.bar.clear() # this looks the same from jupyter and much better from terminal (does not leave a trace of abandoned bars)
self._print_change(change)
+ if self._output:
+ with redirect_stdout(self._output):
+ self._print_change(change)
def _rename(self, change: Change, affected_file: Path):
msg = "renamable"
@@ -616,10 +643,24 @@ def print_changes(self):
[self._print_change(change) for change in self.changes]
def _print_change(self, change: Change):
+ """ We aim for the clearest representation to help the user orientate at a glance.
+ Because file paths can be long, we'll display them as succinctly as possible.
+ Sometimes we'll use, for example, the disk name, other times we'll use file names,
+ or the first or last differing part of the path. """
wicon, oicon = "🔨", "📄"
wf, of = change
+
+ # Nice paths
+ wn, on = self.work_dir_name, self.original_dir_name # meaningful dir representation
+ if self.same_superdir:
+ if wf.name == of.name: # full path that makes the difference
+ len_ = len(os.path.commonprefix((wf, of)))
+ wn, on = str(wf.parent)[len_:] or "(basedir)", str(of.parent)[len_:] or "(basedir)"
+ else: # the file name will make the meaningful difference
+ wn, on = wf.name, of.name
+
print("*", wf)
print(" ", of)
[print(text, *(str(s) for s in changes))
- for text, changes in zip((f" {wicon}{self.work_dir_name}:",
- f" {oicon}{self.original_dir_name}:"), change.values()) if len(changes)]
+ for text, changes in zip((f" {wicon}{wn}:",
+ f" {oicon}{on}:"), change.values()) if len(changes)]
diff --git a/deduplidog/utils.py b/deduplidog/utils.py
index afc0399..4d48ba0 100644
--- a/deduplidog/utils.py
+++ b/deduplidog/utils.py
@@ -39,6 +39,18 @@ def _qp(path: Path):
s = str(path)
return f'"{s}"' if " " in s else s
+def open_log_file(name):  # undocumented functions
+    """ Create a brand new log file and return it opened for writing.
+
+    The file is opened in the exclusive-creation mode ("x"), so an already existing
+    file is never overwritten. When "{name}.log" is taken, "{name} (1).log",
+    "{name} (2).log", ... are tried in turn until a free file name is found.
+
+    The UTF-8 encoding is forced: the logged lines contain non-ASCII icons
+    and the platform default encoding might not be able to represent them.
+
+    :param name: File name (without the ".log" suffix) the path is built from.
+    :return: Writable text file object. The caller is responsible for closing it.
+    """
+    candidate = Path(f"{name}.log")
+    counter = 0
+    while True:
+        try:
+            return candidate.open("x", encoding="utf-8")
+        except FileExistsError:
+            # the name is taken, try the next numbered variant
+            counter += 1
+            candidate = Path(f"{name} ({counter}).log")
def images(urls: Iterable[str | Path]):
""" Display a ribbon of images. """
diff --git a/tests.py b/tests.py
index 7e23ee3..0b34a95 100644
--- a/tests.py
+++ b/tests.py
@@ -74,15 +74,15 @@ def check(self, prefixed: tuple[int] = None, suck: tuple[int] = None):
class TestDeduplidog(TestCase):
def prepare(self, testing_dir: str = None):
- self.temp = TemporaryDirectory()
- temp = Path(testing_dir) if testing_dir else self.temp.name
+ self.temp = mkdtemp() # TemporaryDirectory() TODO
+ # temp = Path(testing_dir) if testing_dir else self.temp.name TODO
+ temp = str(self.temp)
originals = Path(temp, "originals")
work_dir = Path(temp, "work_dir")
if not testing_dir:
originals.mkdir()
work_dir.mkdir()
- # c = FileRepresentationController(temp)
original_files = {name: FileRepresentation(originals / name).write()
for name in (f"file_{i}" for i in range(12))}
work_files = {name: FileRepresentation(work_dir / name, *rest).write() for name, *rest in (
@@ -107,27 +107,47 @@ def test_simple_prefix(self):
def test_date(self):
state = self.prepare()
- Deduplidog(*state, rename=True, execute=True, ignore_date=True)
+ Deduplidog(*state, rename=True, execute=True, ignore_date=True, neglect_warning=True)
state.check(prefixed=(4, 5, 6, 7, 8, 9, 10, 11))
+ state = self.prepare()
+ Deduplidog(*state, rename=True, execute=True, ignore_date=True)
+ state.check(prefixed=(4, 5, 6, 7, 11))
state = self.prepare()
- Deduplidog(*state, rename=True, execute=True, tolerate_hour=1)
+ Deduplidog(*state, rename=True, execute=True, tolerate_hour=1, neglect_warning=True)
state.check(prefixed=(4, 7, 8, 9, 11))
+ state = self.prepare()
+ Deduplidog(*state, rename=True, execute=True, tolerate_hour=1)
+ state.check(prefixed=(4, 7, 11))
state = self.prepare()
- Deduplidog(*state, rename=True, execute=True, tolerate_hour=2)
+ Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True)
state.check(prefixed=(4, 5, 6, 7, 8, 9, 11))
+ state = self.prepare()
+ Deduplidog(*state, rename=True, execute=True, tolerate_hour=2)
+ state.check(prefixed=(4, 5, 6, 7, 11))
def test_replace_with_original(self):
state = self.prepare()
- Deduplidog(*state, replace_with_original=True, execute=True)
+ Deduplidog(*state, replace_with_original=True, execute=True, neglect_warning=True)
state.work_files["file_11"].suck(state.originals["file_11"])
state.check()
state = self.prepare()
- Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2)
+ Deduplidog(*state, replace_with_original=True, execute=True, tolerate_hour=2, neglect_warning=True)
state.check(suck=(4, 5, 6, 7, 8, 9, 11))
+ def test_invert_selection(self):
+ """ Check the `invert_selection` flag: it is refused together with `replace_with_original`
+ and it changes which work files get prefixed when renaming. """
+ state = self.prepare()
+ # invert_selection combined with replace_with_original must be refused with an AssertionError
+ self.assertRaises(AssertionError, Deduplidog,
+ *state, replace_with_original=True, execute=True, tolerate_hour=2, invert_selection=True)
+ # baseline on the same fixture: with the inversion explicitly off, the usual files are prefixed
+ Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True, invert_selection=False)
+ state.check(prefixed=(4, 5, 6, 7, 8, 9, 11))
+
+ # fresh fixture; with the inversion on, a different set of work files is expected to be prefixed
+ # (presumably those having no matching original – verify against the files built in `prepare`)
+ state = self.prepare()
+ Deduplidog(*state, rename=True, execute=True, tolerate_hour=2, neglect_warning=True, invert_selection=True)
+ state.check(prefixed=(1, 2, 10))
+
# No media file in the test case.
# def test_skip_bigger(self):
# state = self.prepare()