Skip to content

Commit

Permalink
invert, output
Browse files Browse the repository at this point in the history
  • Loading branch information
e3rd committed Mar 14, 2024
1 parent 27edb29 commit 1d9aea7
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 8 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,15 @@ Find the duplicates. Normally, the file must have the same size, date and name.
| strip_end_counter | bool | False | When comparing files in work_dir, strip the counter. Ex: "00034(3).MTS" is compared as "00034.MTS" |
| strip_suffix | str | False | When comparing files in work_dir, strip the end of the file name matched by a regular expression. Ex: "001-edited.jpg" is compared as "001.jpg" |
| work_file_stem_shortened | int | None | Photos downloaded from Google have their stems shortened to 47 chars. For comparison purposes, treat the file names in the original folder as shortened too. |
| invert_selection | bool | False | Match only those files from work_dir that do not match the criteria. |
| **Media** |
| media_magic | bool | False | Neither the size nor the date is compared for files with media suffixes.<br>A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.<br>An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.<br>(This mode is considerably slower.) |
| accepted_frame_delta | int | 1 | Used only when media_magic is True |
| accepted_img_hash_diff | int | 1 | Used only when media_magic is True |
| img_compare_date | bool | False | If True and `media_magic=True`, the work file date or the work file EXIF date must match the original file date (has to be no more than an hour around). |
| **Helper** |
| log_level | int | 30 (warning) | 10 debug .. 50 critical |
| output | bool | False | Stores the output log to a file in the current working directory. (Never overwrites an older file.) |

## Utils
In the `deduplidog.utils` package, you'll find several handy tools to help you. You can discover their parameters through your IDE hints.
Expand Down
55 changes: 47 additions & 8 deletions deduplidog/deduplidog.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from contextlib import redirect_stdout
import logging
import os
import re
Expand All @@ -19,7 +20,7 @@
from tqdm.autonotebook import tqdm

from .helpers import Field, FileMetadata, keydefaultdict
from .utils import _qp, crc, get_frame_count
from .utils import _qp, crc, get_frame_count, open_log_file

VIDEO_SUFFIXES = ".mp4", ".mov", ".avi", ".vob", ".mts", ".3gp", ".mpg", ".mpeg", ".wmv", ".hevc"
IMAGE_SUFFIXES = ".jpg", ".jpeg", ".png", ".gif", ".avif", ".webp", ".heic", ".avif"
Expand Down Expand Up @@ -122,6 +123,8 @@ class Deduplidog:
"""When comparing files in work_dir, strip the file name end matched by a regular. Ex: "001-edited.jpg" is compared as "001.jpg" """, False)] = False
work_file_stem_shortened: Annotated[int, opt(
"Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened.", None)] = None
invert_selection: Annotated[bool, flag(
"Match only those files from work_dir that does not match the criterions.")] = False

# Media section
media_magic: Annotated[bool, flag(
Expand All @@ -139,6 +142,8 @@ class Deduplidog:

# Helper section
log_level: Annotated[int, opt("10 debug .. 50 critical", logging.WARNING, 1)] = logging.WARNING
output: Annotated[bool, flag(
"Stores the output log to a file in the current working directory. (Never overwrites an older file.)")] = False

# TODO output of log and of bashize should be outputtable to a file

Expand Down Expand Up @@ -193,6 +198,8 @@ def __post_init__(self):
" TODO deprecated"
self.original_dir_name = self.work_dir_name = None
"Shortened name, human readable"
self.same_superdir = False
""" Work_dir and original dir is the same """

self.check()
self.perform()
Expand Down Expand Up @@ -221,11 +228,17 @@ def perform(self):
self._common_prefix_length = len(os.path.commonprefix([self.original_dir, self.work_dir])) \
if self.shorter_log else 0

if self.output:
name = ",".join([self.original_dir_name, self.work_dir_name] +
[p for p, v in vars(self).items() if v is True])[:150]
self._output = open_log_file(name)
try:
self._loop_files()
except:
raise
finally:
if self._output:
self._output.close()
if self.bar:
print(f"{'Affected' if self.execute else 'Affectable'}:"
f" {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="")
Expand Down Expand Up @@ -257,18 +270,23 @@ def check(self):
if not self.work_dir:
raise AssertionError("Missing work_dir")
else:
self.same_superdir = False
for a, b in zip(Path(self.work_dir).parts, Path(self.original_dir).parts):
if a != b:
self.work_dir_name = a
self.original_dir_name = b
break
else:
self.work_dir_name = a
self.original_dir_name = "(same superdir)"
self.same_superdir = True
self.original_dir_name = self.work_dir_name = a

if self.skip_bigger and not self.media_magic:
raise AssertionError("The skip_bigger works only with media_magic")

if self.invert_selection and any((self.replace_with_original, self.treat_bigger_as_original, self.set_both_to_older_date)):
raise AssertionError(
"It does not make sense using invert_selection with this command. The work file has no file to compare to.")

match self.tolerate_hour:
case True:
self.tolerate_hour = -1, 1
Expand All @@ -295,7 +313,8 @@ def check(self):
self.checksum and ("crc32", "") or ("", "crc32")))
print(f"Find files by {used}{f', ignoring: {ignored}' if ignored else ''}")

which = f"either the file from the work dir at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}' (whichever is bigger)" \
dirs_ = "" if self.same_superdir else f" at '{self.work_dir_name}' or the original dir at '{self.original_dir_name}'"
which = f"either the file from the work dir{dirs_} (whichever is bigger)" \
if self.treat_bigger_as_original \
else f"duplicates from the work dir at '{self.work_dir_name}'"
small = " (only if smaller than the pair file)" if self.skip_bigger else ""
Expand All @@ -321,7 +340,8 @@ def check(self):

def _loop_files(self):
work_dir, skip = self.work_dir, self.skip
work_files = [f for f in tqdm(Path(work_dir).rglob("*"), desc="Caching working files")]
work_files = [f for f in tqdm((p for p in Path(work_dir).rglob(
"*") if not p.is_dir()), desc="Caching working files")]
if skip:
if isinstance(work_files, list):
work_files = work_files[skip:]
Expand Down Expand Up @@ -398,8 +418,10 @@ def _process_file(self, work_file: Path, bar: tqdm):

# original of the work_file has been found
# one of them might be treated as a duplicate and thus affected
if original:
if original and not self.invert_selection:
self._affect(work_file, original)
elif not original and self.invert_selection:
self._affect(work_file, Path("/dev/null"))
elif len(candidates) > 1: # we did not find the object amongst multiple candidates
self.having_multiple_candidates[work_file] = candidates
logger.debug("Candidates %s %s", work_file, candidates)
Expand Down Expand Up @@ -467,6 +489,9 @@ def _affect(self, work_file: Path, original: Path):
if (warning and self.log_level <= logging.WARNING) or (self.log_level <= logging.INFO):
self.bar.clear() # this looks the same from jupyter and much better from terminal (does not leave a trace of abandoned bars)
self._print_change(change)
if self._output:
with redirect_stdout(self._output):
self._print_change(change)

def _rename(self, change: Change, affected_file: Path):
msg = "renamable"
Expand Down Expand Up @@ -616,10 +641,24 @@ def print_changes(self):
[self._print_change(change) for change in self.changes]

def _print_change(self, change: Change):
""" We aim for the clearest representation to help the user orientate at a glance.
Because file paths can be long, we'll display them as succinctly as possible.
Sometimes we'll use, for example, the disk name, other times we'll use file names,
or the first or last differing part of the path. """
wicon, oicon = "🔨", "📄"
wf, of = change

# Nice paths
wn, on = self.work_dir_name, self.original_dir_name # meaningful dir representation
if self.same_superdir:
if wf.name == of.name: # full path that makes the difference
len_ = len(os.path.commonprefix((wf, of)))
wn, on = str(wf.parent)[len_:] or "(basedir)", str(of.parent)[len_:] or "(basedir)"
else: # the file name will make the meaningful difference
wn, on = wf.name, of.name

print("*", wf)
print(" ", of)
[print(text, *(str(s) for s in changes))
for text, changes in zip((f" {wicon}{self.work_dir_name}:",
f" {oicon}{self.original_dir_name}:"), change.values()) if len(changes)]
for text, changes in zip((f" {wicon}{wn}:",
f" {oicon}{on}:"), change.values()) if len(changes)]
12 changes: 12 additions & 0 deletions deduplidog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ def _qp(path: Path):
s = str(path)
return f'"{s}"' if " " in s else s

def open_log_file(name):  # undocumented functions
    """Create and open a brand-new log file, never overwriting an existing one.

    The first candidate is ``"{name}.log"``. If it already exists,
    ``"{name} (1).log"``, ``"{name} (2).log"``, ... are tried in turn until
    a name is free.

    :param name: Path (without the ``.log`` suffix) of the desired log file.
    :return: A text file object opened for exclusive creation and writing.
    """
    counter = 0
    while True:
        # The very first attempt carries no counter; later ones get " (N)".
        suffix = f" ({counter})" if counter else ""
        try:
            # Mode "x" fails if the file exists, so checking and creating
            # happen in a single step. The encoding is pinned to UTF-8 because
            # the logged output contains non-ASCII characters (emoji icons),
            # which a locale-dependent default encoding may fail to encode.
            return Path(f"{name}{suffix}.log").open("x", encoding="utf-8")
        except FileExistsError:
            counter += 1

def images(urls: Iterable[str | Path]):
""" Display a ribbon of images. """
Expand Down

0 comments on commit 1d9aea7

Please sign in to comment.