diff --git a/README.md b/README.md
index f3371c4..5717ffd 100644
--- a/README.md
+++ b/README.md
@@ -101,5 +101,39 @@ mv -n /home/user/duplicates/third.txt /home/user/duplicates/✓third.txt
# Documentation – `Deduplidog` class
+Import the `Deduplidog` class and change its parameters.
+
+```python3
+from deduplidog import Deduplidog
+```
+
Find the duplicates. Normally, the file must have the same size, date and name. (Name might be just similar if parameters like strip_end_counter are set.) If media_magic=True, media files receive different rules: Neither the size nor the date are compared. See its help.
+| parameter | type | default | description |
+|-----------|------|---------|-------------|
+| work_dir | str \| Path | - | Folder of the files suspected to be duplicates. |
+| original_dir | str \| Path | - | Folder of the original files. Normally, these files will not be affected.
(However, they might get affected by treat_bigger_as_original or set_both_to_older_date). |
+| **Actions** |
+| execute | bool | False | If False, nothing happens, just a safe run is performed. |
+| bashify | bool | False | Print bash commands that correspond to the actions that would have been executed if execute were True.
You can check and run them yourself. |
+| affect_only_if_smaller | bool | False | If media_magic=True, all writing actions like rename, replace_with_original, set_both_to_older_date and treat_bigger_as_original
are executed only if the affectable file is smaller than the other. |
+| rename | bool | False | If execute=True, prepend ✓ to the duplicated work file name (or possibly to the original file name if treat_bigger_as_original).
Mutually exclusive with replace_with_original and delete. |
+| delete | bool | False | If execute=True, delete the duplicated work file (or possibly the original file if treat_bigger_as_original).
Mutually exclusive with replace_with_original and rename. |
+| replace_with_original | bool | False | If execute=True, replace duplicated work file with the original (or possibly vice versa if treat_bigger_as_original).
Mutually exclusive with rename and delete. |
+| set_both_to_older_date | bool | False | If execute=True, media_magic=True or (media_magic=False and ignore_date=True), both files are set to the older date. Ex: work file gets the original file's date or vice versa. |
+| treat_bigger_as_original | bool | False | If execute=True and rename=True and media_magic=True, the original file might be affected (by renaming) if smaller than the work file. |
+| **Matching** |
+| casefold | bool | False | Case insensitive file name comparing. |
+| checksum | bool | False | If media_magic=False and ignore_size=False, files will be compared by CRC32 checksum.
(This mode is considerably slower.) |
+| tolerate_hour | int \| tuple[int, int] \| bool | False | When comparing files in work_dir and media_magic=False, tolerate hour difference.
Sometimes, when dealing with FS changes, files might get shifted by a few hours.
* bool → -1 .. +1
* int → -int .. +int
* tuple → int1 .. int2
Ex: tolerate_hour=2 → work_file.st_mtime -7200 ... + 7200 is compared to the original_file.st_mtime |
+| ignore_date | bool | False | If media_magic=False, files will not be compared by date. |
+| ignore_size | bool | False | If media_magic=False, files will not be compared by size. |
+| space2char | bool \| str | False | When comparing files in work_dir, consider space as another char. Ex: "file 012.jpg" is compared as "file_012.jpg" |
+| strip_end_counter | bool | False | When comparing files in work_dir, strip the counter. Ex: "00034(3).MTS" is compared as "00034.MTS" |
+| strip_suffix | str | False | When comparing files in work_dir, strip the file name end matched by a regular expression. Ex: "001-edited.jpg" is compared as "001.jpg" |
+| work_file_stem_shortened | int | None | Photos downloaded from Google have their stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened. |
+| **Media** |
+| media_magic | bool | False | Neither the size nor the date is compared for files with media suffixes.
A video is considered a duplicate if it has the same name and a similar number of frames, even if it has a different extension.
An image is considered a duplicate if it has the same name and a similar image hash, even if the files are of different sizes.
(This mode is considerably slower.) |
+| accepted_frame_delta | int | 1 | Used only when media_magic is True |
+| accepted_img_hash_diff | int | 1 | Used only when media_magic is True |
+| img_compare_date | bool | False | If True and media_magic=True, the file date or the EXIF date must match. |
\ No newline at end of file
diff --git a/deduplidog/deduplidog.py b/deduplidog/deduplidog.py
index fe77217..34bf4d2 100644
--- a/deduplidog/deduplidog.py
+++ b/deduplidog/deduplidog.py
@@ -46,6 +46,7 @@ class Deduplidog:
"Folder of the original files. Normally, these files will not be affected." \
" (However, they might get affected by treat_bigger_as_original or set_both_to_older_date)."
+ # Action section
execute: bool = False
"If False, nothing happens, just a safe run is performed."
bashify: bool = False
@@ -56,16 +57,19 @@ class Deduplidog:
are executed only if the affectable file is smaller than the other."""
rename: bool = False
"""If execute=True, prepend ✓ to the duplicated work file name (or possibly to the original file name if treat_bigger_as_original).
- Mutually exclusive with replace_with_original."""
+ Mutually exclusive with replace_with_original and delete."""
+ delete: bool = False
+ """If execute=True, delete the duplicated work file (or possibly the original file if treat_bigger_as_original).
+ Mutually exclusive with replace_with_original and rename."""
replace_with_original: bool = False
"""If execute=True, replace duplicated work file with the original (or possibly vice versa if treat_bigger_as_original).
- Mutually exclusive with rename.
- """
+ Mutually exclusive with rename and delete."""
set_both_to_older_date: bool = False
"If execute=True, media_magic=True or (media_magic=False and ignore_date=True), both files are set to the older date. Ex: work file get's the original file's date or vice versa."
treat_bigger_as_original: bool = False
"If execute=True and rename=True and media_magic=True, the original file might be affected (by renaming) if smaller than the work file."
+ # Match section
casefold: bool = False
"Case insensitive file name comparing."
checksum: bool = False
@@ -91,6 +95,7 @@ class Deduplidog:
work_file_stem_shortened: int = None
"Photos downloaded from Google have its stem shortened to 47 chars. For the comparing purpose, treat original folder file names shortened."
+ # Media section
media_magic: bool = False
"""
Nor the size or date is compared for files with media suffixes.
@@ -102,9 +107,11 @@ class Deduplidog:
"Used only when media_magic is True"
accepted_img_hash_diff: int = 1
"Used only when media_magic is True"
- img_compare_date = False
+ img_compare_date: bool = False
"If True and media_magic=True, the file date or the EXIF date must match."
+ # Following parameters are undocumented:
+
file_list: list[Path] = None
"Use original file list. If none, a new is generated or a cached version is used."
suffixes: bool | tuple[str] = False
@@ -198,7 +205,8 @@ def perform(self):
raise
finally:
if self.bar:
- print(f"{'Affected' if self.execute else 'Affectable'}: {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="")
+ print(
+ f"{'Affected' if self.execute else 'Affectable'}: {self.affected_count}/{len(self.file_list)- self.ignored_count}", end="")
if self.ignored_count:
print(f" ({self.ignored_count} ignored)", end="")
print("\nAffected size:", naturalsize(self.size_affected))
@@ -248,12 +256,16 @@ def check(self):
action = "will be" if self.execute else f"would be (if execute were True)"
print(f"{which.capitalize()}{small} {action} ", end="")
- match self.rename, self.replace_with_original:
- case True, False:
+ match self.rename, self.replace_with_original, self.delete:
+ case False, False, False:
+ pass
+ case True, False, False:
print("renamed (prefixed with ✓).")
- case False, True:
+ case False, True, False:
print("replaced with the original.")
- case True, True:
+ case False, False, True:
+ print("deleted.")
+ case _:
raise AssertionError("Choose either rename or replace_with_original")
if self.set_both_to_older_date:
@@ -381,54 +393,70 @@ def _affect(self, work_file: Path, original: Path):
change[other_file].append(f"DATE WARNING + {naturaldelta(other_date-affected_date)}")
warning = True
- # renaming
+ # other actions
if self.rename:
- status_ = "renamable"
- if self.execute or self.bashify:
- # self.queue.put((affected_file, affected_file.with_name("✓" + affected_file.name)))
- target_path = affected_file.with_name("✓" + affected_file.name)
- if self.execute:
- if target_path.exists():
- err = f"Do not rename {affected_file} because {target_path} exists."
- if self.fail_on_error:
- raise FileExistsError(err)
- else:
- logger.warning(err)
- else:
- affected_file.rename(target_path)
- status_ = "renaming"
- if self.bashify:
- print(f"mv -n {_qp(affected_file)} {_qp(target_path)}") # TODO check
- self.passed_away.add(affected_file)
- change[affected_file].append(status_)
+ self._rename(change, affected_file)
+
+ if self.delete:
+ self._delete(change, affected_file)
+
if self.replace_with_original:
- status_ = "replacable"
- if other_file.name == affected_file.name:
- if self.execute:
- status_ = "replacing"
- shutil.copy2(other_file, affected_file)
- if self.bashify:
- print(f"cp --preserve {_qp(other_file)} {_qp(affected_file)}") # TODO check
- else:
- if self.execute:
- status_ = "replacing"
- shutil.copy2(other_file, affected_file.parent)
- affected_file.unlink()
- if self.bashify:
- # TODO check
- print(f"cp --preserve {_qp(other_file)} {_qp(affected_file.parent)} && rm {_qp(affected_file)}")
- change[affected_file].append(status_)
+ self._replace_with_original(change, affected_file, other_file)
self.changes.append(change)
if warning:
self.warning_count += 1
if (warning and self.logging_level <= logging.WARNING) or (self.logging_level <= logging.INFO):
self._print_change(change)
- # suffix = " (affected):" if affected_file is original else ":"
- # getattr(logger, "warning" if warning else "info")("Original%s %s %s",
- # suffix, self._path(original), " ".join(str(s) for s in change[original]))
- # getattr(logger, "warning" if warning else "info")(
- # "Work file: %s %s", self._path(work_file), " ".join(str(s) for s in change[work_file]))
+
+ def _rename(self, change: Change, affected_file: Path):  # mark a duplicate by prefixing its file name with ✓ and record the status in change
+ msg = "renamable"  # default status; replaced by "renaming" further down once the file is really renamed
+ if self.execute or self.bashify:
+ # self.queue.put((affected_file, affected_file.with_name("✓" + affected_file.name)))
+ target_path = affected_file.with_name("✓" + affected_file.name)  # same folder, file name prefixed with ✓
+ if self.execute:
+ if target_path.exists():  # an existing ✓ file is never overwritten
+ err = f"Do not rename {affected_file} because {target_path} exists."
+ if self.fail_on_error:  # fail_on_error decides between raising and a mere warning
+ raise FileExistsError(err)
+ else:
+ logger.warning(err)
+ else:
+ affected_file.rename(target_path)
+ msg = "renaming"
+ if self.bashify:
+ print(f"mv -n {_qp(affected_file)} {_qp(target_path)}") # TODO check
+ self.passed_away.add(affected_file)  # NOTE(review): presumably flags the old path as gone for later processing; confirm against the users of passed_away
+ change[affected_file].append(msg)  # add the status to this file's entry in the change record
+
+ def _delete(self, change: Change, affected_file: Path):  # remove a duplicate (unlink when execute, print an rm command when bashify) and record the status in change
+ msg = "deletable"  # default status; replaced by "deleting" when execute is set
+ if self.execute or self.bashify:
+ if self.execute:
+ affected_file.unlink()  # called without missing_ok, so an already missing file raises FileNotFoundError
+ msg = "deleting"
+ if self.bashify:
+ print(f"rm {_qp(affected_file)}") # TODO check
+ self.passed_away.add(affected_file)  # NOTE(review): presumably flags the path as gone for later processing; confirm against the users of passed_away
+ change[affected_file].append(msg)  # add the status to this file's entry in the change record
+
+ def _replace_with_original(self, change: Change, affected_file: Path, other_file: Path):  # put a copy of other_file in place of affected_file and record the status in change
+ msg = "replacable"  # default status; replaced by "replacing" when execute is set
+ if other_file.name == affected_file.name:  # same file name: the copy lands directly over the affected file
+ if self.execute:
+ msg = "replacing"
+ shutil.copy2(other_file, affected_file)  # copy2 also attempts to preserve the file metadata
+ if self.bashify:
+ print(f"cp --preserve {_qp(other_file)} {_qp(affected_file)}") # TODO check
+ else:  # names differ: copy into the affected file's folder, then remove the affected file
+ if self.execute:
+ msg = "replacing"
+ shutil.copy2(other_file, affected_file.parent)  # destination is a folder, so the copy keeps other_file's name
+ affected_file.unlink()
+ if self.bashify:
+ # TODO check
+ print(f"cp --preserve {_qp(other_file)} {_qp(affected_file.parent)} && rm {_qp(affected_file)}")
+ change[affected_file].append(msg)  # add the status to this file's entry in the change record
def _change_file_date(self, path, old_date, new_date, change: Change):
# Consider following usecase:
@@ -459,8 +487,8 @@ def _find_similar(self, work_file: Path, candidates: list[Path]):
for original in candidates:
ost, wst = original.stat(), work_file.stat()
if (self.ignore_date
- or wst.st_mtime == ost.st_mtime
- or self.tolerate_hour and self.tolerate_hour[0] <= (wst.st_mtime - ost.st_mtime)/3600 <= self.tolerate_hour[1]
+ or wst.st_mtime == ost.st_mtime
+ or self.tolerate_hour and self.tolerate_hour[0] <= (wst.st_mtime - ost.st_mtime)/3600 <= self.tolerate_hour[1]
) and (self.ignore_size or wst.st_size == ost.st_size and (not self.checksum or crc(original) == crc(work_file))):
return original