Skip to content

Commit

Permalink
feat: cli: add --image-timestamp-interval to download images uploaded in the given time interval
Browse files Browse the repository at this point in the history

close: mediawiki-client-tools#151
  • Loading branch information
yzqzss committed Aug 12, 2023
1 parent 1043567 commit 2b567a5
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 1 deletion.
7 changes: 7 additions & 0 deletions wikiteam3/dumpgenerator/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,12 @@ def getArgumentParser():
action="store_true",
help="Don't verify image size and hash while downloading. (useful for wikis with server-side image resizing)"
)
groupDownload.add_argument(
"--image-timestamp-interval",
metavar="2019-01-02T01:36:06Z/2023-08-12T10:36:06Z",
help="[BETA] Only download images uploaded in the given time interval. [format: ISO 8601 UTC interval] "
"(only works with api)",
)
groupDownload.add_argument(
"--namespaces",
metavar="1,2,3",
Expand Down Expand Up @@ -429,6 +435,7 @@ def sleep(self, response=None):
"stdout_log_path": args.stdout_log_path,
"bypass_cdn_image_compression": args.bypass_cdn_image_compression,
"disable_image_verify": args.disable_image_verify,
"image_timestamp_interval": args.image_timestamp_interval,
}

# calculating path, if not defined by user with --path=
Expand Down
22 changes: 21 additions & 1 deletion wikiteam3/dumpgenerator/dump/image/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,15 @@ def generate_image_dump(config: Config=None, other: Dict=None, images: List[List
"""Save files and descriptions using a file list\n
Deprecated: `start` is not used anymore."""

# fix use subdirectories md5
bypass_cdn_image_compression: bool = other["bypass_cdn_image_compression"]
disable_image_verify: bool = other["disable_image_verify"]
image_timestamp_interval: str = other["image_timestamp_interval"]
image_timestamp_intervals = None
if image_timestamp_interval: # 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z
image_timestamp_intervals = image_timestamp_interval.split("/")
assert len(image_timestamp_intervals) == 2
import datetime
image_timestamp_intervals = [datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ") for x in image_timestamp_intervals]

print("Retrieving images...")
images_dir = Path(config.path) / "images"
Expand Down Expand Up @@ -74,6 +80,20 @@ def check_response(r: requests.Response) -> None:
for filename_raw, original_url, uploader, size, sha1, timestamp in images:
downloaded = False

if image_timestamp_intervals:
if timestamp == NULL:
print(f" {filename_raw}|timestamp is unknown: {NULL}, downloading anyway...")
else:
if not (
image_timestamp_intervals[0]
<= datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
<= image_timestamp_intervals[1]
):
print(f" timestamp {timestamp} is not in interval {image_timestamp_interval}: {filename_raw}")
continue
else:
print(f" timestamp {timestamp} is in interval {image_timestamp_interval}: {filename_raw}")

# saving file
filename_unquoted = urllib.parse.unquote(filename_raw)
if len(filename_unquoted.encode('utf-8')) > other["filenamelimit"]:
Expand Down

0 comments on commit 2b567a5

Please sign in to comment.