diff --git a/wikiteam3/dumpgenerator/cli/cli.py b/wikiteam3/dumpgenerator/cli/cli.py index 6cac8efa..ad4bbb12 100644 --- a/wikiteam3/dumpgenerator/cli/cli.py +++ b/wikiteam3/dumpgenerator/cli/cli.py @@ -113,6 +113,12 @@ def getArgumentParser(): action="store_true", help="Don't verify image size and hash while downloading. (useful for wikis with server-side image resizing)" ) + groupDownload.add_argument( + "--image-timestamp-interval", + metavar="2019-01-02T01:36:06Z/2023-08-12T10:36:06Z", + help="[BETA] Only download images uploaded in the given time interval. [format: ISO 8601 UTC interval] " + "(only works with api)", + ) groupDownload.add_argument( "--namespaces", metavar="1,2,3", @@ -429,6 +435,7 @@ def sleep(self, response=None): "stdout_log_path": args.stdout_log_path, "bypass_cdn_image_compression": args.bypass_cdn_image_compression, "disable_image_verify": args.disable_image_verify, + "image_timestamp_interval": args.image_timestamp_interval, } # calculating path, if not defined by user with --path= diff --git a/wikiteam3/dumpgenerator/dump/image/image.py b/wikiteam3/dumpgenerator/dump/image/image.py index 4398638a..150d1be0 100644 --- a/wikiteam3/dumpgenerator/dump/image/image.py +++ b/wikiteam3/dumpgenerator/dump/image/image.py @@ -43,9 +43,15 @@ def generate_image_dump(config: Config=None, other: Dict=None, images: List[List """Save files and descriptions using a file list\n Deprecated: `start` is not used anymore.""" - # fix use subdirectories md5 bypass_cdn_image_compression: bool = other["bypass_cdn_image_compression"] disable_image_verify: bool = other["disable_image_verify"] + image_timestamp_interval: str = other["image_timestamp_interval"] + image_timestamp_intervals = None + if image_timestamp_interval: # 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z + image_timestamp_intervals = image_timestamp_interval.split("/") + assert len(image_timestamp_intervals) == 2 + import datetime + image_timestamp_intervals = [datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ") for x in image_timestamp_intervals] print("Retrieving images...") images_dir = Path(config.path) / "images" @@ -74,6 +80,20 @@ def check_response(r: requests.Response) -> None: for filename_raw, original_url, uploader, size, sha1, timestamp in images: downloaded = False + if image_timestamp_intervals: + if timestamp == NULL: + print(f" {filename_raw}|timestamp is unknown: {NULL}, downloading anyway...") + else: + if not ( + image_timestamp_intervals[0] + <= datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ") + <= image_timestamp_intervals[1] + ): + print(f" timestamp {timestamp} is not in interval {image_timestamp_interval}: {filename_raw}") + continue + else: + print(f" timestamp {timestamp} is in interval {image_timestamp_interval}: {filename_raw}") + # saving file filename_unquoted = urllib.parse.unquote(filename_raw) if len(filename_unquoted.encode('utf-8')) > other["filenamelimit"]: