diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d464164b..f1462026 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,20 +2,20 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: "23.7.0" + rev: "24.4.2" hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.282 + rev: v0.4.4 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.331 + rev: v1.1.363 hooks: - id: pyright name: pyright (system) diff --git a/CHANGELOG b/CHANGELOG index b41a7f64..18beda8c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,9 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- New `--long-description` CLI argument to set the ZIM long description +- New `--disable-metadata-checks` CLI argument to disable the metadata checks which are automated since zimscraperlib 3.x + ### Changed - Changed default publisher metadata to 'openZIM' +- Validate ZIM metadata (tags, title, description, long_description) as early as possible +- Migrate to zimscraperlib 3.3.2 +- Upgrade Python dependencies, including migration to Python 3.12 ## [2.2.0] - 2023-11-17 @@ -23,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed local path media (profile, banner) not working (#178) - Unset `metadata_from` in `youtube2zim-playlists` (#185) -- Do not move local banner and profile images, copy them instead #179 +- Do not move local banner and profile images, copy them instead #179 ## [2.1.18] - 2022-11-09 diff --git a/Dockerfile b/Dockerfile index 2fc882ae..148cec3a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.10-bookworm +FROM python:3.12-bookworm LABEL org.opencontainers.image.source 
https://github.com/openzim/youtube # Install necessary packages @@ -20,7 +20,7 @@ RUN mkdir -p /output WORKDIR /output # Copy pyproject.toml and its dependencies -COPY pyproject.toml README.md get_js_deps.sh hatch_build.py /src/ +COPY pyproject.toml README.md openzim.toml /src/ COPY src/youtube2zim/__about__.py /src/src/youtube2zim/__about__.py # Install Python dependencies diff --git a/get_js_deps.sh b/get_js_deps.sh deleted file mode 100755 index 7e3fc13f..00000000 --- a/get_js_deps.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/sh - -### -# download JS dependencies and place them in our templates/assets folder -# then launch our ogv.js script to fix dynamic loading links -### - -if ! command -v curl > /dev/null; then - echo "you need curl." - exit 1 -fi - -if ! command -v unzip > /dev/null; then - echo "you need unzip." - exit 1 -fi - -# Absolute path this script is in. -SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )" -ASSETS_PATH="${SCRIPT_PATH}/src/youtube2zim/templates/assets" - -echo "About to download JS assets to ${ASSETS_PATH}" - -echo "getting video.js" -curl -L -O https://github.com/videojs/video.js/releases/download/v8.6.1/video-js-8.6.1.zip -rm -rf $ASSETS_PATH/videojs -mkdir -p $ASSETS_PATH/videojs -unzip -o -d $ASSETS_PATH/videojs video-js-8.6.1.zip -rm -rf $ASSETS_PATH/videojs/alt $ASSETS_PATH/videojs/examples -rm -f video-js-8.6.1.zip - -echo "getting ogv.js" -curl -L -O https://github.com/brion/ogv.js/releases/download/1.8.9/ogvjs-1.8.9.zip -rm -rf $ASSETS_PATH/ogvjs -unzip -o ogvjs-1.8.9.zip -mv ogvjs-1.8.9 $ASSETS_PATH/ogvjs -rm -f ogvjs-1.8.9.zip - -echo "getting chosen.jquery.js" -curl -L -O https://github.com/harvesthq/chosen/releases/download/v1.8.7/chosen_v1.8.7.zip -rm -rf $ASSETS_PATH/chosen -mkdir -p $ASSETS_PATH/chosen -unzip -o -d $ASSETS_PATH/chosen chosen_v1.8.7.zip -rm -rf $ASSETS_PATH/chosen/docsupport $ASSETS_PATH/chosen/chosen.proto.* $ASSETS_PATH/chosen/*.html $ASSETS_PATH/chosen/*.md -rm -f chosen_v1.8.7.zip - -echo "getting 
videojs-ogvjs.js" -curl -L -o $ASSETS_PATH/videojs-ogvjs.js https://dev.kiwix.org/videojs-ogvjs/videojs-ogvjs.min.js - -echo "getting jquery.js" -curl -L -o $ASSETS_PATH/jquery.min.js https://code.jquery.com/jquery-1.12.4.min.js - -echo "getting webp-hero" -curl -L -O https://unpkg.com/webp-hero@0.0.2/dist-cjs/polyfills.js -rm -f $ASSETS_PATH/polyfills.js -mv polyfills.js $ASSETS_PATH/polyfills.js -curl -L -O https://unpkg.com/webp-hero@0.0.2/dist-cjs/webp-hero.bundle.js -rm -f $ASSETS_PATH/webp-hero.bundle.js -mv webp-hero.bundle.js $ASSETS_PATH/webp-hero.bundle.js - -if command -v fix_ogvjs_dist > /dev/null; then - echo "fixing JS files" - fix_ogvjs_dist $ASSETS_PATH "assets" -else - echo "NOT fixing JS files (zimscraperlib not installed)" -fi diff --git a/hatch_build.py b/hatch_build.py deleted file mode 100644 index cfaafdb4..00000000 --- a/hatch_build.py +++ /dev/null @@ -1,43 +0,0 @@ -import logging -import subprocess -from pathlib import Path - -from hatchling.builders.hooks.plugin.interface import BuildHookInterface - -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger(__name__) - -# update list in constants.py as well -JS_DEPS = [ - "videojs", - "ogvjs", - "chosen", - "videojs-ogvjs.js", - "jquery.min.js", - "polyfills.js", - "webp-hero.bundle.js", -] - - -class GetJsDepsHook(BuildHookInterface): - def initialize(self, version, build_data): - if self.deps_already_installed(): - logger.info("JS dependencies are already installed, skipping it") - return - Path(self.root).joinpath("src/youtube2zim/templates/assets") - subprocess.run( - str(Path(self.root).joinpath("get_js_deps.sh")), - check=True, - ) - return super().initialize(version, build_data) - - def deps_already_installed(self) -> bool: - for dep in JS_DEPS: - if ( - not Path(self.root) - .joinpath("src/youtube2zim/templates/assets") - .joinpath(dep) - .exists() - ): - return False - return True diff --git a/openzim.toml b/openzim.toml new file mode 100644 index 00000000..218f1255 
--- /dev/null +++ b/openzim.toml @@ -0,0 +1,45 @@ +[files.assets.config] +target_dir="src/youtube2zim/templates/assets" +execute_after=[ + "fix_ogvjs_dist .", +] + +[files.assets.actions."video.js"] +action="extract_all" +source="https://github.com/videojs/video.js/releases/download/v7.8.1/video-js-7.8.1.zip" +target_dir="videojs" +remove = ["alt","examples",] + +[files.assets.actions."chosen.jquery.js"] +action="extract_all" +source="https://github.com/harvesthq/chosen/releases/download/v1.8.7/chosen_v1.8.7.zip" +target_dir="chosen" +remove = ["docsupport","chosen.proto.*","*.html","*.md"] + +[files.assets.actions."jquery.min.js"] +action="get_file" +source="https://code.jquery.com/jquery-3.5.1.min.js" +target_file="jquery.min.js" + +[files.assets.actions."ogv.js"] +action="extract_items" +source="https://github.com/brion/ogv.js/releases/download/1.8.9/ogvjs-1.8.9.zip" +zip_paths=["ogvjs-1.8.9"] +target_paths=["ogvjs"] +remove = ["ogvjs/COPYING","ogvjs/*.txt","ogvjs/*.md",] + +[files.assets.actions."videojs-ogvjs.js"] +action="extract_items" +source="https://github.com/hartman/videojs-ogvjs/archive/v1.3.1.zip" +zip_paths=["videojs-ogvjs-1.3.1/dist/videojs-ogvjs.js"] +target_paths=["videojs-ogvjs.js"] + +[files.assets.actions."webp-hero.polyfills.js"] +action="get_file" +source="https://unpkg.com/webp-hero@0.0.0-dev.26/dist-cjs/polyfills.js" +target_file="polyfills.js" + +[files.assets.actions."webp-hero.bundle.js"] +action="get_file" +source="https://unpkg.com/webp-hero@0.0.0-dev.26/dist-cjs/webp-hero.bundle.js" +target_file="webp-hero.bundle.js" diff --git a/pyproject.toml b/pyproject.toml index f050c5d4..70f9fe48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,54 +1,50 @@ [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-openzim==0.2.1"] build-backend = "hatchling.build" [project] name = "youtube2zim" -authors = [{ name = "Kiwix", email = "dev@kiwix.org" }] -keywords = ["kiwix", "zim", "offline", "youtube"] -requires-python = 
">=3.10,<3.11" +requires-python = ">=3.12,<3.13" description = "Make ZIM file from a Youtube channel, user or playlist(s)" readme = "README.md" -license = { text = "GPL-3.0-or-later" } -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", -] dependencies = [ "python-slugify==3.0.3", "yt-dlp", # youtube-dl should be updated as frequently as possible - "python-dateutil==2.8.0", - "jinja2>=2.11,<3.0", + "python-dateutil==2.9.0.post0", + "jinja2==3.1.4", "MarkupSafe==2.0.1", # jinja2 dependency (https://github.com/pallets/markupsafe/issues/284) - "zimscraperlib>=2.0.0,<2.1.0", + "zimscraperlib==3.3.2", "requests==2.31.0", "kiwixstorage==0.8.3", "pif==0.8.2", ] -dynamic = ["version"] +dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.metadata.hooks.openzim-metadata] +kind = "scraper" +additional-keywords = ["youtube"] + +[tool.hatch.build.hooks.openzim-build] +dependencies = [ "zimscraperlib==3.3.2"] # required for fix_ogv_dist [project.optional-dependencies] scripts = ["invoke==2.2.0"] -lint = ["black==23.9.1", "ruff==0.0.292"] -check = ["pyright==1.1.331"] -test = ["pytest==7.4.2", "coverage==7.3.2"] +lint = ["black==24.4.2", "ruff==0.4.4"] +check = ["pyright==1.1.363"] +test = ["pytest==8.1.1", "coverage==7.4.4"] dev = [ - "pre-commit==3.4.0", - "debugpy==1.8.0", + "pre-commit==3.7.1", + "debugpy==1.8.1", "youtube2zim[scripts]", "youtube2zim[lint]", "youtube2zim[test]", "youtube2zim[check]", - # hatchling is a dev dependency only needed for hook development on developer machine - "hatchling==1.18.0", "humanfriendly==10.0" ] -[project.urls] -Homepage = "https://github.com/openzim/youtube" -Donate = "https://www.kiwix.org/en/support-us/" - [project.scripts] youtube2zim = "youtube2zim.__main__:main" youtube2zim-playlists = 
"youtube2zim.playlists.__main__:main" @@ -59,12 +55,11 @@ path = "src/youtube2zim/__about__.py" [tool.hatch.build] exclude = ["/.github"] -[tool.hatch.build.hooks.custom] -path = "hatch_build.py" -dependencies = ["zimscraperlib>=2.0.0,<2.1.0"] - [tool.hatch.build.targets.wheel] packages = ["src/youtube2zim"] +artifacts = [ + "src/youtube2zim/templates/assets/**", +] [tool.hatch.envs.default] features = ["dev"] @@ -81,7 +76,7 @@ html = "inv coverage --html --args '{args}'" [tool.hatch.envs.lint] template = "lint" -python = "py311" +python = "py312" skip-install = false features = ["scripts", "lint"] @@ -102,13 +97,15 @@ all = "inv checkall --args '{args}'" [tool.black] line-length = 88 -target-version = ['py311'] +target-version = ['py312'] exclude = "(src/youtube2zim/templates/.*|.hatch/.*)" [tool.ruff] -target-version = "py311" +target-version = "py312" line-length = 88 src = ["src"] + +[tool.ruff.lint] select = [ "A", # flake8-builtins # "ANN", # flake8-annotations @@ -193,13 +190,13 @@ unfixable = [ "F401", ] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["youtube2zim"] -[tool.ruff.flake8-tidy-imports] +[tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] @@ -225,5 +222,6 @@ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**", "src/youtube2zim/templates", ".hatch"] extraPaths = ["src"] -pythonVersion = "3.10" +pythonVersion = "3.12" typeCheckingMode = "basic" +disableBytesTypePromotions = true diff --git a/src/youtube2zim/entrypoint.py b/src/youtube2zim/entrypoint.py index cf3743e6..63c368a9 100755 --- a/src/youtube2zim/entrypoint.py +++ b/src/youtube2zim/entrypoint.py @@ -5,14 +5,7 @@ import logging import sys -from youtube2zim.constants import ( - CHANNEL, - NAME, 
- PLAYLIST, - SCRAPER, - USER, - logger, -) +from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger from youtube2zim.scraper import Youtube2Zim @@ -123,6 +116,11 @@ def main(): "Default to Channel name (of first video if playlists)", ) + parser.add_argument( + "--long-description", + help="Custom long description for your ZIM.", + ) + parser.add_argument( "--creator", help="Name of content creator. Defaults to Channel name or “Youtue Channels”", @@ -191,6 +189,13 @@ def main(): version=SCRAPER, ) + parser.add_argument( + "--disable-metadata-checks", + help="Disable validity checks of metadata according to openZIM conventions", + action="store_true", + default=False, + ) + parser.add_argument( "--dateafter", help="Custom filter to download videos uploaded on or after specified date. " diff --git a/src/youtube2zim/playlists/entrypoint.py b/src/youtube2zim/playlists/entrypoint.py index bf2a9972..056f4b81 100644 --- a/src/youtube2zim/playlists/entrypoint.py +++ b/src/youtube2zim/playlists/entrypoint.py @@ -72,6 +72,14 @@ def main(): version=SCRAPER, ) + parser.add_argument( + "--disable-metadata-checks", + help="Disable validity checks of metadata according to openZIM conventions", + action="store_true", + default=False, + dest="disable_metadata_checks", + ) + args, extra_args = parser.parse_known_args() # prevent setting --title and --description diff --git a/src/youtube2zim/playlists/scraper.py b/src/youtube2zim/playlists/scraper.py index fb9ca322..aeab210f 100644 --- a/src/youtube2zim/playlists/scraper.py +++ b/src/youtube2zim/playlists/scraper.py @@ -38,6 +38,7 @@ def __init__( # extract values from options self.api_key = options["api_key"] self.debug = options["debug"] + self.disable_metadata_checks = options["disable_metadata_checks"] self.playlists_mode = options["playlists_mode"] self.collection_type = options["collection_type"] self.youtube_id = options["youtube_id"] @@ -137,6 +138,9 @@ def run_playlist_zim(self, playlist): if 
self.debug: args.append("--debug") + if self.disable_metadata_checks: + args.append("--disable-metadata-checks") + # set metadata args for playlist metadata = self.metadata.get(playlist_id, {}) for key in ( @@ -185,6 +189,10 @@ def handle_single_zim(self): ] if self.debug: args.append("--debug") + + if self.disable_metadata_checks: + args.append("--disable-metadata-checks") + return subprocess.run(args).returncode # noqa: PLW1510 @staticmethod diff --git a/src/youtube2zim/processing.py b/src/youtube2zim/processing.py index 9dd7d368..b600518a 100644 --- a/src/youtube2zim/processing.py +++ b/src/youtube2zim/processing.py @@ -58,6 +58,15 @@ def post_process_video(video_dir, video_id, preset, video_format, low_quality): dst_path = src_path.with_name(f"video.{video_format}") logger.info(f"Reencode video to {dst_path}") - reencode( - src_path, dst_path, preset.to_ffmpeg_args(), delete_src=True, failsafe=False - ) + success, process = reencode( + src_path, + dst_path, + preset.to_ffmpeg_args(), + delete_src=True, + with_process=True, + failsafe=True, + ) # pyright: ignore[reportGeneralTypeIssues] + if not success: + if process: + logger.error(process.stdout) + raise Exception(f"Exception while re-encoding {src_path} for {video_id}") diff --git a/src/youtube2zim/scraper.py b/src/youtube2zim/scraper.py index 248b316a..d884664f 100644 --- a/src/youtube2zim/scraper.py +++ b/src/youtube2zim/scraper.py @@ -29,11 +29,19 @@ from zimscraperlib.download import stream_file from zimscraperlib.fix_ogvjs_dist import fix_source_dir from zimscraperlib.i18n import NotFound, get_language_details, setlocale +from zimscraperlib.image.convertion import convert_image from zimscraperlib.image.presets import WebpHigh from zimscraperlib.image.probing import get_colors, is_hex_color from zimscraperlib.image.transformation import resize_image +from zimscraperlib.inputs import compute_descriptions from zimscraperlib.video.presets import VideoMp4Low, VideoWebmLow from zimscraperlib.zim import 
make_zim_file +from zimscraperlib.zim.metadata import ( + validate_description, + validate_longdescription, + validate_tags, + validate_title, +) from youtube2zim.constants import ( CHANNEL, @@ -93,8 +101,10 @@ def __init__( use_any_optimized_version, s3_url_with_credentials, publisher, + disable_metadata_checks, title=None, description=None, + long_description=None, creator=None, name=None, profile_image=None, @@ -121,6 +131,7 @@ def __init__( self.tags = [t.strip() for t in tags.split(",")] self.title = title self.description = description + self.long_description = long_description self.creator = creator self.publisher = publisher self.name = name @@ -128,6 +139,18 @@ def __init__( self.banner_image = banner_image self.main_color = main_color self.secondary_color = secondary_color + self.disable_metadata_checks = disable_metadata_checks + + if not self.disable_metadata_checks: + # Validate ZIM metadata early so that we do not waste time doing operations + # for a scraper which will fail anyway in the end + validate_tags("Tags", self.tags) + if self.title: + validate_title("Title", self.title) + if self.description: + validate_description("Description", self.description) + if self.long_description: + validate_longdescription("LongDescription", self.long_description) # directory setup self.output_dir = Path(output_dir).expanduser().resolve() @@ -349,14 +372,16 @@ def run(self): fpath=self.output_dir / self.fname, name=self.name, main_page="home.html", - favicon="favicon.jpg", + illustration="favicon.png", title=self.title, description=self.description, + long_description=self.long_description, # pyright: ignore[reportArgumentType] language=self.language, creator=self.creator, publisher=self.publisher, tags=self.tags, scraper=SCRAPER, + disable_metadata_checks=self.disable_metadata_checks, ) if not self.keep_build_dir: @@ -796,9 +821,13 @@ def update_metadata(self): clean_text(self.playlists[0].description) if self.is_playlist and len(self.playlists) == 1 else 
clean_text(main_channel_json["snippet"]["description"]) - ) + ) or "-" self.title = self.title or auto_title or "-" - self.description = self.description or auto_description or "-" + self.description, self.long_description = compute_descriptions( + default_description=auto_description, + user_description=self.description, + user_long_description=self.long_description, + ) if self.creator is None: if self.is_single_channel: @@ -830,12 +859,16 @@ def update_metadata(self): self.main_color = self.main_color or profile_main self.secondary_color = self.secondary_color or profile_secondary + # convert profile image to png for favicon + png_profile_path = self.build_dir.joinpath("profile.png") + convert_image(self.profile_path, png_profile_path) + resize_image( - self.profile_path, + png_profile_path, width=48, height=48, method="thumbnail", - dst=self.build_dir.joinpath("favicon.jpg"), + dst=self.build_dir.joinpath("favicon.png"), ) def make_html_files(self, actual_videos_ids):