From 4b727454df966dd2eeea0534daaa328697f70c4f Mon Sep 17 00:00:00 2001 From: Dan Niles <56271899+dan-niles@users.noreply.github.com> Date: Tue, 4 Jun 2024 09:09:16 +0530 Subject: [PATCH 1/3] Update scraper to generate JSON files for `zimui` --- CHANGELOG | 2 + scraper/pyproject.toml | 4 +- scraper/src/youtube2zim/schemas.py | 94 +++++++++ scraper/src/youtube2zim/scraper.py | 302 +++++++++++++++++++++++++---- scraper/src/youtube2zim/youtube.py | 15 +- zimui/index.html | 4 +- zimui/public/favicon.ico | Bin 4286 -> 0 bytes 7 files changed, 383 insertions(+), 38 deletions(-) create mode 100644 scraper/src/youtube2zim/schemas.py delete mode 100644 zimui/public/favicon.ico diff --git a/CHANGELOG b/CHANGELOG index 9f9e905a..4cd1b28e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Move scraper files to `scraper` subfolder and update workflows - Bump `requests` package from 2.32.0 to 2.32.2 - Initialize new Vue.js project in `zimui` subfolder +- Update dependencies in pyproject.toml (pydantic, pyhumps, python-slugify) +- Update scraper to generate JSON files for `zimui` (#212) ## [2.3.0] - 2024-05-22 diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index 95714248..8d282add 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -8,7 +8,7 @@ requires-python = ">=3.12,<3.13" description = "Make ZIM file from a Youtube channel, user or playlist(s)" readme = "../README.md" dependencies = [ - "python-slugify==3.0.3", + "python-slugify==8.0.4", "yt-dlp", # youtube-dl should be updated as frequently as possible "python-dateutil==2.9.0.post0", "jinja2==3.1.4", @@ -16,6 +16,8 @@ dependencies = [ "requests==2.32.2", "kiwixstorage==0.8.3", "pif==0.8.2", + "pydantic==2.7.2", + "pyhumps==3.8.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/scraper/src/youtube2zim/schemas.py b/scraper/src/youtube2zim/schemas.py new file mode 
100644 index 00000000..b50547da --- /dev/null +++ b/scraper/src/youtube2zim/schemas.py @@ -0,0 +1,94 @@ +from humps import camelize +from pydantic import BaseModel + + +class CamelModel(BaseModel): + """Model to transform Python snake_case into JSON camelCase.""" + + class Config: + alias_generator = camelize + populate_by_name = True + + +class Author(CamelModel): + channel_id: str + channel_title: str + profile_path: str | None = None + banner_path: str | None = None + + +class Subtitle(CamelModel): + """Class to serialize data about a YouTube video subtitle.""" + + code: str + name: str + + +class Video(CamelModel): + """Class to serialize data about a YouTube video.""" + + id: str + title: str + description: str + author: Author + publication_date: str + video_path: str + thumbnail_path: str | None = None + subtitle_path: str | None = None + subtitle_list: list[Subtitle] + duration: str + + +class VideoPreview(CamelModel): + """Class to serialize data about a YouTube video for preview.""" + + slug: str + id: str + title: str + thumbnail_path: str | None = None + duration: str + + +class Playlist(CamelModel): + """Class to serialize data about a YouTube playlist.""" + + id: str + author: Author + title: str + description: str + publication_date: str + thumbnail_path: str | None = None + videos: list[VideoPreview] + videos_count: int + + +class PlaylistPreview(CamelModel): + """Class to serialize data about a YouTube playlist for preview.""" + + slug: str + id: str + title: str + thumbnail_path: str | None = None + videos_count: int + main_video_slug: str + + +class Playlists(CamelModel): + """Class to serialize data about a list of YouTube playlists.""" + + playlists: list[PlaylistPreview] + + +class Channel(CamelModel): + """Class to serialize data about a YouTube channel.""" + + id: str + title: str + description: str + channel_name: str + channel_description: str + profile_path: str | None = None + banner_path: str | None = None + joined_date: str + 
collection_type: str + main_playlist: str | None = None diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py index ba1e3329..3d844512 100644 --- a/scraper/src/youtube2zim/scraper.py +++ b/scraper/src/youtube2zim/scraper.py @@ -36,6 +36,7 @@ from zimscraperlib.inputs import compute_descriptions from zimscraperlib.video.presets import VideoMp4Low, VideoWebmLow from zimscraperlib.zim import Creator +from zimscraperlib.zim.filesystem import FileItem from zimscraperlib.zim.metadata import ( validate_description, validate_longdescription, @@ -54,6 +55,16 @@ logger, ) from youtube2zim.processing import post_process_video, process_thumbnail +from youtube2zim.schemas import ( + Author, + Channel, + Playlist, + PlaylistPreview, + Playlists, + Subtitle, + Video, + VideoPreview, +) from youtube2zim.utils import ( clean_text, get_slug, @@ -66,7 +77,7 @@ credentials_ok, extract_playlists_details_from, get_channel_json, - # get_videos_authors_info, + get_videos_authors_info, get_videos_json, save_channel_branding, skip_deleted_videos, @@ -317,43 +328,37 @@ def run(self): ) logger.info(f"{nb_videos_msg}.") - # Commented out for now, - # but we have to rework this part to work with the new vuejs zimui - # download videos (and recompress) - # logger.info( - # "downloading all videos, subtitles and thumbnails " - # f"(concurrency={self.max_concurrency})" - # ) - # logger.info(f" format: {self.video_format}") - # logger.info(f" quality: {self.video_quality}") - # logger.info(f" generated-subtitles: {self.all_subtitles}") - # if self.s3_storage: - # logger.info( - # f" using cache: {self.s3_storage.url.netloc} " - # f"with bucket: {self.s3_storage.bucket_name}" - # ) - # succeeded, failed = self.download_video_files( - # max_concurrency=self.max_concurrency - # ) - # if failed: - # logger.error(f"{len(failed)} video(s) failed to download: {failed}") - # if len(failed) >= len(succeeded): - # logger.critical("More than half of videos failed. 
exiting") - # raise OSError("Too much videos failed to download") - - # logger.info("retrieve channel-info for all videos (author details)") - # get_videos_authors_info(succeeded) - - # logger.info("download all author's profile pictures") - # self.download_authors_branding() + logger.info( + "downloading all videos, subtitles and thumbnails " + f"(concurrency={self.max_concurrency})" + ) + logger.info(f" format: {self.video_format}") + logger.info(f" quality: {self.video_quality}") + logger.info(f" generated-subtitles: {self.all_subtitles}") + if self.s3_storage: + logger.info( + f" using cache: {self.s3_storage.url.netloc} " + f"with bucket: {self.s3_storage.bucket_name}" + ) + succeeded, failed = self.download_video_files( + max_concurrency=self.max_concurrency + ) + if failed: + logger.error(f"{len(failed)} video(s) failed to download: {failed}") + if len(failed) >= len(succeeded): + logger.critical("More than half of videos failed. exiting") + raise OSError("Too much videos failed to download") + + logger.info("retrieve channel-info for all videos (author details)") + get_videos_authors_info(succeeded) + + logger.info("download all author's profile pictures") + self.download_authors_branding() logger.info("update general metadata") self.update_metadata() - # logger.info("creating HTML files") - # self.make_html_files(succeeded) - # make zim file os.makedirs(self.output_dir, exist_ok=True) if not self.no_zim: @@ -412,6 +417,12 @@ def run(self): logger.debug(f"Preparing zimfile at {self.zim_file.filename}") logger.debug(f"Recursively adding files from {self.build_dir}") self.add_zimui() + + logger.info("creating JSON files") + self.make_json_files(succeeded) + + logger.info("Adding files to ZIM") + self.add_files_to_zim(self.build_dir, self.zim_file) except KeyboardInterrupt: self.zim_file.can_finish = False logger.error("KeyboardInterrupt, exiting.") @@ -923,6 +934,7 @@ def update_metadata(self): method="thumbnail", dst=self.build_dir.joinpath("favicon.png"), ) 
+ png_profile_path.unlink() def make_html_files(self, actual_videos_ids): """make up HTML structure to read the content @@ -1108,3 +1120,227 @@ def to_data_js(video): # clean videos left out in videos directory remove_unused_videos(videos) + + def make_json_files(self, actual_videos_ids): + """Generate JSON files to be consumed by the frontend""" + + def remove_unused_videos(videos): + video_ids = [video["contentDetails"]["videoId"] for video in videos] + for path in self.videos_dir.iterdir(): + if path.is_dir() and path.name not in video_ids: + logger.debug(f"Removing unused video {path.name}") + shutil.rmtree(path, ignore_errors=True) + + def is_present(video): + """whether this video has actually been successfully downloaded""" + return video["contentDetails"]["videoId"] in actual_videos_ids + + def video_has_channel(videos_channels, video): + return video["contentDetails"]["videoId"] in videos_channels + + def get_thumbnail_path(video_id): + return f"videos/{video_id}/video.webp" + + def get_subtitles(video_id) -> list[Subtitle]: + video_dir = self.videos_dir.joinpath(video_id) + languages = [ + x.stem.split(".")[1] + for x in video_dir.iterdir() + if x.is_file() and x.name.endswith(".vtt") + ] + + def to_subtitle_object(lang): + try: + try: + subtitle = get_language_details( + YOUTUBE_LANG_MAP.get(lang, lang) + ) + except NotFound: + lang_simpl = re.sub(r"^([a-z]{2})-.+$", r"\1", lang) + subtitle = get_language_details( + YOUTUBE_LANG_MAP.get(lang_simpl, lang_simpl) + ) + except Exception: + logger.error(f"Failed to get language details for {lang}") + raise + return Subtitle( + code=lang, + name=f"{subtitle['english'].title()} - {subtitle['query']}", + ) + + # Youtube.com sorts subtitles by English name + return sorted(map(to_subtitle_object, languages), key=lambda x: x.name) + + def get_videos_list(playlist): + videos = load_mandatory_json( + self.cache_dir, f"playlist_{playlist.playlist_id}_videos" + ) + videos = list(filter(skip_deleted_videos, videos)) + 
videos = list(filter(is_present, videos)) + videos = list(filter(has_channel, videos)) + videos = sorted(videos, key=lambda v: v["snippet"]["position"]) + return videos + + def generate_video_object(video) -> Video: + video_id = video["contentDetails"]["videoId"] + author = videos_channels[video_id] + subtitles_list = get_subtitles(video_id) + return Video( + id=video_id, + title=video["snippet"]["title"], + description=video["snippet"]["description"], + author=Author( + channel_id=author["channelId"], + channel_title=author["channelTitle"], + profile_path=f"channels/{author['channelId']}/profile.jpg", + banner_path=f"channels/{author['channelId']}/banner.jpg", + ), + publication_date=video["contentDetails"]["videoPublishedAt"], + video_path=f"videos/{video_id}/video.{self.video_format}", + thumbnail_path=get_thumbnail_path(video_id), + subtitle_path=f"videos/{video_id}" if len(subtitles_list) > 0 else None, + subtitle_list=subtitles_list, + duration=videos_channels[video_id]["duration"], + ) + + def generate_video_preview_object(video) -> VideoPreview: + video_id = video["contentDetails"]["videoId"] + return VideoPreview( + slug=get_video_slug(video), + id=video_id, + title=video["snippet"]["title"], + thumbnail_path=get_thumbnail_path(video_id), + duration=videos_channels[video_id]["duration"], + ) + + def get_video_slug(video) -> str: + title = video["snippet"]["title"] + video_id = video["contentDetails"]["videoId"] + return f"{get_slug(title)}-{video_id[:4]}" + + def generate_playlist_object(playlist) -> Playlist: + videos = get_videos_list(playlist) + return Playlist( + id=playlist.playlist_id, + title=playlist.title, + description=playlist.description, + videos=[generate_video_preview_object(video) for video in videos], + publication_date=playlist.published_at, + author=Author( + channel_id=playlist.creator_id, + channel_title=playlist.creator_name, + profile_path=f"channels/{playlist.creator_id}/profile.jpg", + 
banner_path=f"channels/{playlist.creator_id}/banner.jpg", + ), + videos_count=len(videos), + thumbnail_path=get_thumbnail_path( + videos[0]["contentDetails"]["videoId"] + ), + ) + + def generate_playlist_preview_object(playlist) -> PlaylistPreview: + videos = get_videos_list(playlist) + return PlaylistPreview( + slug=get_playlist_slug(playlist), + id=playlist.playlist_id, + title=playlist.title, + thumbnail_path=get_thumbnail_path( + videos[0]["contentDetails"]["videoId"] + ), + videos_count=len(videos), + main_video_slug=get_video_slug(videos[0]), + ) + + def get_playlist_slug(playlist) -> str: + return f"{get_slug(playlist.title)}-{playlist.playlist_id[-4:]}" + + videos = load_mandatory_json(self.cache_dir, "videos").values() + # filter videos so we only include the ones we could retrieve + videos = list(filter(is_present, videos)) + videos_channels = load_mandatory_json(self.cache_dir, "videos_channels") + has_channel = functools.partial(video_has_channel, videos_channels) + # filter videos to exclude those for which we have no channel (#76) + videos = list(filter(has_channel, videos)) + for video in videos: + slug = get_video_slug(video) + self.zim_file.add_item_for( + path=f"videos/{slug}.json", + title=slug, + content=generate_video_object(video).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + + # write playlists JSON files + playlist_list = [] + + main_playlist_slug = None + if len(self.playlists) > 0: + main_playlist_slug = get_playlist_slug( + self.playlists[0] + ) # set first playlist as main playlist + + for playlist in self.playlists: + playlist_slug = get_playlist_slug(playlist) + playlist_path = f"playlists/{playlist_slug}.json" + + if playlist.playlist_id != self.uploads_playlist_id: + playlist_list.append(generate_playlist_preview_object(playlist)) + else: + main_playlist_slug = ( + playlist_slug # set uploads playlist as main playlist + ) + + self.zim_file.add_item_for( + path=playlist_path, + 
title=playlist.title, + content=generate_playlist_object(playlist).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + + # write playlists.json file + self.zim_file.add_item_for( + path="playlists.json", + title="Playlists", + content=Playlists(playlists=playlist_list).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + + # write channel.json file + channel_data = get_channel_json(self.main_channel_id) + self.zim_file.add_item_for( + path="channel.json", + title=self.title, + content=Channel( + id=str(self.main_channel_id), + title=str(self.title), + description=str(self.description), + channel_name=channel_data["snippet"]["title"], + channel_description=channel_data["snippet"]["description"], + profile_path="profile.jpg", + banner_path="banner.jpg", + collection_type=self.collection_type, + main_playlist=main_playlist_slug, + joined_date=channel_data["snippet"]["publishedAt"], + ).model_dump_json(by_alias=True, indent=2), + mimetype="application/json", + is_front=False, + ) + + # clean videos left out in videos directory + remove_unused_videos(videos) + + def add_files_to_zim(self, dir_path: Path, zim_file: Creator): + """recursively add a path to a zim file""" + for file_path in filter( + lambda file_path: file_path.is_file(), dir_path.rglob("*") + ): + zim_file.add_item(FileItem(dir_path, file_path)) diff --git a/scraper/src/youtube2zim/youtube.py b/scraper/src/youtube2zim/youtube.py index 2725e14c..f9cd6909 100644 --- a/scraper/src/youtube2zim/youtube.py +++ b/scraper/src/youtube2zim/youtube.py @@ -24,12 +24,21 @@ class Playlist: - def __init__(self, playlist_id, title, description, creator_id, creator_name): + def __init__( + self, + playlist_id, + title, + description, + creator_id, + creator_name, + published_at=None, + ): self.playlist_id = playlist_id self.title = title self.description = description self.creator_id = creator_id self.creator_name = 
creator_name + self.published_at = published_at self.slug = get_slug(title, js_safe=True) @classmethod @@ -41,6 +50,7 @@ def from_id(cls, playlist_id): description=playlist_json["snippet"]["description"], creator_id=playlist_json["snippet"]["channelId"], creator_name=playlist_json["snippet"]["channelTitle"], + published_at=playlist_json["snippet"]["publishedAt"], ) def to_dict(self): @@ -221,7 +231,7 @@ def retrieve_videos_for(videos_ids): VIDEOS_API, params={ "id": ",".join(videos_ids), - "part": "snippet", + "part": "snippet,contentDetails", "key": YOUTUBE.api_key, "maxResults": RESULTS_PER_PAGE, "pageToken": page_token, @@ -238,6 +248,7 @@ def retrieve_videos_for(videos_ids): item["id"]: { "channelId": item["snippet"]["channelId"], "channelTitle": item["snippet"]["channelTitle"], + "duration": item["contentDetails"]["duration"], } } ) diff --git a/zimui/index.html b/zimui/index.html index cd531825..22929b5e 100644 --- a/zimui/index.html +++ b/zimui/index.html @@ -2,13 +2,13 @@ - + YouTube in a ZIM
- + diff --git a/zimui/public/favicon.ico b/zimui/public/favicon.ico deleted file mode 100644 index df36fcfb72584e00488330b560ebcf34a41c64c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4286 zcmds*O-Phc6o&64GDVCEQHxsW(p4>LW*W<827=Unuo8sGpRux(DN@jWP-e29Wl%wj zY84_aq9}^Am9-cWTD5GGEo#+5Fi2wX_P*bo+xO!)p*7B;iKlbFd(U~_d(U?#hLj56 zPhFkj-|A6~Qk#@g^#D^U0XT1cu=c-vu1+SElX9NR;kzAUV(q0|dl0|%h|dI$%VICy zJnu2^L*Te9JrJMGh%-P79CL0}dq92RGU6gI{v2~|)p}sG5x0U*z<8U;Ij*hB9z?ei z@g6Xq-pDoPl=MANPiR7%172VA%r)kevtV-_5H*QJKFmd;8yA$98zCxBZYXTNZ#QFk2(TX0;Y2dt&WitL#$96|gJY=3xX zpCoi|YNzgO3R`f@IiEeSmKrPSf#h#Qd<$%Ej^RIeeYfsxhPMOG`S`Pz8q``=511zm zAm)MX5AV^5xIWPyEu7u>qYs?pn$I4nL9J!=K=SGlKLXpE<5x+2cDTXq?brj?n6sp= zphe9;_JHf40^9~}9i08r{XM$7HB!`{Ys~TK0kx<}ZQng`UPvH*11|q7&l9?@FQz;8 zx!=3<4seY*%=OlbCbcae?5^V_}*K>Uo6ZWV8mTyE^B=DKy7-sdLYkR5Z?paTgK-zyIkKjIcpyO z{+uIt&YSa_$QnN_@t~L014dyK(fOOo+W*MIxbA6Ndgr=Y!f#Tokqv}n<7-9qfHkc3 z=>a|HWqcX8fzQCT=dqVbogRq!-S>H%yA{1w#2Pn;=e>JiEj7Hl;zdt-2f+j2%DeVD zsW0Ab)ZK@0cIW%W7z}H{&~yGhn~D;aiP4=;m-HCo`BEI+Kd6 z={Xwx{TKxD#iCLfl2vQGDitKtN>z|-AdCN|$jTFDg0m3O`WLD4_s#$S From 99817858463334030f4807a0ec5f975ac63189c8 Mon Sep 17 00:00:00 2001 From: Dan Niles <56271899+dan-niles@users.noreply.github.com> Date: Tue, 11 Jun 2024 13:57:31 +0530 Subject: [PATCH 2/3] Remove old jinja template files and method in scraper.py --- CHANGELOG | 1 + scraper/src/youtube2zim/scraper.py | 190 ------------------ .../src/youtube2zim/templates/article.html | 44 ---- scraper/src/youtube2zim/templates/home.html | 63 ------ scraper/src/youtube2zim/utils.py | 9 - 5 files changed, 1 insertion(+), 306 deletions(-) delete mode 100644 scraper/src/youtube2zim/templates/article.html delete mode 100644 scraper/src/youtube2zim/templates/home.html diff --git a/CHANGELOG b/CHANGELOG index 4cd1b28e..11f68e4f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Initialize new Vue.js 
project in `zimui` subfolder - Update dependencies in pyproject.toml (pydantic, pyhumps, python-slugify) - Update scraper to generate JSON files for `zimui` (#212) +- Remove old UI files and methods: template files (home.html, article.html) and `make_html_files` method in scraper.py ## [2.3.0] - 2024-05-22 diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py index 3d844512..2c61f9cc 100644 --- a/scraper/src/youtube2zim/scraper.py +++ b/scraper/src/youtube2zim/scraper.py @@ -10,7 +10,6 @@ import concurrent.futures import datetime import functools -import json import locale import os import re @@ -20,10 +19,7 @@ from gettext import gettext as _ from pathlib import Path -import jinja2 import yt_dlp -from babel.dates import format_date -from dateutil import parser as dt_parser from kiwixstorage import KiwixStorage from pif import get_public_ip from zimscraperlib.download import stream_file @@ -70,7 +66,6 @@ get_slug, load_json, load_mandatory_json, - render_template, save_json, ) from youtube2zim.youtube import ( @@ -936,191 +931,6 @@ def update_metadata(self): ) png_profile_path.unlink() - def make_html_files(self, actual_videos_ids): - """make up HTML structure to read the content - - /home.html Homepage - - for each video: - - .html HTML article - - videos//video. 
video file - - videos//video..vtt subtititle(s) - - videos//video.webp template - """ - - def remove_unused_videos(videos): - video_ids = [video["contentDetails"]["videoId"] for video in videos] - for path in self.videos_dir.iterdir(): - if path.is_dir() and path.name not in video_ids: - logger.debug(f"Removing unused video {path.name}") - shutil.rmtree(path, ignore_errors=True) - - def is_present(video): - """whether this video has actually been succeffuly downloaded""" - return video["contentDetails"]["videoId"] in actual_videos_ids - - def video_has_channel(videos_channels, video): - return video["contentDetails"]["videoId"] in videos_channels - - def get_subtitles(video_id): - video_dir = self.videos_dir.joinpath(video_id) - languages = [ - x.stem.split(".")[1] - for x in video_dir.iterdir() - if x.is_file() and x.name.endswith(".vtt") - ] - - def to_jinja_subtitle(lang): - try: - try: - subtitle = get_language_details( - YOUTUBE_LANG_MAP.get(lang, lang) - ) - except NotFound: - lang_simpl = re.sub(r"^([a-z]{2})-.+$", r"\1", lang) - subtitle = get_language_details( - YOUTUBE_LANG_MAP.get(lang_simpl, lang_simpl) - ) - except Exception: - logger.error(f"Failed to get language details for {lang}") - raise - return { - "code": lang, - # Youtube.com uses `English - code` format. 
- # Note: videojs displays it lowercased anyway - "name": f"{subtitle['english'].title()} - {subtitle['query']}", - } - - # Youtube.com sorts subtitles by English name - return sorted(map(to_jinja_subtitle, languages), key=lambda x: x["name"]) - - env = jinja2.Environment( - loader=jinja2.FileSystemLoader(str(self.templates_dir)), autoescape=True - ) - - videos = load_mandatory_json(self.cache_dir, "videos").values() - # filter videos so we only include the ones we could retrieve - videos = list(filter(is_present, videos)) - videos_channels = load_mandatory_json(self.cache_dir, "videos_channels") - has_channel = functools.partial(video_has_channel, videos_channels) - # filter videos to exclude those for which we have no channel (#76) - videos = list(filter(has_channel, videos)) - for video in videos: - video_id = video["contentDetails"]["videoId"] - title = video["snippet"]["title"] - slug = get_slug(title) - description = video["snippet"]["description"] - publication_date = dt_parser.parse( - video["contentDetails"]["videoPublishedAt"] - ) - author = videos_channels[video_id] - subtitles = get_subtitles(video_id) - video_url = f"https://www.youtube.com/watch?v={video_id}" - - html = render_template( - env=env, - template_name="article.html", - video_id=video_id, - video_format=self.video_format, - author=author, - title=title, - description=description, - date=format_date(publication_date, format="medium", locale=self.locale), - subtitles=subtitles, - url=video_url, - channel_id=video["snippet"]["channelId"], - color=self.main_color, - background_color=self.secondary_color, - autoplay=self.autoplay, - ) - with open( - self.build_dir.joinpath(f"{slug}.html"), "w", encoding="utf-8" - ) as fp: - fp.write(html) - - # build homepage - html = render_template( - env=env, - template_name="home.html", - playlists=self.playlists, - video_format=self.video_format, - title=self.title, - description=self.description, - color=self.main_color, - 
background_color=self.secondary_color, - page_label=_("Page {current}/{total}"), - back_label=_("Back to top"), - ) - with open(self.build_dir.joinpath("home.html"), "w", encoding="utf-8") as fp: - fp.write(html) - - # rewrite app.js including `format` - with open(self.assets_dir.joinpath("app.js"), "w", encoding="utf-8") as fp: - fp.write( - render_template( - env=env, - template_name="assets/app.js", - video_format=self.video_format, - ) - ) - - # rewrite app.js including `pagination` - with open(self.assets_dir.joinpath("db.js"), "w", encoding="utf-8") as fp: - fp.write( - render_template( - env=env, - template_name="assets/db.js", - NB_VIDEOS_PER_PAGE=self.nb_videos_per_page, - ) - ) - - # write list of videos in data.js - def to_data_js(video): - return { - "id": video["contentDetails"]["videoId"], - "title": video["snippet"]["title"], - "slug": get_slug(video["snippet"]["title"]), - "description": video["snippet"]["description"], - "subtitles": get_subtitles(video["contentDetails"]["videoId"]), - "thumbnail": str( - Path("videos").joinpath( - video["contentDetails"]["videoId"], "video.webp" - ) - ), - } - - with open(self.assets_dir.joinpath("data.js"), "w", encoding="utf-8") as fp: - # write all playlists as they are - for playlist in self.playlists: - # retrieve list of videos for PL - playlist_videos = load_mandatory_json( - self.cache_dir, f"playlist_{playlist.playlist_id}_videos" - ) - # filtering-out missing ones (deleted or not downloaded) - playlist_videos = list(filter(skip_deleted_videos, playlist_videos)) - playlist_videos = list(filter(is_present, playlist_videos)) - playlist_videos = list(filter(has_channel, playlist_videos)) - # sorting them based on playlist - playlist_videos.sort(key=lambda v: v["snippet"]["position"]) - - fp.write( - "var json_{slug} = {json_str};\n".format( - slug=playlist.slug, - json_str=json.dumps( - list(map(to_data_js, playlist_videos)), indent=4 - ), - ) - ) - - # write a metadata.json file with some content-related 
data - with open( - self.build_dir.joinpath("metadata.json"), "w", encoding="utf-8" - ) as fp: - json.dump({"video_format": self.video_format}, fp, indent=4) - - # clean videos left out in videos directory - remove_unused_videos(videos) - def make_json_files(self, actual_videos_ids): """Generate JSON files to be consumed by the frontend""" diff --git a/scraper/src/youtube2zim/templates/article.html b/scraper/src/youtube2zim/templates/article.html deleted file mode 100644 index cd9a70a7..00000000 --- a/scraper/src/youtube2zim/templates/article.html +++ /dev/null @@ -1,44 +0,0 @@ - - {{ title }} - - - - - - - - - - - - - - - -
- -

{{ author.channelTitle }}

-

{{ title }}

- -
-

{{ description }}

-
-
{{ date }}
-
- - - - diff --git a/scraper/src/youtube2zim/templates/home.html b/scraper/src/youtube2zim/templates/home.html deleted file mode 100644 index cdeeb717..00000000 --- a/scraper/src/youtube2zim/templates/home.html +++ /dev/null @@ -1,63 +0,0 @@ - - {{ title }} - - - - - - - - - - - - - - - - -
- -

{{ title }}

-
-
-
-
- - - - -
- -
-
-
-
-
    -
    -
    {{ back_label }}
    - -
    - - - - - - - - diff --git a/scraper/src/youtube2zim/utils.py b/scraper/src/youtube2zim/utils.py index 427fd110..3398e17f 100644 --- a/scraper/src/youtube2zim/utils.py +++ b/scraper/src/youtube2zim/utils.py @@ -4,7 +4,6 @@ import json from pathlib import Path -import jinja2 from slugify import slugify @@ -45,11 +44,3 @@ def load_mandatory_json(cache_dir: Path, key): def has_argument(arg_name, all_args): """whether --arg_name is specified in all_args""" return list(filter(lambda x: x.startswith(f"--{arg_name}"), all_args)) - - -def render_template(env: jinja2.Environment, template_name: str, **kwargs): - """render a Jinja template and ensures that result is a string""" - html = env.get_template(template_name).render(kwargs) - if not isinstance(html, str): - raise Exception("Jinja template did not returned a string") - return html From 0d7050f2ff8a5e1a5b8d17cebe5e784fe1a4cf42 Mon Sep 17 00:00:00 2001 From: Dan Niles <56271899+dan-niles@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:00:04 +0530 Subject: [PATCH 3/3] Remove `locale` folder and files used for translation --- CHANGELOG | 1 + Dockerfile | 1 - .../locale/fr/LC_MESSAGES/messages.po | 35 ------------------- 3 files changed, 1 insertion(+), 36 deletions(-) delete mode 100644 scraper/src/youtube2zim/locale/fr/LC_MESSAGES/messages.po diff --git a/CHANGELOG b/CHANGELOG index 11f68e4f..57f8f54e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update dependencies in pyproject.toml (pydantic, pyhumps, python-slugify) - Update scraper to generate JSON files for `zimui` (#212) - Remove old UI files and methods: template files (home.html, article.html) and `make_html_files` method in scraper.py +- Remove broken locale folder and files used for translation; translation will be restored with #222 ## [2.3.0] - 2024-05-22 diff --git a/Dockerfile b/Dockerfile index 17b1fb36..80a3f4ab 100644 --- a/Dockerfile +++ b/Dockerfile 
@@ -11,7 +11,6 @@ LABEL org.opencontainers.image.source https://github.com/openzim/youtube # Install necessary packages RUN apt-get update \ && apt-get install -y --no-install-recommends \ - locales-all \ wget \ unzip \ ffmpeg \ diff --git a/scraper/src/youtube2zim/locale/fr/LC_MESSAGES/messages.po b/scraper/src/youtube2zim/locale/fr/LC_MESSAGES/messages.po deleted file mode 100644 index 84f1cc09..00000000 --- a/scraper/src/youtube2zim/locale/fr/LC_MESSAGES/messages.po +++ /dev/null @@ -1,35 +0,0 @@ -# French translations for PROJECT. -# Copyright (C) 2019 ORGANIZATION -# This file is distributed under the same license as the PROJECT project. -# FIRST AUTHOR , 2019. -# -msgid "" -msgstr "" -"Project-Id-Version: PROJECT VERSION\n" -"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" -"POT-Creation-Date: 2020-09-25 08:13+0000\n" -"PO-Revision-Date: 2019-09-05 11:50+0000\n" -"Last-Translator: reg \n" -"Language: fr\n" -"Language-Team: \n" -"Plural-Forms: nplurals=2; plural=(n > 1)\n" -"MIME-Version: 1.0\n" -"Content-Type: text/plain; charset=utf-8\n" -"Content-Transfer-Encoding: 8bit\n" -"Generated-By: Babel 2.8.0\n" - -#: youtube2zim/scraper.py:738 -msgid "Youtube Channel “{title}”" -msgstr "Chaîne Youtube « {title} »" - -#: youtube2zim/scraper.py:742 -msgid "Youtube Channels" -msgstr "Chaînes Youtube" - -#: youtube2zim/scraper.py:871 -msgid "Page {current}/{total}" -msgstr "Page {current}/{total}" - -#: youtube2zim/scraper.py:872 -msgid "Back to top" -msgstr "Haut"