{{ author.channelTitle }}
-{{ title }}
- -{{ description }}
-From 4b727454df966dd2eeea0534daaa328697f70c4f Mon Sep 17 00:00:00 2001 From: Dan Niles <56271899+dan-niles@users.noreply.github.com> Date: Tue, 4 Jun 2024 09:09:16 +0530 Subject: [PATCH 1/3] Update scraper to generate JSON files for `zimui` --- CHANGELOG | 2 + scraper/pyproject.toml | 4 +- scraper/src/youtube2zim/schemas.py | 94 +++++++++ scraper/src/youtube2zim/scraper.py | 302 +++++++++++++++++++++++++---- scraper/src/youtube2zim/youtube.py | 15 +- zimui/index.html | 4 +- zimui/public/favicon.ico | Bin 4286 -> 0 bytes 7 files changed, 383 insertions(+), 38 deletions(-) create mode 100644 scraper/src/youtube2zim/schemas.py delete mode 100644 zimui/public/favicon.ico diff --git a/CHANGELOG b/CHANGELOG index 9f9e905a..4cd1b28e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Move scraper files to `scraper` subfolder and update workflows - Bump `requests` package from 2.32.0 to 2.32.2 - Initialize new Vue.js project in `zimui` subfolder +- Update dependencies in pyproject.toml (pydantic, pyhumps, python-slugify) +- Update scraper to generate JSON files for `zimui` (#212) ## [2.3.0] - 2024-05-22 diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index 95714248..8d282add 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -8,7 +8,7 @@ requires-python = ">=3.12,<3.13" description = "Make ZIM file from a Youtube channel, user or playlist(s)" readme = "../README.md" dependencies = [ - "python-slugify==3.0.3", + "python-slugify==8.0.4", "yt-dlp", # youtube-dl should be updated as frequently as possible "python-dateutil==2.9.0.post0", "jinja2==3.1.4", @@ -16,6 +16,8 @@ dependencies = [ "requests==2.32.2", "kiwixstorage==0.8.3", "pif==0.8.2", + "pydantic==2.7.2", + "pyhumps==3.8.0", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/scraper/src/youtube2zim/schemas.py b/scraper/src/youtube2zim/schemas.py new file mode 100644 index 00000000..b50547da --- /dev/null +++ b/scraper/src/youtube2zim/schemas.py @@ -0,0 +1,94 @@ +from humps import camelize +from pydantic import BaseModel + + +class CamelModel(BaseModel): + """Model to transform Python snake_case into JSON camelCase.""" + + class Config: + alias_generator = camelize + populate_by_name = True + + +class Author(CamelModel): + channel_id: str + channel_title: str + profile_path: str | None = None + banner_path: str | None = None + + +class Subtitle(CamelModel): + """Class to serialize data about a YouTube video subtitle.""" + + code: str + name: str + + +class Video(CamelModel): + """Class to serialize data about a YouTube video.""" + + id: str + title: str + description: str + author: Author + publication_date: str + video_path: str + thumbnail_path: str | None = None + subtitle_path: str | None = None + subtitle_list: list[Subtitle] + duration: str + + +class VideoPreview(CamelModel): + """Class to serialize data about a YouTube video for preview.""" + + slug: str + id: str + title: str + thumbnail_path: str | None = None + duration: str + + +class Playlist(CamelModel): + """Class to serialize data about a YouTube playlist.""" + + id: str + author: Author + title: str + description: str + publication_date: str + thumbnail_path: str | None = None + videos: list[VideoPreview] + videos_count: int + + +class PlaylistPreview(CamelModel): + """Class to serialize data about a YouTube playlist for preview.""" + + slug: str + id: str + title: str + thumbnail_path: str | None = None + videos_count: int + main_video_slug: str + + +class Playlists(CamelModel): + """Class to serialize data about a list of YouTube playlists.""" + + playlists: list[PlaylistPreview] + + +class Channel(CamelModel): + """Class to serialize data about a YouTube channel.""" + + id: str + title: str + description: str + channel_name: str + channel_description: str + profile_path: str | None = None + banner_path: str | None = None + joined_date: str + collection_type: str + main_playlist: str | None = None diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py index ba1e3329..3d844512 100644 --- a/scraper/src/youtube2zim/scraper.py +++ b/scraper/src/youtube2zim/scraper.py @@ -36,6 +36,7 @@ from zimscraperlib.inputs import compute_descriptions from zimscraperlib.video.presets import VideoMp4Low, VideoWebmLow from zimscraperlib.zim import Creator +from zimscraperlib.zim.filesystem import FileItem from zimscraperlib.zim.metadata import ( validate_description, validate_longdescription, @@ -54,6 +55,16 @@ logger, ) from youtube2zim.processing import post_process_video, process_thumbnail +from youtube2zim.schemas import ( + Author, + Channel, + Playlist, + PlaylistPreview, + Playlists, + Subtitle, + Video, + VideoPreview, +) from youtube2zim.utils import ( clean_text, get_slug, @@ -66,7 +77,7 @@ credentials_ok, extract_playlists_details_from, get_channel_json, - # get_videos_authors_info, + get_videos_authors_info, get_videos_json, save_channel_branding, skip_deleted_videos, @@ -317,43 +328,37 @@ def run(self): ) logger.info(f"{nb_videos_msg}.") - # Commented out for now, - # but we have to rework this part to work with the new vuejs zimui - # download videos (and recompress) - # logger.info( - # "downloading all videos, subtitles and thumbnails " - # f"(concurrency={self.max_concurrency})" - # ) - # logger.info(f" format: {self.video_format}") - # logger.info(f" quality: {self.video_quality}") - # logger.info(f" generated-subtitles: {self.all_subtitles}") - # if self.s3_storage: - # logger.info( - # f" using cache: {self.s3_storage.url.netloc} " - # f"with bucket: {self.s3_storage.bucket_name}" - # ) - # succeeded, failed = self.download_video_files( - # max_concurrency=self.max_concurrency - # ) - # if failed: - # logger.error(f"{len(failed)} video(s) failed to download: {failed}") - # if len(failed) >= len(succeeded): - # logger.critical("More than half of videos failed. exiting") - # raise OSError("Too much videos failed to download") - - # logger.info("retrieve channel-info for all videos (author details)") - # get_videos_authors_info(succeeded) - - # logger.info("download all author's profile pictures") - # self.download_authors_branding() + logger.info( + "downloading all videos, subtitles and thumbnails " + f"(concurrency={self.max_concurrency})" + ) + logger.info(f" format: {self.video_format}") + logger.info(f" quality: {self.video_quality}") + logger.info(f" generated-subtitles: {self.all_subtitles}") + if self.s3_storage: + logger.info( + f" using cache: {self.s3_storage.url.netloc} " + f"with bucket: {self.s3_storage.bucket_name}" + ) + succeeded, failed = self.download_video_files( + max_concurrency=self.max_concurrency + ) + if failed: + logger.error(f"{len(failed)} video(s) failed to download: {failed}") + if len(failed) >= len(succeeded): + logger.critical("More than half of videos failed. exiting") + raise OSError("Too much videos failed to download") + + logger.info("retrieve channel-info for all videos (author details)") + get_videos_authors_info(succeeded) + + logger.info("download all author's profile pictures") + self.download_authors_branding() logger.info("update general metadata") self.update_metadata() - # logger.info("creating HTML files") - # self.make_html_files(succeeded) - # make zim file os.makedirs(self.output_dir, exist_ok=True) if not self.no_zim: @@ -412,6 +417,12 @@ def run(self): logger.debug(f"Preparing zimfile at {self.zim_file.filename}") logger.debug(f"Recursively adding files from {self.build_dir}") self.add_zimui() + + logger.info("creating JSON files") + self.make_json_files(succeeded) + + logger.info("Adding files to ZIM") + self.add_files_to_zim(self.build_dir, self.zim_file) except KeyboardInterrupt: self.zim_file.can_finish = False logger.error("KeyboardInterrupt, exiting.") @@ -923,6 +934,7 @@ def update_metadata(self): method="thumbnail", dst=self.build_dir.joinpath("favicon.png"), ) + png_profile_path.unlink() def make_html_files(self, actual_videos_ids): """make up HTML structure to read the content @@ -1108,3 +1120,227 @@ def to_data_js(video): # clean videos left out in videos directory remove_unused_videos(videos) + + def make_json_files(self, actual_videos_ids): + """Generate JSON files to be consumed by the frontend""" + + def remove_unused_videos(videos): + video_ids = [video["contentDetails"]["videoId"] for video in videos] + for path in self.videos_dir.iterdir(): + if path.is_dir() and path.name not in video_ids: + logger.debug(f"Removing unused video {path.name}") + shutil.rmtree(path, ignore_errors=True) + + def is_present(video): + """whether this video has actually been succeffuly downloaded""" + return video["contentDetails"]["videoId"] in actual_videos_ids + + def video_has_channel(videos_channels, video): + return video["contentDetails"]["videoId"] in videos_channels + + def get_thumbnail_path(video_id): + return f"videos/{video_id}/video.webp" + + def get_subtitles(video_id) -> list[Subtitle]: + video_dir = self.videos_dir.joinpath(video_id) + languages = [ + x.stem.split(".")[1] + for x in video_dir.iterdir() + if x.is_file() and x.name.endswith(".vtt") + ] + + def to_subtitle_object(lang): + try: + try: + subtitle = get_language_details( + YOUTUBE_LANG_MAP.get(lang, lang) + ) + except NotFound: + lang_simpl = re.sub(r"^([a-z]{2})-.+$", r"\1", lang) + subtitle = get_language_details( + YOUTUBE_LANG_MAP.get(lang_simpl, lang_simpl) + ) + except Exception: + logger.error(f"Failed to get language details for {lang}") + raise + return Subtitle( + code=lang, + name=f"{subtitle['english'].title()} - {subtitle['query']}", + ) + + # Youtube.com sorts subtitles by English name + return sorted(map(to_subtitle_object, languages), key=lambda x: x.name) + + def get_videos_list(playlist): + videos = load_mandatory_json( + self.cache_dir, f"playlist_{playlist.playlist_id}_videos" + ) + videos = list(filter(skip_deleted_videos, videos)) + videos = list(filter(is_present, videos)) + videos = list(filter(has_channel, videos)) + videos = sorted(videos, key=lambda v: v["snippet"]["position"]) + return videos + + def generate_video_object(video) -> Video: + video_id = video["contentDetails"]["videoId"] + author = videos_channels[video_id] + subtitles_list = get_subtitles(video_id) + return Video( + id=video_id, + title=video["snippet"]["title"], + description=video["snippet"]["description"], + author=Author( + channel_id=author["channelId"], + channel_title=author["channelTitle"], + profile_path=f"channels/{author['channelId']}/profile.jpg", + banner_path=f"channels/{author['channelId']}/banner.jpg", + ), + publication_date=video["contentDetails"]["videoPublishedAt"], + video_path=f"videos/{video_id}/video.{self.video_format}", + thumbnail_path=get_thumbnail_path(video_id), + subtitle_path=f"videos/{video_id}" if len(subtitles_list) > 0 else None, + subtitle_list=subtitles_list, + duration=videos_channels[video_id]["duration"], + ) + + def generate_video_preview_object(video) -> VideoPreview: + video_id = video["contentDetails"]["videoId"] + return VideoPreview( + slug=get_video_slug(video), + id=video_id, + title=video["snippet"]["title"], + thumbnail_path=get_thumbnail_path(video_id), + duration=videos_channels[video_id]["duration"], + ) + + def get_video_slug(video) -> str: + title = video["snippet"]["title"] + video_id = video["contentDetails"]["videoId"] + return f"{get_slug(title)}-{video_id[:4]}" + + def generate_playlist_object(playlist) -> Playlist: + videos = get_videos_list(playlist) + return Playlist( + id=playlist.playlist_id, + title=playlist.title, + description=playlist.description, + videos=[generate_video_preview_object(video) for video in videos], + publication_date=playlist.published_at, + author=Author( + channel_id=playlist.creator_id, + channel_title=playlist.creator_name, + profile_path=f"channels/{playlist.creator_id}/profile.jpg", + banner_path=f"channels/{playlist.creator_id}/banner.jpg", + ), + videos_count=len(videos), + thumbnail_path=get_thumbnail_path( + videos[0]["contentDetails"]["videoId"] + ), + ) + + def generate_playlist_preview_object(playlist) -> PlaylistPreview: + videos = get_videos_list(playlist) + return PlaylistPreview( + slug=get_playlist_slug(playlist), + id=playlist.playlist_id, + title=playlist.title, + thumbnail_path=get_thumbnail_path( + videos[0]["contentDetails"]["videoId"] + ), + videos_count=len(videos), + main_video_slug=get_video_slug(videos[0]), + ) + + def get_playlist_slug(playlist) -> str: + return f"{get_slug(playlist.title)}-{playlist.playlist_id[-4:]}" + + videos = load_mandatory_json(self.cache_dir, "videos").values() + # filter videos so we only include the ones we could retrieve + videos = list(filter(is_present, videos)) + videos_channels = load_mandatory_json(self.cache_dir, "videos_channels") + has_channel = functools.partial(video_has_channel, videos_channels) + # filter videos to exclude those for which we have no channel (#76) + videos = list(filter(has_channel, videos)) + for video in videos: + slug = get_video_slug(video) + self.zim_file.add_item_for( + path=f"videos/{slug}.json", + title=slug, + content=generate_video_object(video).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + + # write playlists JSON files + playlist_list = [] + + main_playlist_slug = None + if len(self.playlists) > 0: + main_playlist_slug = get_playlist_slug( + self.playlists[0] + ) # set first playlist as main playlist + + for playlist in self.playlists: + playlist_slug = get_playlist_slug(playlist) + playlist_path = f"playlists/{playlist_slug}.json" + + if playlist.playlist_id != self.uploads_playlist_id: + playlist_list.append(generate_playlist_preview_object(playlist)) + else: + main_playlist_slug = ( + playlist_slug # set uploads playlist as main playlist + ) + + self.zim_file.add_item_for( + path=playlist_path, + title=playlist.title, + content=generate_playlist_object(playlist).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + + # write playlists.json file + self.zim_file.add_item_for( + path="playlists.json", + title="Playlists", + content=Playlists(playlists=playlist_list).model_dump_json( + by_alias=True, indent=2 + ), + mimetype="application/json", + is_front=False, + ) + + # write channel.json file + channel_data = get_channel_json(self.main_channel_id) + self.zim_file.add_item_for( + path="channel.json", + title=self.title, + content=Channel( + id=str(self.main_channel_id), + title=str(self.title), + description=str(self.description), + channel_name=channel_data["snippet"]["title"], + channel_description=channel_data["snippet"]["description"], + profile_path="profile.jpg", + banner_path="banner.jpg", + collection_type=self.collection_type, + main_playlist=main_playlist_slug, + joined_date=channel_data["snippet"]["publishedAt"], + ).model_dump_json(by_alias=True, indent=2), + mimetype="application/json", + is_front=False, + ) + + # clean videos left out in videos directory + remove_unused_videos(videos) + + def add_files_to_zim(self, dir_path: Path, zim_file: Creator): + """recursively add a path to a zim file""" + for file_path in filter( + lambda file_path: file_path.is_file(), dir_path.rglob("*") + ): + zim_file.add_item(FileItem(dir_path, file_path)) diff --git a/scraper/src/youtube2zim/youtube.py b/scraper/src/youtube2zim/youtube.py index 2725e14c..f9cd6909 100644 --- a/scraper/src/youtube2zim/youtube.py +++ b/scraper/src/youtube2zim/youtube.py @@ -24,12 +24,21 @@ class Playlist: - def __init__(self, playlist_id, title, description, creator_id, creator_name): + def __init__( + self, + playlist_id, + title, + description, + creator_id, + creator_name, + published_at=None, + ): self.playlist_id = playlist_id self.title = title self.description = description self.creator_id = creator_id self.creator_name = creator_name + self.published_at = published_at self.slug = get_slug(title, js_safe=True) @classmethod @@ -41,6 +50,7 @@ def from_id(cls, playlist_id): description=playlist_json["snippet"]["description"], creator_id=playlist_json["snippet"]["channelId"], creator_name=playlist_json["snippet"]["channelTitle"], + published_at=playlist_json["snippet"]["publishedAt"], ) def to_dict(self): @@ -221,7 +231,7 @@ def retrieve_videos_for(videos_ids): VIDEOS_API, params={ "id": ",".join(videos_ids), - "part": "snippet", + "part": "snippet,contentDetails", "key": YOUTUBE.api_key, "maxResults": RESULTS_PER_PAGE, "pageToken": page_token, @@ -238,6 +248,7 @@ def retrieve_videos_for(videos_ids): item["id"]: { "channelId": item["snippet"]["channelId"], "channelTitle": item["snippet"]["channelTitle"], + "duration": item["contentDetails"]["duration"], } } ) diff --git a/zimui/index.html b/zimui/index.html index cd531825..22929b5e 100644 --- a/zimui/index.html +++ b/zimui/index.html @@ -2,13 +2,13 @@
- +