Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: differentiate shorts, lives and long videos #371

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]


### Changed

- Differentiate user uploaded shorts, lives & long videos (#367)

### Fixed

- Corrected the short video resolution in the UI (#366)
- Check for empty playlists after filtering, and after downloading videos (#375)
Expand Down
8 changes: 3 additions & 5 deletions scraper/src/youtube2zim/playlists/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ def run(self):
(
playlists,
main_channel_id,
uploads_playlist_id,
user_long_uploads_playlist_id,
user_short_uploads_playlist_id,
user_lives_playlist_id,
is_playlist,
) = extract_playlists_details_from(self.youtube_id)

Expand All @@ -106,10 +108,6 @@ def run(self):
shutil.rmtree(self.build_dir, ignore_errors=True)

for playlist in playlists:
if playlist.playlist_id == uploads_playlist_id:
logger.info(f"Skipping playlist {playlist.playlist_id} (uploads one)")
continue

logger.info(f"Executing youtube2zim for playlist {playlist.playlist_id}")
success, process = self.run_playlist_zim(playlist)
if success:
Expand Down
5 changes: 4 additions & 1 deletion scraper/src/youtube2zim/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,10 @@ class Channel(CamelModel):
profile_path: str | None = None
banner_path: str | None = None
joined_date: str
main_playlist: str | None = None
first_playlist: str | None = None
user_long_uploads_playlist: str | None = None
user_short_uploads_playlist: str | None = None
user_lives_playlist: str | None = None
playlist_count: int


Expand Down
66 changes: 24 additions & 42 deletions scraper/src/youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@

# process-related
self.playlists = []
self.uploads_playlist_id = None
self.user_long_uploads_playlist_id = None
self.user_short_uploads_playlist_id = None
self.user_lives_playlist_id = None
self.videos_ids = []
self.video_ids_count = 0
self.videos_processed = 0
Expand Down Expand Up @@ -229,30 +231,6 @@
def is_single_channel(self):
return len({pl.creator_id for pl in self.playlists}) == 1

@property
def sorted_playlists(self):
"""sorted list of playlists (by title) but with Uploads one at first if any"""
if len(self.playlists) <= 1:
return self.playlists

sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
index = 0
# make sure our Uploads, special playlist is first
if self.uploads_playlist_id:
try:
index = [
index
for index, p in enumerate(sorted_playlists)
if p.playlist_id == self.uploads_playlist_id
][-1]
except Exception:
index = 0
return (
[sorted_playlists[index]]
+ sorted_playlists[0:index]
+ sorted_playlists[index + 1 :]
)

def run(self):
"""execute the scraper step by step"""

Expand All @@ -278,170 +256,170 @@
# check that build_dir is correct
if not self.build_dir.exists() or not self.build_dir.is_dir():
raise OSError(f"Incorrect build_dir: {self.build_dir}")

logger.info(f"starting youtube scraper for {self.youtube_id}")
logger.info(f"preparing build folder at {self.build_dir.resolve()}")
self.prepare_build_folder()

logger.info("testing Youtube credentials")
if not credentials_ok():
raise ValueError(
"Unable to connect to Youtube API v3. check `API_KEY`."
)

if self.s3_url_with_credentials and not self.s3_credentials_ok():
raise ValueError(
"Unable to connect to Optimization Cache. Check its URL."
)

# fail early if supplied branding files are missing
self.check_branding_values()

logger.info("compute playlists list to retrieve")
self.extract_playlists()

logger.info(
".. {} playlists:\n {}".format(
len(self.playlists),
"\n ".join([p.playlist_id for p in self.playlists]),
)
)

logger.info("compute list of videos")
self.extract_videos_list()

self.video_ids_count = len(self.videos_ids)
nb_videos_msg = f".. {self.video_ids_count} videos"
if self.dateafter.start.year != 1:
nb_videos_msg += (
f" in date range: {self.dateafter.start} - {datetime.date.today()}"
)
logger.info(f"{nb_videos_msg}.")

# set a timer to report progress only every 10 seconds
every(10).seconds.do(self.report_progress)

logger.info("update general metadata")
self.update_metadata()

if not self.title:
raise Exception("title is mandatory")
if not self.description:
raise Exception("description is mandatory")
if not self.creator:
raise Exception("creator is mandatory")

# check that illustration is correct
illustration = "favicon.png"
illustration_path = self.build_dir / illustration
if not illustration_path.exists() or not illustration_path.is_file():
raise OSError(
f"Incorrect illustration: {illustration} ({illustration_path})"
)
with open(illustration_path, "rb") as fh:
illustration_data = fh.read()

logger.info("building ZIM file")
self.zim_file = Creator(
filename=self.output_dir / self.fname,
main_path="index.html",
ignore_duplicates=True,
disable_metadata_checks=self.disable_metadata_checks,
)
self.zim_file.config_metadata(
Name=self.name,
Language=self.language,
Title=self.title,
Description=self.description,
LongDescription=self.long_description,
Creator=self.creator,
Publisher=self.publisher,
Tags=";".join(self.tags) if self.tags else "",
Scraper=SCRAPER,
Date=datetime.date.today(),
Illustration_48x48_at_1=illustration_data,
)
self.zim_file.start()

logger.debug(f"Preparing zimfile at {self.zim_file.filename}")

logger.info("add main channel branding to ZIM")
self.add_main_channel_branding_to_zim()

logger.debug(f"add zimui files from {self.zimui_dist}")
self.add_zimui()

# download videos (and recompress)
logger.info(
"downloading all videos, subtitles and thumbnails "
f"(concurrency={self.max_concurrency})"
)
logger.info(f" format: {self.video_format}")
logger.info(f" quality: {self.video_quality}")
logger.info(f" generated-subtitles: {self.all_subtitles}")
if self.s3_storage:
logger.info(
f" using cache: {self.s3_storage.url.netloc} "
f"with bucket: {self.s3_storage.bucket_name}"
)
succeeded, failed = self.download_video_files(
max_concurrency=self.max_concurrency
)
if failed:
logger.error(f"{len(failed)} video(s) failed to download: {failed}")
if len(failed) >= len(succeeded):
logger.critical("More than half of videos failed. exiting")
raise OSError("Too much videos failed to download")

logger.info("retrieve channel-info for all videos (author details)")
get_videos_authors_info(succeeded)

logger.info("download all author's profile pictures")
self.download_authors_branding()

logger.info("creating JSON files")
self.make_json_files(succeeded)
except KeyboardInterrupt:
logger.error("KeyboardInterrupt, exiting.")
return 1
except Exception as exc:
logger.error(f"Interrupting process due to error: {exc}")
logger.exception(exc)
return 1
else:
logger.info("Finishing ZIM file…")
self.zim_file.finish()
finally:
self.report_progress()
logger.info("removing temp folder")
shutil.rmtree(self.build_dir, ignore_errors=True)

logger.info("all done!")

def add_zimui(self):
logger.info(f"Adding files in {self.zimui_dist}")
for file in self.zimui_dist.rglob("*"):
if file.is_dir():
continue
path = str(Path(file).relative_to(self.zimui_dist))
logger.debug(f"Adding {path} to ZIM")
if path == "index.html": # Change index.html title and add to ZIM
index_html_path = self.zimui_dist / path
html_content = index_html_path.read_text(encoding="utf-8")
new_html_content = re.sub(
r"(<title>)(.*?)(</title>)",
rf"\1{self.title}\3",
html_content,
flags=re.IGNORECASE,
)
self.zim_file.add_item_for(
path=path,
content=new_html_content,
mimetype="text/html",
is_front=True,
)
else:
self.zim_file.add_item_for(

Check notice on line 422 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

codefactor.io / CodeFactor

scraper/src/youtube2zim/scraper.py#L259-L422

Complex Method
path,
fpath=file,
is_front=False,
Expand Down Expand Up @@ -552,7 +530,9 @@
(
self.playlists,
self.main_channel_id,
self.uploads_playlist_id,
self.user_long_uploads_playlist_id,
self.user_short_uploads_playlist_id,
self.user_lives_playlist_id,
self.is_playlist,
) = extract_playlists_details_from(self.youtube_id)

Expand Down Expand Up @@ -960,76 +940,76 @@
if self.is_playlist and len(self.playlists) == 1
else clean_text(main_channel_json["snippet"]["description"])
) or "-"
self.title = self.title or auto_title or "-"
self.description, self.long_description = compute_descriptions(
default_description=auto_description,
user_description=self.description,
user_long_description=self.long_description,
)

if self.creator is None:
if self.is_single_channel:
self.creator = _("Youtube Channel “{title}”").format(
title=main_channel_json["snippet"]["title"]
)
else:
self.creator = _("Youtube Channels")

self.tags = self.tags or ["youtube"]
if "_videos:yes" not in self.tags:
self.tags.append("_videos:yes")

# copy our main_channel branding into /(profile|banner).jpg if not supplied
if not self.profile_path.exists():
shutil.copy(
self.channels_dir.joinpath(self.main_channel_id, "profile.jpg"),
self.profile_path,
)

# set colors from images if not supplied
if self.main_color is None or self.secondary_color is None:
profile_main, profile_secondary = get_colors(self.profile_path)
self.main_color = self.main_color or profile_main
self.secondary_color = self.secondary_color or profile_secondary

# convert profile image to png for favicon
png_profile_path = self.build_dir.joinpath("profile.png")
convert_image(self.profile_path, png_profile_path)

resize_image(
png_profile_path,
width=48,
height=48,
method="thumbnail",
dst=self.build_dir.joinpath("favicon.png"),
)
png_profile_path.unlink()

def make_json_files(self, actual_videos_ids):
"""Generate JSON files to be consumed by the frontend"""

def remove_unused_videos():
for path in self.videos_dir.iterdir():
if path.is_dir() and path.name not in actual_videos_ids:
logger.debug(f"Removing unused video {path.name}")
shutil.rmtree(path, ignore_errors=True)

def is_present(video):
            """whether this video has actually been successfully downloaded"""
return video["contentDetails"]["videoId"] in actual_videos_ids

def video_has_channel(videos_channels, video):
return video["contentDetails"]["videoId"] in videos_channels

def get_thumbnail_path(video_id):
return f"videos/{video_id}/video.webp"

def get_subtitles(video_id) -> list[Subtitle]:
subtitles_list = load_json(self.subtitles_cache_dir, video_id)
if subtitles_list is None:
return []
return subtitles_list["subtitles"]

Check notice on line 1012 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

codefactor.io / CodeFactor

scraper/src/youtube2zim/scraper.py#L943-L1012

Complex Method
def get_videos_list(playlist):
videos = load_mandatory_json(
self.cache_dir, f"playlist_{playlist.playlist_id}_videos"
Expand All @@ -1045,6 +1025,7 @@
author = videos_channels[video_id]
subtitles_list = get_subtitles(video_id)
channel_data = get_channel_json(author["channelId"])

return Video(
id=video_id,
title=video["snippet"]["title"],
Expand Down Expand Up @@ -1151,10 +1132,13 @@
)

# write playlists JSON files
playlist_list = []
home_playlist_list = []
playlist_list: list[PlaylistPreview] = []
home_playlist_list: list[Playlist] = []

user_long_uploads_playlist_slug = None
user_short_uploads_playlist_slug = None
user_lives_playlist_slug = None

main_playlist_slug = None
empty_playlists = list(
filter(lambda playlist: len(get_videos_list(playlist)) == 0, self.playlists)
)
Expand All @@ -1167,10 +1151,6 @@
if len(self.playlists) == 0:
raise Exception("No playlist succeeded to download")

main_playlist_slug = get_playlist_slug(
self.playlists[0]
) # set first playlist as main playlist

for playlist in self.playlists:
playlist_slug = get_playlist_slug(playlist)
playlist_path = f"playlists/{playlist_slug}.json"
Expand All @@ -1195,16 +1175,15 @@
# modify playlist object for preview on homepage
playlist_obj.videos = playlist_obj.videos[:12]

if playlist.playlist_id == self.uploads_playlist_id:
main_playlist_slug = (
playlist_slug # set uploads playlist as main playlist
)
# insert uploads playlist at the beginning of the list
playlist_list.insert(0, generate_playlist_preview_object(playlist))
home_playlist_list.insert(0, playlist_obj)
home_playlist_list.append(playlist_obj)
if playlist.playlist_id == self.user_long_uploads_playlist_id:
user_long_uploads_playlist_slug = playlist_slug
elif playlist.playlist_id == self.user_short_uploads_playlist_id:
user_short_uploads_playlist_slug = playlist_slug
elif playlist.playlist_id == self.user_lives_playlist_id:
user_lives_playlist_slug = playlist_slug
else:
playlist_list.append(generate_playlist_preview_object(playlist))
home_playlist_list.append(playlist_obj)

# write playlists.json file
self.zim_file.add_item_for(
Expand Down Expand Up @@ -1241,7 +1220,10 @@
channel_description=channel_data["snippet"]["description"],
profile_path="profile.jpg",
banner_path="banner.jpg",
main_playlist=main_playlist_slug,
first_playlist=playlist_list[0].id,
user_long_uploads_playlist=user_long_uploads_playlist_slug,
user_short_uploads_playlist=user_short_uploads_playlist_slug,
user_lives_playlist=user_lives_playlist_slug,
playlist_count=len(self.playlists),
joined_date=channel_data["snippet"]["publishedAt"],
).model_dump_json(by_alias=True, indent=2),
Expand Down
57 changes: 46 additions & 11 deletions scraper/src/youtube2zim/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ def __init__(
@classmethod
def from_id(cls, playlist_id):
playlist_json = get_playlist_json(playlist_id)
if playlist_json is None:
raise PlaylistNotFoundError(
f"Invalid playlistId `{playlist_id}`: Not Found"
)
return Playlist(
playlist_id=playlist_id,
title=playlist_json["snippet"]["title"],
Expand Down Expand Up @@ -176,10 +180,13 @@ def get_playlist_json(playlist_id):
req.raise_for_status()
try:
playlist_json = req.json()["items"][0]
total_results = req.json().get("pageInfo", {}).get("totalResults", 0)
if total_results == 0:
logger.error(f"Playlist `{playlist_id}`: No Item Available")
return None
except IndexError:
raise PlaylistNotFoundError(
f"Invalid playlistId `{playlist_id}`: Not Found"
) from None
logger.error(f"Invalid playlistId `{playlist_id}`: Not Found")
return None
save_json(YOUTUBE.cache_dir, fname, playlist_json)
return playlist_json

Expand Down Expand Up @@ -336,8 +343,9 @@ def skip_outofrange_videos(date_range, item):
def extract_playlists_details_from(youtube_id: str):
"""prepare a list of Playlist from user request"""

uploads_playlist_id = None
main_channel_id = None
main_channel_id = user_long_uploads_playlist_id = user_short_uploads_playlist_id = (
user_lives_playlist_id
) = None
if "," not in youtube_id:
try:
# first try to consider passed ID is a channel ID (or username or handle)
Expand All @@ -347,11 +355,36 @@ def extract_playlists_details_from(youtube_id: str):
playlist_ids = [
p["id"] for p in get_channel_playlists_json(main_channel_id)
]
# we always include uploads playlist (contains everything)
playlist_ids += [
channel_json["contentDetails"]["relatedPlaylists"]["uploads"]
]
uploads_playlist_id = playlist_ids[-1]

# Get special playlists JSON objects
user_long_uploads_json = get_playlist_json("UULF" + main_channel_id[2:])
user_short_uploads_json = get_playlist_json("UUSH" + main_channel_id[2:])
user_lives_json = get_playlist_json("UULV" + main_channel_id[2:])

# Extract special playlists IDs if the JSON objects are not None
user_long_uploads_playlist_id = (
user_long_uploads_json["id"] if user_long_uploads_json else None
)
user_short_uploads_playlist_id = (
user_short_uploads_json["id"] if user_short_uploads_json else None
)
user_lives_playlist_id = user_lives_json["id"] if user_lives_json else None

            # Add special playlists if they exist, in the proper order
playlist_ids = (
list(
filter(
None,
[
user_long_uploads_playlist_id,
user_short_uploads_playlist_id,
user_lives_playlist_id,
],
)
)
+ playlist_ids
)

is_playlist = False
except ChannelNotFoundError:
# channel not found, then ID should be a playlist
Expand All @@ -370,6 +403,8 @@ def extract_playlists_details_from(youtube_id: str):
# dict.fromkeys maintains the order of playlist_ids while removing duplicates
[Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)],
main_channel_id,
uploads_playlist_id,
user_long_uploads_playlist_id,
user_short_uploads_playlist_id,
user_lives_playlist_id,
is_playlist,
)
2 changes: 1 addition & 1 deletion scraper/tests-integration/integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def test_zim_channel_json():

assert channel_json["id"] == "UC8elThf5TGMpQfQc_VE917Q"
assert channel_json["channelName"] == "openZIM_testing"
assert channel_json["mainPlaylist"] == "uploads_from_openzim_testing-917Q"
assert channel_json["firstPlaylist"] == "uploads_from_openzim_testing-917Q"


def test_zim_videos():
Expand Down
2 changes: 1 addition & 1 deletion zimui/cypress/fixtures/channel/channel.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@
"profilePath": "profile.jpg",
"bannerPath": "banner.jpg",
"joinedDate": "2024-06-04T13:30:16.232286Z",
"mainPlaylist": "uploads_from_openzim_testing-917Q"
"firstPlaylist": "uploads_from_openzim_testing-917Q"
}
1 change: 1 addition & 0 deletions zimui/src/assets/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
html {
overflow: auto !important;
font-family: 'Roboto', sans-serif;
overflow-y: scroll !important;
}

body {
Expand Down
35 changes: 35 additions & 0 deletions zimui/src/assets/vjs-youtube.css
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,38 @@
.vjs-youtube .vjs-tech canvas {
border-radius: 8px;
}


.video-js.vjs-fluid,
.video-js.vjs-16-9,
.video-js.vjs-4-3,
video.video-js,
video.vjs-tech {
max-height: calc(100vh - 64px);
position: relative !important;
width: 100%;
height: auto !important;
max-width: 100% !important;
padding-top: 0 !important;
line-height: 0;
}
.vjs-control-bar {
line-height: 1;
}

/* Fullscreen styles */
.video-js.vjs-fullscreen {
display: flex;
align-items: center;
justify-content: center;
background-color: black;
text-align: center;
}

.video-js.vjs-fullscreen video {
margin: auto;
width: auto !important;
height: 100% !important;
max-height: 100vh;
object-fit: contain;
}
35 changes: 23 additions & 12 deletions zimui/src/components/channel/ChannelHeader.vue
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,34 @@ onMounted(async () => {
}
})

const tabs = [
{
id: 0,
title: 'Videos',
to: { name: 'videos' }
},
{
id: 1,
title: 'Playlists',
to: { name: 'playlists' }
// Computed tabs array based on store data
const tabs = computed(() => {
const baseTabs = [
{ id: 0, title: 'Home', to: { name: 'home' } }
];

if (main.channel?.userLongUploadsPlaylist) {
baseTabs.push({ id: 1, title: 'Videos', to: { name: 'videos' } });
}

if (main.channel?.userShortUploadsPlaylist) {
baseTabs.push({ id: 2, title: 'Shorts', to: { name: 'shorts' } });
}

if (main.channel?.userLivesPlaylist) {
baseTabs.push({ id: 3, title: 'Lives', to: { name: 'lives' } });
}
]

baseTabs.push({ id: 4, title: 'Playlists', to: { name: 'playlists' } });

return baseTabs;
});


// Hide tabs if there is only one playlist
const hideTabs = computed(() => main.channel?.playlistCount === 1)

const tab = ref<number>(tabs[0].id)
const tab = ref<number>(tabs.value[0]?.id || 0);
</script>

<template>
Expand Down
Loading
Loading