Skip to content

Commit

Permalink
Filter-out non-public videos and properly cleanup unsuccessful videos
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Oct 14, 2024
1 parent f9d4b1e commit b707b90
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Filter out non-public videos and properly clean up unsuccessful videos (#362)

## [3.2.0] - 2024-10-11

### Deprecated
Expand Down
9 changes: 5 additions & 4 deletions scraper/src/youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
get_videos_json,
save_channel_branding,
skip_deleted_videos,
skip_non_public_videos,
skip_outofrange_videos,
)

Expand Down Expand Up @@ -611,6 +612,7 @@ def extract_videos_list(self):
)
filter_videos = filter(skip_outofrange, videos_json)
filter_videos = filter(skip_deleted_videos, filter_videos)
filter_videos = filter(skip_non_public_videos, filter_videos)

Check warning on line 615 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L615

Added line #L615 was not covered by tests
all_videos.update(
{v["contentDetails"]["videoId"]: v for v in filter_videos}
)
Expand Down Expand Up @@ -1034,10 +1036,9 @@ def update_metadata(self):
def make_json_files(self, actual_videos_ids):
"""Generate JSON files to be consumed by the frontend"""

def remove_unused_videos(videos):
video_ids = [video["contentDetails"]["videoId"] for video in videos]
def remove_unused_videos():

Check warning on line 1039 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L1039

Added line #L1039 was not covered by tests
for path in self.videos_dir.iterdir():
if path.is_dir() and path.name not in video_ids:
if path.is_dir() and path.name not in actual_videos_ids:
logger.debug(f"Removing unused video {path.name}")
shutil.rmtree(path, ignore_errors=True)

Expand Down Expand Up @@ -1278,7 +1279,7 @@ def get_playlist_slug(playlist) -> str:
)

# clean videos left out in videos directory
remove_unused_videos(videos)
remove_unused_videos()

Check warning on line 1282 in scraper/src/youtube2zim/scraper.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/scraper.py#L1282

Added line #L1282 was not covered by tests

def add_file_to_zim(
self,
Expand Down
7 changes: 6 additions & 1 deletion scraper/src/youtube2zim/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def get_videos_json(playlist_id):
PLAYLIST_ITEMS_API,
params={
"playlistId": playlist_id,
"part": "snippet,contentDetails",
"part": "snippet,contentDetails,status",
"key": YOUTUBE.api_key,
"maxResults": RESULTS_PER_PAGE,
"pageToken": page_token,
Expand Down Expand Up @@ -309,6 +309,11 @@ def skip_deleted_videos(item):
)


def skip_non_public_videos(item):
    """Filter func to filter-out non-public videos.

    Intended as a predicate for `filter()` over playlist items.

    Args:
        item: playlist item mapping; must expose `item["status"]["privacyStatus"]`
            (the `status` part is requested alongside `snippet` and
            `contentDetails` when playlist items are fetched).

    Returns:
        True when the privacy status is exactly "public", False for any
        other value.

    Raises:
        KeyError: if `item` has no `status` or `privacyStatus` entry.
    """
    # NOTE(review): Codecov annotations on commit b707b90 flag the `def` and
    # `return` lines of this function (youtube.py L312 and L314) as not
    # covered by tests.
    privacy_status = item["status"]["privacyStatus"]
    return privacy_status == "public"

Check warning on line 314 in scraper/src/youtube2zim/youtube.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/youtube2zim/youtube.py#L314

Added line #L314 was not covered by tests


def skip_outofrange_videos(date_range, item):
    """Filter func to filter-out videos published outside `date_range`.

    Args:
        date_range: container tested with `in` against the publication date
            (presumably a collection or range of `datetime.date` values;
            verify against the caller).
        item: playlist item mapping exposing `item["snippet"]["publishedAt"]`
            as a date/time string parseable by `dt_parser.parse`.

    Returns:
        True when the date part of the parsed `publishedAt` value is
        contained in `date_range`, False otherwise.
    """
    # Only the calendar date is compared; the time of day is dropped.
    published_on = dt_parser.parse(item["snippet"]["publishedAt"]).date()
    return published_on in date_range
Expand Down

0 comments on commit b707b90

Please sign in to comment.