Skip to content

Commit

Permalink
Merge pull request #17 from ivansaul/refactor
Browse files Browse the repository at this point in the history
feat: add caching mechanism for API responses
  • Loading branch information
ivansaul authored Nov 18, 2024
2 parents adeb301 + df8ef74 commit ec810cd
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 37 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,8 @@ jobs:
- name: Lint with Ruff
run: poetry run ruff check --output-format=github .

- name: Type Check with MyPy
run: poetry run mypy --pretty --show-error-codes .

- name: Run Tests with Pytest
run: poetry run pytest
66 changes: 65 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ aiohttp = "^3.11.2"
tqdm = "^4.67.0"
pytest-playwright = "^0.5.2"
unidecode = "^1.3.8"
platformdirs = "^4.3.6"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.3"
ruff = "^0.7.4"
ipykernel = "^6.29.5"
python-semantic-release = "^9.14.0"
mypy = "^1.13.0"

[build-system]
requires = ["poetry-core"]
Expand Down
46 changes: 26 additions & 20 deletions src/platzi/async_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

from playwright.async_api import BrowserContext, Page, async_playwright

from .collectors import get_chapters_urls, get_course_title, get_unit
from .collectors import get_course_title, get_draft_chapters, get_unit
from .constants import HEADERS, LOGIN_DETAILS_URL, LOGIN_URL, SESSION_FILE
from .helpers import read_json, write_json
from .helpers import hash_id, read_json, write_json
from .logger import Logger
from .m3u8 import m3u8_dl
from .models import TypeUnit, User
from .utils import download, progressive_scroll, slugify
from .models import TypeUnit, Unit, User
from .utils import Cache, download, progressive_scroll, slugify


def login_required(func):
Expand Down Expand Up @@ -143,36 +143,42 @@ async def download(self, url: str, **kwargs):
)

# iterate over chapters
chapters_urls = await get_chapters_urls(page)
for idx, (title, urls) in enumerate(chapters_urls, 1):
print(f"{title}")
draft_chapters = await get_draft_chapters(page)
for idx, draft_chapter in enumerate(draft_chapters, 1):
Logger.info(f"Downloading {draft_chapter.name}")

CHAP_DIR = DL_DIR / f"{idx:02}_{slugify(title)}"
CHAP_DIR = DL_DIR / f"{idx:02}_{draft_chapter.slug}"
CHAP_DIR.mkdir(parents=True, exist_ok=True)

# iterate over units
for jdx, unit_url in enumerate(urls, 1):
unit = await get_unit(self.context, unit_url)
name = f"{jdx:02}_{slugify(unit.title)}"
for jdx, draft_unit in enumerate(draft_chapter.units, 1):
cache_hash = hash_id(draft_unit.url)
cache_data = Cache.get(cache_hash)

if cache_data:
unit = Unit.model_validate(cache_data)
else:
unit = await get_unit(self.context, draft_unit.url)
Cache.set(cache_hash, unit.model_dump())

file_name = f"{jdx:02}_{unit.slug}"

# download video
if unit.video:
dst = CHAP_DIR / f"{name}.mp4"
Logger.print(f"[{name}.mp4]", "[DOWNLOADING][VIDEO]")
dst = CHAP_DIR / f"{file_name}.mp4"
Logger.print(f"[{dst.name}]", "[DOWNLOADING]")
await m3u8_dl(unit.video.url, dst.as_posix(), headers=HEADERS)

if unit.video.subtitles_url:
dst = CHAP_DIR / f"{name}.vtt"
Logger.print(f"[{name}.vtt]", "[DOWNLOADING][SUBTITLES]")
dst = CHAP_DIR / f"{file_name}.vtt"
Logger.print(f"[{dst.name}]", "[DOWNLOADING]")
await download(unit.video.subtitles_url, dst)

# download lecture
if unit.type == TypeUnit.LECTURE:
Logger.print(f"[{name}.mhtml]", "[DOWNLOADING][LECTURE]")
await self.save_page(
unit.url,
path=CHAP_DIR / f"{name}.mhtml",
)
dst = CHAP_DIR / f"{file_name}.mhtml"
Logger.print(f"[{dst.name}]", "[DOWNLOADING]")
await self.save_page(unit.url, path=dst)

print("=" * 100)

Expand Down
44 changes: 31 additions & 13 deletions src/platzi/collectors.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from playwright.async_api import BrowserContext, Page

from .constants import PLATZI_URL
from .models import TypeUnit, Unit, Video
from .utils import get_m3u8_url, get_subtitles_url
from .models import Chapter, TypeUnit, Unit, Video
from .utils import get_m3u8_url, get_subtitles_url, slugify


async def get_course_title(page: Page) -> str:
Expand All @@ -19,36 +19,53 @@ async def get_course_title(page: Page) -> str:
return title


async def get_chapters_urls(page: Page) -> list[tuple[str, list[str]]]:
async def get_draft_chapters(page: Page) -> list[Chapter]:
SELECTOR = ".Content-feed div.ContentBlock"
EXCEPTION = Exception("No sections found")
try:
locator = page.locator(SELECTOR)
items = []

chapters: list[Chapter] = []
for i in range(await locator.count()):
title = await locator.nth(i).locator("h3").first.text_content()
chapter_name = await locator.nth(i).locator("h3").first.text_content()

if not title:
if not chapter_name:
raise EXCEPTION

block_list_locator = locator.nth(i).locator(".ContentBlock-list a")

urls: list[str] = []
units: list[Unit] = []
for j in range(await block_list_locator.count()):
url = await block_list_locator.nth(j).get_attribute("href")
ITEM_LOCATOR = block_list_locator.nth(j)

if not url:
raise EXCEPTION
unit_url = await ITEM_LOCATOR.get_attribute("href")
unit_title = await ITEM_LOCATOR.locator("h5").first.text_content()

urls.append(PLATZI_URL + url)
if not unit_url or not unit_title:
raise EXCEPTION

items.append((title, urls))
units.append(
Unit(
type=TypeUnit.VIDEO,
title=unit_title,
url=PLATZI_URL + unit_url,
slug=slugify(unit_title),
)
)

chapters.append(
Chapter(
name=chapter_name,
slug=slugify(chapter_name),
units=units,
)
)

except Exception as e:
await page.close()
raise EXCEPTION from e

return items
return chapters


async def get_unit(context: BrowserContext, url: str) -> Unit:
Expand Down Expand Up @@ -82,6 +99,7 @@ async def get_unit(context: BrowserContext, url: str) -> Unit:
title=title,
type=type,
video=video,
slug=slugify(title),
)

except Exception:
Expand Down
6 changes: 4 additions & 2 deletions src/platzi/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import tempfile
from pathlib import Path

SESSION_DIR = Path(tempfile.gettempdir()) / ".platzi"
import platformdirs

APP_NAME = "Platzi"
SESSION_DIR = Path(platformdirs.user_data_dir(APP_NAME))
SESSION_FILE = SESSION_DIR / "state.json"

LOGIN_URL = "https://platzi.com/login"
Expand Down
3 changes: 2 additions & 1 deletion src/platzi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class User(BaseModel):
phone_number: str


class TypeUnit(Enum):
class TypeUnit(str, Enum):
LECTURE = "lecture"
VIDEO = "video"
QUIZ = "quiz"
Expand All @@ -41,6 +41,7 @@ class Unit(BaseModel):
type: TypeUnit
title: str
url: str
slug: str
video: Video | None = None
resources: list[Resource] | None = None

Expand Down
22 changes: 22 additions & 0 deletions src/platzi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from playwright.async_api import Page
from unidecode import unidecode

from .constants import SESSION_DIR
from .helpers import read_json, write_json


async def progressive_scroll(
page: Page, time: float = 3, delay: float = 0.1, steps: int = 250
Expand Down Expand Up @@ -105,3 +108,22 @@ async def download(url: str, path: Path, **kwargs):
with open(path.as_posix(), "wb") as file:
async for chunk in response.content.iter_chunked(1024):
file.write(chunk)


class Cache:
    """Best-effort JSON-file cache keyed by id, stored under SESSION_DIR.

    Each entry lives in its own ``<id>.json`` file. Both operations swallow
    all errors by design: a broken or missing cache must never interrupt
    the caller's work.
    """

    @classmethod
    def get(cls, id: str) -> dict | None:
        """Return the cached dict for *id*, or None if unavailable/unreadable."""
        entry = SESSION_DIR / f"{id}.json"
        try:
            return read_json(entry.as_posix())
        except Exception:
            # Missing file, bad JSON, permission error — treat all as a miss.
            return None

    @classmethod
    def set(cls, id: str, content: dict) -> None:
        """Persist *content* as JSON under *id*; failures are silently ignored."""
        entry = SESSION_DIR / f"{id}.json"
        entry.parent.mkdir(parents=True, exist_ok=True)
        try:
            write_json(entry.as_posix(), content)
        except Exception:
            # Caching is opportunistic; a failed write must not abort the caller.
            pass

0 comments on commit ec810cd

Please sign in to comment.