Skip to content

Commit

Permalink
Merge pull request #17 from ivansaul/refactor
Browse files Browse the repository at this point in the history
feat: add caching mechanism for API responses
  • Loading branch information
ivansaul authored Nov 18, 2024
2 parents adeb301 + df8ef74 commit ec810cd
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 37 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,8 @@ jobs:
- name: Lint with Ruff
run: poetry run ruff check --output-format=github .

- name: Type Check with MyPy
run: poetry run mypy --pretty --show-error-codes .

- name: Run Tests with Pytest
run: poetry run pytest
66 changes: 65 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ aiohttp = "^3.11.2"
tqdm = "^4.67.0"
pytest-playwright = "^0.5.2"
unidecode = "^1.3.8"
platformdirs = "^4.3.6"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.3"
ruff = "^0.7.4"
ipykernel = "^6.29.5"
python-semantic-release = "^9.14.0"
mypy = "^1.13.0"

[build-system]
requires = ["poetry-core"]
Expand Down
46 changes: 26 additions & 20 deletions src/platzi/async_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

from playwright.async_api import BrowserContext, Page, async_playwright

from .collectors import get_chapters_urls, get_course_title, get_unit
from .collectors import get_course_title, get_draft_chapters, get_unit
from .constants import HEADERS, LOGIN_DETAILS_URL, LOGIN_URL, SESSION_FILE
from .helpers import read_json, write_json
from .helpers import hash_id, read_json, write_json
from .logger import Logger
from .m3u8 import m3u8_dl
from .models import TypeUnit, User
from .utils import download, progressive_scroll, slugify
from .models import TypeUnit, Unit, User
from .utils import Cache, download, progressive_scroll, slugify


def login_required(func):
Expand Down Expand Up @@ -143,36 +143,42 @@ async def download(self, url: str, **kwargs):
)

# iterate over chapters
chapters_urls = await get_chapters_urls(page)
for idx, (title, urls) in enumerate(chapters_urls, 1):
print(f"{title}")
draft_chapters = await get_draft_chapters(page)
for idx, draft_chapter in enumerate(draft_chapters, 1):
Logger.info(f"Downloading {draft_chapter.name}")

CHAP_DIR = DL_DIR / f"{idx:02}_{slugify(title)}"
CHAP_DIR = DL_DIR / f"{idx:02}_{draft_chapter.slug}"
CHAP_DIR.mkdir(parents=True, exist_ok=True)

# iterate over units
for jdx, unit_url in enumerate(urls, 1):
unit = await get_unit(self.context, unit_url)
name = f"{jdx:02}_{slugify(unit.title)}"
for jdx, draft_unit in enumerate(draft_chapter.units, 1):
cache_hash = hash_id(draft_unit.url)
cache_data = Cache.get(cache_hash)

if cache_data:
unit = Unit.model_validate(cache_data)
else:
unit = await get_unit(self.context, draft_unit.url)
Cache.set(cache_hash, unit.model_dump())

file_name = f"{jdx:02}_{unit.slug}"

# download video
if unit.video:
dst = CHAP_DIR / f"{name}.mp4"
Logger.print(f"[{name}.mp4]", "[DOWNLOADING][VIDEO]")
dst = CHAP_DIR / f"{file_name}.mp4"
Logger.print(f"[{dst.name}]", "[DOWNLOADING]")
await m3u8_dl(unit.video.url, dst.as_posix(), headers=HEADERS)

if unit.video.subtitles_url:
dst = CHAP_DIR / f"{name}.vtt"
Logger.print(f"[{name}.vtt]", "[DOWNLOADING][SUBTITLES]")
dst = CHAP_DIR / f"{file_name}.vtt"
Logger.print(f"[{dst.name}]", "[DOWNLOADING]")
await download(unit.video.subtitles_url, dst)

# download lecture
if unit.type == TypeUnit.LECTURE:
Logger.print(f"[{name}.mhtml]", "[DOWNLOADING][LECTURE]")
await self.save_page(
unit.url,
path=CHAP_DIR / f"{name}.mhtml",
)
dst = CHAP_DIR / f"{file_name}.mhtml"
Logger.print(f"[{dst.name}]", "[DOWNLOADING]")
await self.save_page(unit.url, path=dst)

print("=" * 100)

Expand Down
44 changes: 31 additions & 13 deletions src/platzi/collectors.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from playwright.async_api import BrowserContext, Page

from .constants import PLATZI_URL
from .models import TypeUnit, Unit, Video
from .utils import get_m3u8_url, get_subtitles_url
from .models import Chapter, TypeUnit, Unit, Video
from .utils import get_m3u8_url, get_subtitles_url, slugify


async def get_course_title(page: Page) -> str:
Expand All @@ -19,36 +19,53 @@ async def get_course_title(page: Page) -> str:
return title


async def get_chapters_urls(page: Page) -> list[tuple[str, list[str]]]:
async def get_draft_chapters(page: Page) -> list[Chapter]:
SELECTOR = ".Content-feed div.ContentBlock"
EXCEPTION = Exception("No sections found")
try:
locator = page.locator(SELECTOR)
items = []

chapters: list[Chapter] = []
for i in range(await locator.count()):
title = await locator.nth(i).locator("h3").first.text_content()
chapter_name = await locator.nth(i).locator("h3").first.text_content()

if not title:
if not chapter_name:
raise EXCEPTION

block_list_locator = locator.nth(i).locator(".ContentBlock-list a")

urls: list[str] = []
units: list[Unit] = []
for j in range(await block_list_locator.count()):
url = await block_list_locator.nth(j).get_attribute("href")
ITEM_LOCATOR = block_list_locator.nth(j)

if not url:
raise EXCEPTION
unit_url = await ITEM_LOCATOR.get_attribute("href")
unit_title = await ITEM_LOCATOR.locator("h5").first.text_content()

urls.append(PLATZI_URL + url)
if not unit_url or not unit_title:
raise EXCEPTION

items.append((title, urls))
units.append(
Unit(
type=TypeUnit.VIDEO,
title=unit_title,
url=PLATZI_URL + unit_url,
slug=slugify(unit_title),
)
)

chapters.append(
Chapter(
name=chapter_name,
slug=slugify(chapter_name),
units=units,
)
)

except Exception as e:
await page.close()
raise EXCEPTION from e

return items
return chapters


async def get_unit(context: BrowserContext, url: str) -> Unit:
Expand Down Expand Up @@ -82,6 +99,7 @@ async def get_unit(context: BrowserContext, url: str) -> Unit:
title=title,
type=type,
video=video,
slug=slugify(title),
)

except Exception:
Expand Down
6 changes: 4 additions & 2 deletions src/platzi/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import tempfile
from pathlib import Path

SESSION_DIR = Path(tempfile.gettempdir()) / ".platzi"
import platformdirs

APP_NAME = "Platzi"
SESSION_DIR = Path(platformdirs.user_data_dir(APP_NAME))
SESSION_FILE = SESSION_DIR / "state.json"

LOGIN_URL = "https://platzi.com/login"
Expand Down
3 changes: 2 additions & 1 deletion src/platzi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class User(BaseModel):
phone_number: str


class TypeUnit(Enum):
class TypeUnit(str, Enum):
LECTURE = "lecture"
VIDEO = "video"
QUIZ = "quiz"
Expand All @@ -41,6 +41,7 @@ class Unit(BaseModel):
type: TypeUnit
title: str
url: str
slug: str
video: Video | None = None
resources: list[Resource] | None = None

Expand Down
22 changes: 22 additions & 0 deletions src/platzi/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from playwright.async_api import Page
from unidecode import unidecode

from .constants import SESSION_DIR
from .helpers import read_json, write_json


async def progressive_scroll(
page: Page, time: float = 3, delay: float = 0.1, steps: int = 250
Expand Down Expand Up @@ -105,3 +108,22 @@ async def download(url: str, path: Path, **kwargs):
with open(path.as_posix(), "wb") as file:
async for chunk in response.content.iter_chunked(1024):
file.write(chunk)


class Cache:
    """Best-effort JSON-file cache keyed by id, stored under SESSION_DIR.

    Each entry lives in its own ``<id>.json`` file. Both operations swallow
    all errors by design: a broken or missing cache must never interrupt
    the caller's work.
    """

    @classmethod
    def get(cls, id: str) -> dict | None:
        """Return the cached dict for *id*, or None if unavailable/unreadable."""
        entry = SESSION_DIR / f"{id}.json"
        try:
            return read_json(entry.as_posix())
        except Exception:
            # Missing file, bad JSON, permission error — treat all as a miss.
            return None

    @classmethod
    def set(cls, id: str, content: dict) -> None:
        """Persist *content* as JSON under *id*; failures are silently ignored."""
        entry = SESSION_DIR / f"{id}.json"
        entry.parent.mkdir(parents=True, exist_ok=True)
        try:
            write_json(entry.as_posix(), content)
        except Exception:
            # Caching is opportunistic; a failed write must not abort the caller.
            pass

0 comments on commit ec810cd

Please sign in to comment.