From 4d3f3fc458941e04efb8a7e2e58ad15076dea1e9 Mon Sep 17 00:00:00 2001 From: William Chu Date: Thu, 18 Apr 2024 20:34:00 +1000 Subject: [PATCH] feat: add ./splat_cli helper script --- lambda_function.py | 26 +++++++++--------- scripts/local.py | 28 ------------------- splat_cli.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 40 deletions(-) delete mode 100644 scripts/local.py create mode 100755 splat_cli.py diff --git a/lambda_function.py b/lambda_function.py index 4c3eb26..f94e160 100644 --- a/lambda_function.py +++ b/lambda_function.py @@ -8,10 +8,14 @@ import tempfile import uuid import xml.etree.ElementTree as ET +from collections.abc import Iterator +from contextlib import contextmanager from dataclasses import dataclass, field from urllib.parse import urlparse import boto3 +import playwright +import playwright.sync_api import pydantic import requests import sentry_sdk @@ -90,7 +94,8 @@ def init() -> None: os.environ["FONTCONFIG_PATH"] = "/var/task/fonts" -def playwright_page_to_pdf(browser_url: str, headers: dict, output_filepath: str) -> None: +@contextmanager +def _playwright_visit_page(browser_url: str, headers: dict) -> Iterator[playwright.sync_api.Page]: print("splat|playwright_handler|url=", browser_url) with sync_playwright() as p: browser = p.chromium.launch() @@ -102,21 +107,18 @@ def playwright_page_to_pdf(browser_url: str, headers: dict, output_filepath: str wait_until="domcontentloaded", ) page.emulate_media(media="print") + page.wait_for_load_state("domcontentloaded") + page.wait_for_load_state("networkidle") + yield page + + +def playwright_page_to_pdf(browser_url: str, headers: dict, output_filepath: str) -> None: + with _playwright_visit_page(browser_url, headers) as page: page.pdf(path=output_filepath, format="A4") def playwright_page_to_html_string(browser_url: str, headers: dict) -> str: - print("splat|playwright_handler|url=", browser_url) - with sync_playwright() as p: - browser = p.chromium.launch() - context = browser.new_context() - context.set_extra_http_headers(headers) - page = context.new_page() - page.goto( - browser_url, - wait_until="domcontentloaded", - ) - page.emulate_media(media="print") + with _playwright_visit_page(browser_url, headers) as page: return page.content() diff --git a/scripts/local.py b/scripts/local.py deleted file mode 100644 index 186bd52..0000000 --- a/scripts/local.py +++ /dev/null @@ -1,28 +0,0 @@ -import base64 -import json -import pathlib - -import requests - -LAMBDA_URL = "http://localhost:8080/2015-03-31/functions/function/invocations" - - -def call_lamdba(body: dict, raise_exception=True) -> tuple[int, dict, bytes]: - response = requests.post(LAMBDA_URL, json={"body": json.dumps(body)}, timeout=60) - if raise_exception: - response.raise_for_status() - data = response.json() - status_code = data["statusCode"] - is_base64_encoded = data["isBase64Encoded"] - if is_base64_encoded: - return status_code, {}, base64.b64decode(data["body"]) - else: - body = json.loads(data.get("body")) if data.get("body") else {} - if raise_exception and status_code not in {200, 201}: - raise Exception(body) - - return status_code, body, b"" - - -_, _, pdf_bytes = call_lamdba({"renderer": "playwright", "document_content": "

Hello world

"}) -pathlib.Path("/tmp/output.pdf").write_bytes(pdf_bytes) # noqa diff --git a/splat_cli.py b/splat_cli.py new file mode 100755 index 0000000..4ed95b1 --- /dev/null +++ b/splat_cli.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# usage: ./splat_cli.py --open -o /tmp/google.pdf -b https://google.com +import argparse +import base64 +import json +import pathlib + +import requests + +DEFAULT_LAMBDA_URL = "http://localhost:8080/2015-03-31/functions/function/invocations" + +parser = argparse.ArgumentParser( + description="Run against splat locally. Sample usage: ./splat_cli.py --open -o /tmp/google.pdf -b https://google.com" +) +parser.add_argument("--document-content", "-c", help="The content of the document") +parser.add_argument("--document-url", "-u", help="The URL of the document") +parser.add_argument("--browser-url", "-b", help="Use a playwright to browse to the url") +parser.add_argument("--renderer", "-r", help="The renderer to use", default="princexml") +parser.add_argument("--output-path", "-o", help="The path to save the output PDF", required=True) +parser.add_argument("--open", help="Open the resulting pdf", default=False, action="store_true") +parser.add_argument( + "--lambda-url", + help="Lambda URL to receive the payload body. Defaults to local dev setup.", + default=DEFAULT_LAMBDA_URL, +) + +args = parser.parse_args() + +document_content = args.document_content +document_url = args.document_url +browser_url = args.browser_url +renderer = args.renderer +output_path = args.output_path +lambda_url = args.lambda_url + + +def call_lamdba(body: dict, raise_exception=True) -> tuple[int, dict, bytes]: + response = requests.post(lambda_url, json={"body": json.dumps(body)}, timeout=60) + if raise_exception: + response.raise_for_status() + data = response.json() + status_code = data["statusCode"] + is_base64_encoded = data["isBase64Encoded"] + if is_base64_encoded: + return status_code, {}, base64.b64decode(data["body"]) + else: + body = json.loads(data.get("body")) if data.get("body") else {} + if raise_exception and status_code not in {200, 201}: + raise Exception(body) + + return status_code, body, b"" + + +if __name__ == "__main__": + body = {"renderer": renderer} + if document_content: + body["document_content"] = document_content + elif document_url: + body["document_url"] = document_url + elif browser_url: + body["browser_url"] = browser_url + _, _, pdf_bytes = call_lamdba(body) + pathlib.Path(output_path).write_bytes(pdf_bytes) + if args.open: + import os + + os.system(f"open {output_path}") # noqa