diff --git a/autogen/browser_utils/mdconvert.py b/autogen/browser_utils/mdconvert.py index d38b023cd973..4be7fa94626a 100644 --- a/autogen/browser_utils/mdconvert.py +++ b/autogen/browser_utils/mdconvert.py @@ -28,20 +28,8 @@ # File-format detection import puremagic import requests -from binaryornot.check import is_binary from bs4 import BeautifulSoup -# Optional OCR support -IS_OCR_CAPABLE = False -try: - import easyocr - import numpy as np - import PIL - - IS_OCR_CAPABLE = True -except ModuleNotFoundError: - pass - # Optional Transcription support IS_AUDIO_TRANSCRIPTION_CAPABLE = False try: @@ -155,10 +143,9 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: # Guess the content type from any file extension that might be around content_type, encoding = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", "")) + # Only work with text if content_type is None: - # No content type, so peek at the file and see if it's binary - if is_binary(local_path): - return None + return None elif "text/" not in content_type.lower(): return None @@ -725,8 +712,6 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: if extension.lower() not in [".jpg", ".jpeg", ".png"]: return None - ocr_min_confidence = kwargs.get("ocr_min_confidence", 0.25) - md_content = "" # Add metadata @@ -756,25 +741,6 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + "\n" ) - if IS_OCR_CAPABLE: - image = PIL.Image.open(local_path) - # Remove transparency - if image.mode in ("RGBA", "P"): - image = image.convert("RGB") - - reader = easyocr.Reader(["en"]) # specify the language(s) - output = reader.readtext(np.array(image)) # local_path) - # The output is a list of tuples, each containing the coordinates of the text and the text itself. - # We join all the text pieces together to get the final text. - ocr_text = " " - for item in output: - if item[2] >= ocr_min_confidence: - ocr_text += item[1] + " " - ocr_text = ocr_text.strip() - - if len(ocr_text) > 0: - md_content += "\n# Text detected by OCR:\n" + ocr_text - return DocumentConverterResult( title=None, text_content=md_content, diff --git a/setup.py b/setup.py index e61ab8a23493..362aa1217986 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,6 @@ "pathvalidate", # for mdconvert "puremagic", # File identification - "binaryornot", # More file identification "pdfminer.six", # Pdf "mammoth", # Docx "python-pptx", # Ppts diff --git a/test/test_browser_utils.py b/test/test_browser_utils.py deleted file mode 100755 index e6bc3b220ef6..000000000000 --- a/test/test_browser_utils.py +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 -m pytest -import hashlib -import math -import os -import re -import tempfile - -import pytest -import requests -from agentchat.test_assistant_agent import KEY_LOC # noqa: E402 - -BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" -BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen" -BLOG_POST_STRING = "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?" - -WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft" -WIKIPEDIA_TITLE = "Microsoft - Wikipedia" -WIKIPEDIA_STRING = "Redmond" - -PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md" -IMAGE_URL = "https://github.com/afourney.png" - -PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf" -PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations." - -BING_QUERY = "Microsoft" -BING_TITLE = f"{BING_QUERY} - Search" -BING_STRING = f"A Bing search for '{BING_QUERY}' found" - -try: - from autogen.browser_utils import SimpleTextBrowser -except ImportError: - skip_all = True -else: - skip_all = False - -try: - BING_API_KEY = os.environ["BING_API_KEY"] -except KeyError: - skip_bing = True -else: - skip_bing = False - - -def _rm_folder(path): - """Remove all the regular files in a folder, then deletes the folder. Assumes a flat file structure, with no subdirectories.""" - for fname in os.listdir(path): - fpath = os.path.join(path, fname) - if os.path.isfile(fpath): - os.unlink(fpath) - os.rmdir(path) - - -@pytest.mark.skipif( - skip_all, - reason="do not run if dependency is not installed", -) -def test_simple_text_browser(): - # Create a temp downloads folder (removing any leftover ones from prior tests) - with tempfile.TemporaryDirectory() as downloads_folder: - # Instantiate the browser - user_agent = "python-requests/" + requests.__version__ - viewport_size = 1024 - browser = SimpleTextBrowser( - downloads_folder=downloads_folder, - viewport_size=viewport_size, - request_kwargs={ - "headers": {"User-Agent": user_agent}, - }, - ) - - # Test that we can visit a page and find what we expect there - top_viewport = browser.visit_page(BLOG_POST_URL) - assert browser.viewport == top_viewport - assert browser.page_title.strip() == BLOG_POST_TITLE.strip() - assert BLOG_POST_STRING in browser.page_content - - # Check if page splitting works - approx_pages = math.ceil( - len(browser.page_content) / viewport_size - ) # May be fewer, since it aligns to word breaks - assert len(browser.viewport_pages) <= approx_pages - assert abs(len(browser.viewport_pages) - approx_pages) <= 1 # allow only a small deviation - assert browser.viewport_pages[0][0] == 0 - assert browser.viewport_pages[-1][1] == len(browser.page_content) - - # Make sure we can reconstruct the full contents from the split pages - buffer = "" - for bounds in browser.viewport_pages: - buffer += browser.page_content[bounds[0] : bounds[1]] - assert buffer == browser.page_content - - # Test scrolling (scroll all the way to the bottom) - for i in range(1, len(browser.viewport_pages)): - browser.page_down() - assert browser.viewport_current_page == i - # Test scrolloing beyond the limits - for i in range(0, 5): - browser.page_down() - assert browser.viewport_current_page == len(browser.viewport_pages) - 1 - - # Test scrolling (scroll all the way to the bottom) - for i in range(len(browser.viewport_pages) - 2, 0, -1): - browser.page_up() - assert browser.viewport_current_page == i - # Test scrolloing beyond the limits - for i in range(0, 5): - browser.page_up() - assert browser.viewport_current_page == 0 - - # Test Wikipedia handling - assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL) - assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip() - - # Visit a plain-text file - response = requests.get(PLAIN_TEXT_URL) - response.raise_for_status() - expected_results = response.text - - browser.visit_page(PLAIN_TEXT_URL) - assert browser.page_content.strip() == expected_results.strip() - - # Directly download an image, and compute its md5 - response = requests.get(IMAGE_URL, stream=True) - response.raise_for_status() - expected_md5 = hashlib.md5(response.raw.read()).hexdigest() - - # Visit an image causing it to be downloaded by the SimpleTextBrowser, then compute its md5 - viewport = browser.visit_page(IMAGE_URL) - m = re.search(r"Downloaded '(.*?)' to '(.*?)'", viewport) - fetched_url = m.group(1) - download_loc = m.group(2) - assert fetched_url == IMAGE_URL - - with open(download_loc, "rb") as fh: - downloaded_md5 = hashlib.md5(fh.read()).hexdigest() - - # MD%s should match - assert expected_md5 == downloaded_md5 - - # Fetch a PDF - viewport = browser.visit_page(PDF_URL) - assert PDF_STRING in viewport - - -@pytest.mark.skipif( - skip_bing, - reason="do not run bing tests if key is missing", -) -def test_bing_search(): - # Instantiate the browser - user_agent = "python-requests/" + requests.__version__ - browser = SimpleTextBrowser( - bing_api_key=BING_API_KEY, - viewport_size=1024, - request_kwargs={ - "headers": {"User-Agent": user_agent}, - }, - ) - - assert BING_STRING in browser.visit_page("bing: " + BING_QUERY) - assert BING_TITLE == browser.page_title - assert len(browser.viewport_pages) == 1 - assert browser.viewport_pages[0] == (0, len(browser.page_content)) - - -if __name__ == "__main__": - """Runs this file's tests from the command line.""" - test_simple_text_browser() - test_bing_search()