tests/test_tesseract.py: update for mupdf master changes.

test_tesseract(): update expected exception text with MuPDF master. test_3842b(): new, check diagnostics when tesseract cannot find language.
pymupdf · Dec 3, 2024 · 25252fb · 25252fb
1 parent fe1299f
commit 25252fb
Showing 1 changed file with 38 additions and 8 deletions.
diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py
@@ -20,8 +20,12 @@ def test_tesseract():
     page = doc[5]
     if hasattr(pymupdf, 'mupdf'):
         # rebased.
+        if pymupdf.mupdf_version_tuple < (1, 26):
+            tail = 'OCR initialisation failed'
+        else:
+            tail = 'Tesseract language initialisation failed'
         if pymupdf.mupdf_version_tuple >= (1, 24):
-            e_expected = 'code=3: OCR initialisation failed'
+            e_expected = f'code=3: {tail}'
             if platform.system() == 'OpenBSD':
                 # 2023-12-12: For some reason the SWIG catch code only catches
                 # the exception as FzErrorBase.
@@ -30,7 +34,7 @@ def test_tesseract():
             else:
                 e_expected_type = pymupdf.mupdf.FzErrorLibrary
         else:
-            e_expected = 'code=2: OCR initialisation failed'
+            e_expected = f'code=2: {tail}'
             e_expected_type = None
     else:
         # classic.
@@ -57,31 +61,57 @@ def test_tesseract():
         rebased = hasattr(pymupdf, 'mupdf')
         if rebased:
             wt = pymupdf.TOOLS.mupdf_warnings()
-            if pymupdf.mupdf_version_tuple < (1, 25):
-                assert wt
-            else:
+            if pymupdf.mupdf_version_tuple < (1, 26):
                 assert wt == (
                         'UNHANDLED EXCEPTION!\n'
-                        'library error: Tesseract initialisation failed\n'
-                        'dropping unclosed output'
+                        'library error: Tesseract initialisation failed'
                         )
+            else:
+                assert not wt
 
 
+def test_3842b():
+    # Check Tesseract failure when given a bogus languages.
+    #
+    # Note that Tesseract seems to output its own diagnostics.
+    #
+    path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf')
+    with pymupdf.open(path) as document:
+        page = document[6]
+        try:
+            partial_tp = page.get_textpage_ocr(flags=0, full=False, language='qwerty')
+        except Exception as e:
+            print(f'test_3842b(): received exception: {e}')
+            if 'No tessdata specified and Tesseract is not installed' in str(e):
+                pass
+            else:
+                if pymupdf.mupdf_version_tuple < (1, 26):
+                    assert 'OCR initialisation failed' in str(e)
+                    wt = pymupdf.TOOLS.mupdf_warnings()
+                    assert wt == 'UNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed\nUNHANDLED EXCEPTION!\nlibrary error: Tesseract initialisation failed', \
+                            f'Unexpected {wt=}'
+                else:
+                    assert 'Tesseract language initialisation failed' in str(e)
+
+
 def test_3842():
     path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf')
     with pymupdf.open(path) as document:
         page = document[6]
         try:
             partial_tp = page.get_textpage_ocr(flags=0, full=False)
         except Exception as e:
+            print(f'test_3842(): received exception: {e}', flush=1)
             assert 'No tessdata specified and Tesseract is not installed' in str(e)
         else:
             text = page.get_text(textpage=partial_tp)
             print()
             print(text)
             print(f'text:\n{text!r}')
 
-            # 2024-11-29: This is the current incorrect output.
+            # 2024-11-29: This is the current incorrect output. We use
+            # underscores for lines containing entirely whitespace (which
+            # textwrap.dedent() unfortunately replaces with empty lines).
             text_expected = textwrap.dedent('''
                     NIST SP 800-223  
                     _