From e0f464a1578b08ecd6dc742acc2a16d882f63e3b Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 11 Dec 2024 20:15:16 -0500
Subject: [PATCH] fix: correctly patch EOF handling in pdfminer (fixes: #3815)

---
 .../partition/pdf_image/test_pdf.py           | 16 +++-
 unstructured/partition/pdf.py                 |  8 +-
 unstructured/patches/pdfminer.py              | 74 ++++++++++++++++---
 3 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index cea6b44129..b954e0bb7a 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1192,8 +1192,8 @@ def test_partition_pdf_with_fast_finds_headers_footers(
 @pytest.mark.parametrize(
     ("filename", "expected_log"),
     [
+        # This one is *actually* an invalid PDF document
         ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
-        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
     ],
 )
 def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
@@ -1201,6 +1201,20 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
     assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
     assert expected_log in caplog.text
 
 
+@pytest.mark.parametrize(
+    ("filename", "expected_log"),
+    [
+        # This one is *not* an invalid PDF document, make sure we
+        # don't try to "repair" it unnecessarily
+        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
+    ],
+)
+def test_properly_patch_pdfminer(filename, expected_log, caplog):
+    caplog.set_level(logging.INFO)
+    assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
+    assert expected_log not in caplog.text
+
+
 def assert_element_extraction(
     elements: list[Element],
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index f87812d40b..6bde4126fd 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import wrapt
-from pdfminer import psparser
 from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from pi_heif import register_heif_opener
@@ -96,16 +95,15 @@
     PartitionStrategy,
 )
 from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
-from unstructured.patches.pdfminer import parse_keyword
+from unstructured.patches.pdfminer import patch_psparser
 from unstructured.utils import first, requires_dependencies
 
 if TYPE_CHECKING:
     pass
 
 
-# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
-# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
-psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
+# Apply the pdfminer.six parser patches defined in unstructured/patches/pdfminer.py.
+patch_psparser()
 
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
 
diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
index 20b938d1ce..dd69b7fe7d 100644
--- a/unstructured/patches/pdfminer.py
+++ b/unstructured/patches/pdfminer.py
@@ -1,18 +1,39 @@
-from typing import Union
+import functools
+from typing import Tuple, Union
 
-from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
+import pdfminer
+from pdfminer.psparser import (
+    END_KEYWORD,
+    KWD,
+    PSEOF,
+    PSBaseParser,
+    PSBaseParserToken,
+    PSKeyword,
+    log,
+)
 
+# Original (unpatched) ``seek``, kept so the replacement below can delegate to it.
+factory_seek = PSBaseParser.seek
 
-def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
-    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
-    https://github.com/pdfminer/pdfminer.six/pull/885."""
+
+@functools.wraps(PSBaseParser.seek)
+def seek(self: PSBaseParser, pos: int) -> None:
+    # Repositioning makes a previously recorded EOF stale: clear the flag read by ``nexttoken``.
+    factory_seek(self, pos)
+    self.eof = False
+
+
+@functools.wraps(PSBaseParser._parse_keyword)
+def _parse_keyword(self, s: bytes, i: int) -> int:
     m = END_KEYWORD.search(s, i)
-    if not m:
-        j = len(s)
-        self._curtoken += s[i:]
-    else:
+    if m:
         j = m.start(0)
         self._curtoken += s[i:j]
+    else:
+        # No terminator in this buffer: keep what we have and return without
+        # emitting a token, so the keyword is not split at the buffer boundary.
+        self._curtoken += s[i:]
+        return len(s)
     if self._curtoken == b"true":
         token: Union[bool, PSKeyword] = True
     elif self._curtoken == b"false":
@@ -22,3 +43,38 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
     self._add_token(token)
     self._parse1 = self._parse_main
     return j
+
+
+@functools.wraps(PSBaseParser.nexttoken)
+def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
+    if self.eof:
+        # It's not really unexpected, come on now...
+        raise PSEOF("Unexpected EOF")
+    while not self._tokens:
+        try:
+            self.fillbuf()
+            self.charpos = self._parse1(self.buf, self.charpos)
+        except PSEOF:
+            # If we hit EOF in the middle of a token, try to parse
+            # it by tacking on whitespace, and delay raising PSEOF
+            # until next time around
+            self.charpos = self._parse1(b"\n", 0)
+            self.eof = True
+            # Oh, so there wasn't actually a token there? OK.
+            if not self._tokens:
+                raise
+    token = self._tokens.pop(0)
+    log.debug("nexttoken: %r", token)
+    return token
+
+
+def patch_psparser() -> None:
+    """Monkey-patch certain versions of pdfminer.six to avoid breaking
+    tokens across buffers."""
+    # Presuming the bug will be fixed in the next release. pdfminer.six uses
+    # date-style version strings (YYYYMMDD), so comparing them as strings
+    # orders releases chronologically.
+    if pdfminer.__version__ <= "20240706":
+        PSBaseParser.seek = seek
+        PSBaseParser._parse_keyword = _parse_keyword
+        PSBaseParser.nexttoken = nexttoken