pdfminer · pietermarsman · Jun 28, 2024 · Mar 5, 2024 · Mar 10, 2024 · Mar 10, 2024
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
@@ -0,0 +1,39 @@
+name: CIFuzz
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+permissions: {}
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    permissions:
+      security-events: write
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfminersix'
+        language: python
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfminersix'
+        language: python
+        fuzz-seconds: 800
+        output-sarif: true
+    - name: Upload Crash
+      uses: actions/upload-artifact@v4  # v3 is deprecated and rejected by github.com runners
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
+    - name: Upload Sarif
+      if: always() && steps.build.outcome == 'success'
+      uses: github/codeql-action/upload-sarif@v3  # v2 of the CodeQL action is deprecated
+      with:
+        # Path to SARIF file relative to the root of the repository
+        sarif_file: cifuzz-sarif/results.sarif
+        checkout_path: cifuzz-sarif
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 
 - Support for zipped jpeg's ([#938](https://github.com/pdfminer/pdfminer.six/pull/938))
+- Fuzzing harnesses for integration into Google's OSS-Fuzz
 
 ### Fixed
 

diff --git a/fuzzing/build.sh b/fuzzing/build.sh
@@ -0,0 +1,10 @@
+cd "$SRC"/pdfminer.six
+pip3 install '.[dev]'
+
+# Build fuzzers in $OUT; OSS-Fuzz only picks up seeds named <target>_seed_corpus.zip
+for fuzzer in $(find fuzzing -name '*_fuzzer.py');do
+  compile_python_fuzzer "$fuzzer" --collect-all charset_normalizer --hidden-import=_cffi_backend
+  base_name=$(basename "$fuzzer")
+  base_name_no_ext=${base_name%.*}
+  zip -q "$OUT/${base_name_no_ext}_seed_corpus.zip" "$SRC"/corpus/*
+done
diff --git a/fuzzing/extract_text_fuzzer.py b/fuzzing/extract_text_fuzzer.py
@@ -0,0 +1,45 @@
+import sys
+
+import atheris
+
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports():
+    from pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
+    from pdfminer.high_level import extract_text
+
+from pdfminer.psparser import PSException
+
+
+def TestOneInput(data: bytes):
+    """Run extract_text over one fuzz input; returns -1 when the input is filtered or fails as expected."""
+    if not PDFValidator.is_valid_byte_stream(data):
+        return -1  # Not worth continuing with this test case
+
+    provider = EnhancedFuzzedDataProvider(data)
+    try:
+        with provider.ConsumeMemoryFile() as pdf_file:
+            page_limit = provider.ConsumeIntInRange(0, 1000)
+            page_count = provider.ConsumeIntInRange(0, page_limit)
+            extract_text(
+                pdf_file,
+                maxpages=page_limit,
+                page_numbers=provider.ConsumeIntList(page_count, 2),
+                laparams=PDFValidator.generate_layout_parameters(provider)
+            )
+    except (AssertionError, PSException):
+        return -1
+    except Exception as err:
+        if not PDFValidator.should_ignore_error(err):
+            raise err
+        return -1
+
+
+def main() -> None:  # harness entry point
+    prepare_pdfminer_fuzzing()  # raise the pdfminer log threshold before fuzzing starts
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzzing/extract_text_to_fp_fuzzer.py b/fuzzing/extract_text_to_fp_fuzzer.py
@@ -0,0 +1,63 @@
+import io
+import sys
+
+import atheris
+
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports():
+    from pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
+    from pdfminer.high_level import extract_text_to_fp
+    from pdfminer.psparser import PSException
+
+available_output_formats = [  # values offered to extract_text_to_fp's output_type
+    'text',
+    'html',
+    'xml',
+    'tag'
+]
+available_layout_modes = [  # values offered to extract_text_to_fp's layoutmode
+    'exact',
+    'normal',
+    'loose'
+]
+
+
+def TestOneInput(data: bytes):
+    """Run extract_text_to_fp over one fuzz input; returns -1 when the input is filtered or fails as expected."""
+    if not PDFValidator.is_valid_byte_stream(data):
+        return -1  # Not worth continuing with this test case
+
+    provider = EnhancedFuzzedDataProvider(data)
+
+    try:
+        with provider.ConsumeMemoryFile(all_data=False) as source, io.BytesIO() as sink:
+            page_limit = provider.ConsumeIntInRange(0, 1000)
+            extract_text_to_fp(
+                source,
+                sink,
+                output_type=provider.PickValueInList(available_output_formats),
+                laparams=PDFValidator.generate_layout_parameters(provider),
+                maxpages=page_limit,
+                page_numbers=provider.ConsumeIntList(provider.ConsumeIntInRange(0, page_limit), 2),
+                scale=provider.ConsumeFloatInRange(0.0, 2.0),
+                rotation=provider.ConsumeIntInRange(0, 360),
+                layoutmode=provider.PickValueInList(available_layout_modes),
+                strip_control=provider.ConsumeBool()
+            )
+    except (AssertionError, PSException):
+        return -1
+    except Exception as err:
+        if not PDFValidator.should_ignore_error(err):
+            raise err
+        return -1
+
+
+def main():  # harness entry point, same shape as the other fuzzers' main()
+    prepare_pdfminer_fuzzing()  # done here, not in the guard, so main() is self-contained
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzzing/fuzz_helpers.py b/fuzzing/fuzz_helpers.py
@@ -0,0 +1,46 @@
+import io
+import tempfile
+import atheris
+import contextlib
+from typing import List, Set, Dict, Tuple, Any
+
+
+class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
+    """FuzzedDataProvider with helpers for variable-length values and file-like inputs."""
+
+    def ConsumeRandomBytes(self) -> bytes:
+        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))
+
+    def ConsumeRandomString(self) -> str:
+        return self.ConsumeUnicodeNoSurrogates(self.ConsumeIntInRange(0, self.remaining_bytes()))
+
+    def ConsumeRemainingString(self) -> str:
+        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())
+
+    def ConsumeRemainingBytes(self) -> bytes:
+        return self.ConsumeBytes(self.remaining_bytes())
+
+    def _consume_file_data(self, all_data: bool, as_bytes: bool) -> Any:
+        """Consume a file payload: all remaining input, or a fuzzer-chosen amount of it."""
+        if all_data:
+            return self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString()
+        return self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString()
+
+    @contextlib.contextmanager
+    def ConsumeMemoryFile(self, all_data: bool = False, as_bytes: bool = True) -> io.BytesIO:
+        """Yield an in-memory file (BytesIO, or StringIO when as_bytes is False); closed on exit."""
+        file_data = self._consume_file_data(all_data, as_bytes)
+        # The stream's own context manager closes it even when the caller's block raises
+        with (io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)) as file:
+            yield file
+
+    @contextlib.contextmanager
+    def ConsumeTemporaryFile(self, suffix: str, all_data: bool = False, as_bytes: bool = True) -> str:
+        """Yield the name of a temporary file holding the consumed data; closed on exit."""
+        file_data = self._consume_file_data(all_data, as_bytes)
+        # Using the temp file as a context manager closes it even when the caller's block raises
+        with tempfile.NamedTemporaryFile(mode='w+b' if as_bytes else 'w+', suffix=suffix) as tfile:
+            tfile.write(file_data)
+            tfile.flush()  # make the data visible to readers that open the file by name
+            yield tfile.name
+
diff --git a/fuzzing/page_extraction_fuzzer.py b/fuzzing/page_extraction_fuzzer.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+import atheris
+import sys
+
+from fuzz_helpers import EnhancedFuzzedDataProvider
+
+with atheris.instrument_imports():
+    from pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
+    from pdfminer.high_level import extract_pages
+    from pdfminer.psparser import PSException
+
+
+def TestOneInput(data: bytes):
+    """Run extract_pages over one fuzz input; returns -1 when the input is filtered or fails as expected."""
+    if not PDFValidator.is_valid_byte_stream(data):
+        return -1  # Not worth continuing with this test case
+
+    provider = EnhancedFuzzedDataProvider(data)
+    try:
+        with provider.ConsumeMemoryFile() as pdf_file:
+            page_limit = provider.ConsumeIntInRange(0, 1000)
+            page_count = provider.ConsumeIntInRange(0, page_limit)
+            list(extract_pages(
+                pdf_file,
+                maxpages=page_limit,
+                page_numbers=provider.ConsumeIntList(page_count, 2),
+                laparams=PDFValidator.generate_layout_parameters(provider)
+            ))
+    except (AssertionError, PSException):
+        return -1
+    except Exception as err:
+        if not PDFValidator.should_ignore_error(err):
+            raise err
+        return -1
+
+
+def main() -> None:  # harness entry point
+    prepare_pdfminer_fuzzing()  # raise the pdfminer log threshold before fuzzing starts
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzzing/pdf_utils.py b/fuzzing/pdf_utils.py
@@ -0,0 +1,69 @@
+"""
+Utilities shared across the various PDF fuzzing harnesses
+"""
+import logging
+from typing import Optional
+
+import atheris
+
+from pdfminer.layout import LAParams
+from pdfminer.psparser import PSException
+
+# Substrings of the messages of exceptions that pdfminer raises explicitly but that do not inherit from PSException
+_EXPLICIT_EXCEPTION_MESSAGES = [
+    'Unsupported',
+    'duplicate labels',
+    'AcroForm',
+    'SASLPrep',
+    'Invalid'
+]
+
+
+def prepare_pdfminer_fuzzing() -> None:
+    """
+    Raises the 'pdfminer' logger's level to CRITICAL so that less severe records are not emitted
+    """
+    logging.getLogger('pdfminer').setLevel(logging.CRITICAL)
+
+
+class PDFValidator:
+    """
+    Input pre-checks, parameter generation and error triage shared by the PDF fuzzing harnesses
+    """
+    _PDF_MAGIC_BYTES = b'%PDF-'
+
+    @staticmethod
+    @atheris.instrument_func
+    def is_valid_byte_stream(data: bytes) -> bool:
+        """
+        Cheap pre-filter deciding whether the input is worth passing to the library
+        :param data: The raw fuzzer input
+        :return: True only if data starts with the PDF magic bytes and contains b'/Root'
+        """
+        return data.startswith(PDFValidator._PDF_MAGIC_BYTES) and b'/Root' in data
+
+    @staticmethod
+    @atheris.instrument_func
+    def generate_layout_parameters(fdp: atheris.FuzzedDataProvider) -> Optional[LAParams]:
+        """Builds LAParams from fuzz data, or returns None when the first consumed bool is False"""
+        if not fdp.ConsumeBool():
+            return None
+        return LAParams(
+            line_overlap=fdp.ConsumeFloat(),
+            char_margin=fdp.ConsumeFloat(),
+            line_margin=fdp.ConsumeFloat(),
+            word_margin=fdp.ConsumeFloat(),
+            boxes_flow=fdp.ConsumeFloatInRange(-1.0, 1.0) if fdp.ConsumeBool() else None,
+            detect_vertical=fdp.ConsumeBool(),
+            all_texts=fdp.ConsumeBool()
+        )
+
+    @staticmethod
+    def should_ignore_error(e: Exception) -> bool:
+        """
+        Tells whether an exception is one that pdfminer raises explicitly
+        :return: True if e is a PSException or its message contains a known substring
+        """
+        if isinstance(e, PSException):
+            return True
+        return any(fragment in str(e) for fragment in _EXPLICIT_EXCEPTION_MESSAGES)
diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
@@ -977,7 +977,7 @@ def find_xref(self, parser: PDFParser) -> int:
         else:
             raise PDFNoValidXRef("Unexpected EOF")
         log.debug("xref found: pos=%r", prev)
-        assert prev is not None
+        assert prev is not None and prev.isdigit()
         return int(prev)
 
     # read xref table

diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
@@ -80,7 +80,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                     assert self.doc is not None
                     obj = PDFObjRef(self.doc, objid, genno)
                     self.push((pos, obj))
-                except PSSyntaxError:
+                except (TypeError, PSSyntaxError):
                     pass
         elif token is self.KEYWORD_STREAM:
             # stream object
@@ -163,7 +163,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 (objid, genno) = (int(objid), int(genno))  # type: ignore[arg-type]
                 obj = PDFObjRef(self.doc, objid, genno)
                 self.push((pos, obj))
-            except PSSyntaxError:
+            except (TypeError, PSSyntaxError):
                 pass
             return
         elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):

diff --git a/setup.py b/setup.py
@@ -1,10 +1,22 @@
+import sys
+
 from pathlib import Path
 from setuptools import setup
 
 root_dir = Path(__file__).parent
 with open(root_dir / "README.md", "rt") as f:
     readme = f.read()
 
+extras_require = {
+    # There is currently no atheris support for Python 3.12. An environment
+    # marker (like the ones in install_requires) is evaluated at install time,
+    # whereas a sys.version_info check is frozen into the built wheel.
+    "dev": ["pytest", "nox", "black", "mypy == 0.931",
+            'atheris; python_version < "3.12"'],
+    "docs": ["sphinx", "sphinx-argparse"],
+    "image": ["Pillow"],
+}
+
 setup(
     name="pdfminer.six",
     setuptools_git_versioning={
@@ -19,11 +31,7 @@
         'typing_extensions; python_version < "3.8"',
         'importlib_metadata; python_version < "3.8"',
     ],
-    extras_require={
-        "dev": ["pytest", "nox", "black", "mypy == 0.931"],
-        "docs": ["sphinx", "sphinx-argparse"],
-        "image": ["Pillow"],
-    },
+    extras_require=extras_require,
     description="PDF parser and analyzer",
     long_description=readme,
     long_description_content_type="text/markdown",