Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OSSFuzz Integration #949

Merged
merged 21 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/cifuzz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Runs the OSS-Fuzz CIFuzz actions for the 'pdfminersix' project on pushes to
# master and on every pull request.
name: CIFuzz
on:
  push:
    branches:
      - master
  pull_request:
# No permissions by default; the job below opts in to what it needs.
permissions: {}
jobs:
  Fuzzing:
    runs-on: ubuntu-latest
    permissions:
      security-events: write
    steps:
      - name: Build Fuzzers
        id: build
        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfminersix'
          language: python
      - name: Run Fuzzers
        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfminersix'
          language: python
          fuzz-seconds: 800
          output-sarif: true
      - name: Upload Crash
        # v3 of upload-artifact has been retired by GitHub; v4 accepts the same inputs
        uses: actions/upload-artifact@v4
        # Only when a later step failed although the fuzzers were built
        if: failure() && steps.build.outcome == 'success'
        with:
          name: artifacts
          path: ./out/artifacts
      - name: Upload Sarif
        if: always() && steps.build.outcome == 'success'
        # v2 of the CodeQL actions is deprecated; v3 accepts the same inputs
        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
          sarif_file: cifuzz-sarif/results.sarif
          checkout_path: cifuzz-sarif
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added

- Support for zipped JPEGs ([#938](https://github.com/pdfminer/pdfminer.six/pull/938))
- Fuzzing harnesses for integration into Google's OSS-Fuzz ([#949](https://github.com/pdfminer/pdfminer.six/pull/949))
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved

### Fixed

Expand Down
10 changes: 10 additions & 0 deletions fuzzing/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash -eu
# Builds every fuzzing harness under fuzzing/ and packages the seed corpus.
# Expects $SRC and $OUT to be set and compile_python_fuzzer to be on PATH,
# all supplied by the calling build environment.
cd "$SRC"/pdfminer.six
# Quoted so the shell cannot expand the extras specifier as a glob pattern
pip3 install '.[dev]'

# Build fuzzers in $OUT
for fuzzer in $(find fuzzing -name '*_fuzzer.py'); do
  compile_python_fuzzer "$fuzzer" --collect-all charset_normalizer --hidden-import=_cffi_backend
  base_name=$(basename "$fuzzer")
  base_name_no_ext=${base_name%.*}
  # OSS-Fuzz only picks up a seed corpus named <fuzz target>_seed_corpus.zip
  # that sits next to the fuzz target in $OUT.
  zip -q "$OUT/${base_name_no_ext}_seed_corpus.zip" "$SRC"/corpus/*
done
45 changes: 45 additions & 0 deletions fuzzing/extract_text_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import sys

import atheris

from fuzz_helpers import EnhancedFuzzedDataProvider
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved

with atheris.instrument_imports():
from pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
from pdfminer.high_level import extract_text

from pdfminer.psparser import PSException


def TestOneInput(data: bytes):
    """
    Fuzz ``extract_text`` with a single fuzzer-generated input.

    :param data: Raw bytes supplied by the fuzzing engine
    :return: -1 when the input fails the PDF pre-check or when the raised error
        is one the harness tolerates; None after a run that raised nothing.
        Any other exception propagates to the caller.
    """
    if not PDFValidator.is_valid_byte_stream(data):
        # Not worth continuing with this test case
        return -1

    provider = EnhancedFuzzedDataProvider(data)

    try:
        with provider.ConsumeMemoryFile() as pdf_file:
            # Values are drawn from the provider in the same order as the
            # arguments they feed, so the byte layout of an input stays stable.
            max_pages = provider.ConsumeIntInRange(0, 1000)
            page_count = provider.ConsumeIntInRange(0, max_pages)
            page_numbers = provider.ConsumeIntList(page_count, 2)
            layout = PDFValidator.generate_layout_parameters(provider)
            extract_text(
                pdf_file,
                maxpages=max_pages,
                page_numbers=page_numbers,
                laparams=layout,
            )
    except (AssertionError, PSException):
        return -1
    except Exception as error:
        if not PDFValidator.should_ignore_error(error):
            raise error
        return -1


def main():
    """Quiet the pdfminer logger, register ``TestOneInput`` with atheris and start fuzzing."""
    prepare_pdfminer_fuzzing()
    atheris.Setup(sys.argv, TestOneInput)
    atheris.Fuzz()


# Only start fuzzing when this file is executed as a script.
if __name__ == "__main__":
    main()
63 changes: 63 additions & 0 deletions fuzzing/extract_text_to_fp_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import io
import sys

import atheris

from fuzz_helpers import EnhancedFuzzedDataProvider

with atheris.instrument_imports():
from pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
from pdfminer.high_level import extract_text_to_fp
from pdfminer.psparser import PSException

# Candidate values the harness picks from for the ``output_type`` argument
available_output_formats = ['text', 'html', 'xml', 'tag']
# Candidate values the harness picks from for the ``layoutmode`` argument
available_layout_modes = ['exact', 'normal', 'loose']


def TestOneInput(data: bytes):
    """
    Fuzz ``extract_text_to_fp`` with a single fuzzer-generated input.

    :param data: Raw bytes supplied by the fuzzing engine
    :return: -1 when the input fails the PDF pre-check or when the raised error
        is one the harness tolerates; None after a run that raised nothing.
        Any other exception propagates to the caller.
    """
    if not PDFValidator.is_valid_byte_stream(data):
        # Not worth continuing with this test case
        return -1

    provider = EnhancedFuzzedDataProvider(data)

    try:
        with provider.ConsumeMemoryFile(all_data=False) as pdf_in:
            with io.BytesIO() as sink:
                max_pages = provider.ConsumeIntInRange(0, 1000)
                # The dict entries are evaluated top to bottom, which keeps the
                # order in which bytes are consumed from the provider fixed.
                options = {
                    "output_type": provider.PickValueInList(available_output_formats),
                    "laparams": PDFValidator.generate_layout_parameters(provider),
                    "maxpages": max_pages,
                    "page_numbers": provider.ConsumeIntList(
                        provider.ConsumeIntInRange(0, max_pages), 2
                    ),
                    "scale": provider.ConsumeFloatInRange(0.0, 2.0),
                    "rotation": provider.ConsumeIntInRange(0, 360),
                    "layoutmode": provider.PickValueInList(available_layout_modes),
                    "strip_control": provider.ConsumeBool(),
                }
                extract_text_to_fp(pdf_in, sink, **options)
    except (AssertionError, PSException):
        return -1
    except Exception as error:
        if not PDFValidator.should_ignore_error(error):
            raise error
        return -1


def main():
    """Quiet the pdfminer logger, register ``TestOneInput`` with atheris and start fuzzing."""
    # Logging is silenced inside main(), as the other harnesses do, so the
    # setup also happens when main() is called from outside the script guard.
    prepare_pdfminer_fuzzing()
    atheris.Setup(sys.argv, TestOneInput)
    atheris.Fuzz()


if __name__ == "__main__":
    main()
46 changes: 46 additions & 0 deletions fuzzing/fuzz_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import contextlib
import io
import tempfile
from typing import Any, Dict, Iterator, List, Set, Tuple, Union

import atheris


class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
    """
    ``atheris.FuzzedDataProvider`` extended with helpers that consume
    variable-sized data and expose it as in-memory or temporary files.
    """

    def ConsumeRandomBytes(self) -> bytes:
        """Consume a fuzzer-chosen number of bytes, between zero and all that remain."""
        return self.ConsumeBytes(self.ConsumeIntInRange(0, self.remaining_bytes()))

    def ConsumeRandomString(self) -> str:
        """Consume a surrogate-free string whose length argument is chosen by the fuzzer."""
        return self.ConsumeUnicodeNoSurrogates(self.ConsumeIntInRange(0, self.remaining_bytes()))

    def ConsumeRemainingString(self) -> str:
        """Consume a surrogate-free string, passing the remaining byte count as its length."""
        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())

    def ConsumeRemainingBytes(self) -> bytes:
        """Consume every byte that is still available."""
        return self.ConsumeBytes(self.remaining_bytes())

    @contextlib.contextmanager
    def ConsumeMemoryFile(
        self, all_data: bool = False, as_bytes: bool = True
    ) -> Iterator[Union[io.BytesIO, io.StringIO]]:
        """
        Context manager yielding an in-memory file filled with fuzzer data.

        :param all_data: Use all remaining data instead of a fuzzer-chosen amount
        :param as_bytes: Yield an ``io.BytesIO`` when true, an ``io.StringIO`` otherwise
        :return: The open file object; it is closed when the ``with`` block
            exits, including when the block raises
        """
        if all_data:
            file_data = self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString()
        else:
            file_data = self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString()

        # ``with`` guarantees the close even if the caller's block raises,
        # which a bare ``yield`` followed by ``close()`` does not.
        with (io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)) as file:
            yield file

    @contextlib.contextmanager
    def ConsumeTemporaryFile(
        self, suffix: str, all_data: bool = False, as_bytes: bool = True
    ) -> Iterator[str]:
        """
        Context manager yielding the path of a temporary file filled with fuzzer data.

        :param suffix: Suffix given to the temporary file name
        :param all_data: Use all remaining data instead of a fuzzer-chosen amount
        :param as_bytes: Write bytes when true, text otherwise
        :return: The file's path; the file is closed, and thereby removed,
            when the ``with`` block exits, including when the block raises
        """
        if all_data:
            file_data = self.ConsumeRemainingBytes() if as_bytes else self.ConsumeRemainingString()
        else:
            file_data = self.ConsumeRandomBytes() if as_bytes else self.ConsumeRandomString()

        mode = 'w+b' if as_bytes else 'w+'
        # Text is written as UTF-8 so the bytes on disk do not depend on the
        # locale; binary mode must not be given an encoding.
        encoding = None if as_bytes else 'utf-8'
        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix, encoding=encoding) as tfile:
            tfile.write(file_data)
            tfile.seek(0)
            # Flush so the data is on disk before the path is handed out
            tfile.flush()
            yield tfile.name

44 changes: 44 additions & 0 deletions fuzzing/page_extraction_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
import atheris
import sys

from fuzz_helpers import EnhancedFuzzedDataProvider

with atheris.instrument_imports():
from pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
from pdfminer.high_level import extract_pages
from pdfminer.psparser import PSException


def TestOneInput(data: bytes):
    """
    Fuzz ``extract_pages`` with a single fuzzer-generated input.

    :param data: Raw bytes supplied by the fuzzing engine
    :return: -1 when the input fails the PDF pre-check or when the raised error
        is one the harness tolerates; None after a run that raised nothing.
        Any other exception propagates to the caller.
    """
    if not PDFValidator.is_valid_byte_stream(data):
        # Not worth continuing with this test case
        return -1

    provider = EnhancedFuzzedDataProvider(data)

    try:
        with provider.ConsumeMemoryFile() as pdf_file:
            # Values are drawn from the provider in the same order as the
            # arguments they feed, so the byte layout of an input stays stable.
            max_pages = provider.ConsumeIntInRange(0, 1000)
            page_count = provider.ConsumeIntInRange(0, max_pages)
            page_numbers = provider.ConsumeIntList(page_count, 2)
            layout = PDFValidator.generate_layout_parameters(provider)
            pages = extract_pages(
                pdf_file,
                maxpages=max_pages,
                page_numbers=page_numbers,
                laparams=layout,
            )
            # Materialise the result so every page is actually processed
            list(pages)
    except (AssertionError, PSException):
        return -1
    except Exception as error:
        if not PDFValidator.should_ignore_error(error):
            raise error
        return -1


def main():
    """Quiet the pdfminer logger, register ``TestOneInput`` with atheris and start fuzzing."""
    prepare_pdfminer_fuzzing()
    atheris.Setup(sys.argv, TestOneInput)
    atheris.Fuzz()


# Only start fuzzing when this file is executed as a script.
if __name__ == "__main__":
    main()
69 changes: 69 additions & 0 deletions fuzzing/pdf_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Utilities shared across the various PDF fuzzing harnesses
"""
import logging
from typing import Optional

import atheris

from pdfminer.layout import LAParams
from pdfminer.psparser import PSException

# Message substrings of exceptions that pdfminer raises explicitly but that do
# not inherit from PSException
_EXPLICIT_EXCEPTION_MESSAGES = ["Unsupported", "duplicate labels", "AcroForm", "SASLPrep", "Invalid"]


def prepare_pdfminer_fuzzing():
    """
    Silence the pdfminer module by raising its logger level to CRITICAL
    """
    pdfminer_logger = logging.getLogger('pdfminer')
    pdfminer_logger.setLevel(logging.CRITICAL)


class PDFValidator:
    """
    Static helpers for the PDF fuzzing harnesses: input pre-filtering,
    fuzzer-driven layout parameters and classification of raised errors
    """
    _PDF_MAGIC_BYTES = b'%PDF-'

    @staticmethod
    @atheris.instrument_func
    def is_valid_byte_stream(data: bytes) -> bool:
        """
        Cheap pre-filter deciding whether an input is worth passing to the library
        :param data: The raw byte-stream to inspect
        :return: True only if the data starts with the PDF magic bytes and contains b'/Root'
        """
        has_magic = data.startswith(PDFValidator._PDF_MAGIC_BYTES)
        return has_magic and b'/Root' in data

    @staticmethod
    @atheris.instrument_func
    def generate_layout_parameters(fdp: atheris.FuzzedDataProvider) -> Optional[LAParams]:
        """
        Builds layout parameters from fuzzer data
        :param fdp: The data provider the values are drawn from
        :return: An LAParams instance, or None when the first consumed boolean is false
        """
        if not fdp.ConsumeBool():
            return None

        # The values are drawn in the order the LAParams arguments are listed
        line_overlap = fdp.ConsumeFloat()
        char_margin = fdp.ConsumeFloat()
        line_margin = fdp.ConsumeFloat()
        word_margin = fdp.ConsumeFloat()
        boxes_flow = None
        if fdp.ConsumeBool():
            boxes_flow = fdp.ConsumeFloatInRange(-1.0, 1.0)
        detect_vertical = fdp.ConsumeBool()
        all_texts = fdp.ConsumeBool()

        return LAParams(
            line_overlap=line_overlap,
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
            boxes_flow=boxes_flow,
            detect_vertical=detect_vertical,
            all_texts=all_texts,
        )

    @staticmethod
    def should_ignore_error(e: Exception) -> bool:
        """
        Decides whether a raised exception counts as explicitly raised by pdfminer
        :param e: The exception to check
        :return: True if it is a PSException or its message contains one of the
            known substrings, meaning it should be ignored rather than re-thrown
        """
        if isinstance(e, PSException):
            return True
        message = str(e)
        return any(fragment in message for fragment in _EXPLICIT_EXCEPTION_MESSAGES)
2 changes: 1 addition & 1 deletion pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -977,7 +977,7 @@ def find_xref(self, parser: PDFParser) -> int:
else:
raise PDFNoValidXRef("Unexpected EOF")
log.debug("xref found: pos=%r", prev)
assert prev is not None
assert prev is not None and prev.isdigit()
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved
return int(prev)

# read xref table
Expand Down
4 changes: 2 additions & 2 deletions pdfminer/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
assert self.doc is not None
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
except PSSyntaxError:
except (TypeError, PSSyntaxError):
pass
elif token is self.KEYWORD_STREAM:
# stream object
Expand Down Expand Up @@ -163,7 +163,7 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
obj = PDFObjRef(self.doc, objid, genno)
self.push((pos, obj))
except PSSyntaxError:
except (TypeError, PSSyntaxError):
pass
return
elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
Expand Down
18 changes: 13 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
import sys

from pathlib import Path
from setuptools import setup

# Load the README; it is passed to setup() as the long description.
root_dir = Path(__file__).parent
# An explicit encoding keeps the read independent of the platform's locale.
with open(root_dir / "README.md", "rt", encoding="utf-8") as f:
    readme = f.read()

# Optional dependency groups passed to setup(). The atheris requirement uses an
# environment marker so that the installer evaluates it for the target
# interpreter, instead of it being decided by whichever interpreter happens to
# run setup.py.
extras_require = {
    "dev": [
        "pytest",
        "nox",
        "black",
        "mypy == 0.931",
        # There is currently no atheris support for Python 3.12
        'atheris; python_version < "3.12"',
    ],
    "docs": ["sphinx", "sphinx-argparse"],
    "image": ["Pillow"],
}

setup(
name="pdfminer.six",
setuptools_git_versioning={
Expand All @@ -19,11 +31,7 @@
'typing_extensions; python_version < "3.8"',
'importlib_metadata; python_version < "3.8"',
],
extras_require={
"dev": ["pytest", "nox", "black", "mypy == 0.931"],
"docs": ["sphinx", "sphinx-argparse"],
"image": ["Pillow"],
},
extras_require=extras_require,
description="PDF parser and analyzer",
long_description=readme,
long_description_content_type="text/markdown",
Expand Down
Loading