Skip to content

Commit

Permalink
ImportDocuments - Replace pdfminer3k with pypdf
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Sep 21, 2023
1 parent 77f07ba commit 1cb5734
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 27 deletions.
30 changes: 4 additions & 26 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
from Orange.data.util import get_unique_names
from Orange.misc.utils.embedder_utils import get_proxies
from Orange.util import Registry, dummy_callback
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfparser import PDFDocument, PDFParser
from pypdf import PdfReader as PyPDFReader
from requests.exceptions import ConnectionError

from orangecontrib.text.corpus import Corpus
Expand Down Expand Up @@ -130,28 +127,9 @@ class PdfReader(Reader):
ext = [".pdf"]

def read_file(self):
    """Extract the text of the PDF at ``self.path`` into ``self.content``.

    Each page of the document is passed through pypdf's ``extract_text``
    and the per-page strings are joined with a single space.

    NUL characters are stripped from the joined text, matching what the
    earlier pdfminer-based implementation of this method did.

    Errors raised by pypdf while opening or parsing the file are not
    caught here; they propagate to the caller.
    """
    reader = PyPDFReader(self.path)
    page_texts = (page.extract_text() for page in reader.pages)
    # Keep the NUL stripping of the previous implementation so consumers of
    # ``content`` see the same clean-up regardless of the PDF backend.
    self.content = " ".join(page_texts).replace("\x00", "")


class XmlReader(Reader):
Expand Down
32 changes: 32 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
TxtReader,
TextData,
XmlReader,
PdfReader,
)


Expand Down Expand Up @@ -296,5 +297,36 @@ def test_error(self):
os.remove(fp.name)


# Directory with the sample documents used by the tests below:
# "<directory of this test module>/data/documents".
DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents")


class TestPdfReader(unittest.TestCase):
    """Tests for ``PdfReader`` using the sample PDFs under ``DATA_PATH``."""

    def test_file(self):
        # A multi-sentence document: only the beginning of the extracted
        # text is compared.
        minimal_pdf = os.path.join(DATA_PATH, "good", "minimal-document.pdf")
        document = PdfReader(minimal_pdf).read()[0]
        expected_prefix = (
            "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, "
            "sed diam nonumy eirmod"
        )
        self.assertTrue(document.content.startswith(expected_prefix))

        # A one-line document: the full text and the metadata attached to
        # the result are compared.
        path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf")
        document = PdfReader(path).read()[0]
        self.assertEqual("This is a test pdf file", document.content)
        self.assertEqual("sample_pdf", document.name)
        self.assertEqual(path, document.path)
        self.assertListEqual([".pdf"], document.ext)
        # "good" is the name of the directory holding the fixture.
        self.assertEqual("good", document.category)

    def test_error(self):
        # For an unreadable PDF, read() is expected to give None in the
        # first slot and the file name in the second.
        corrupted_pdf = os.path.join(
            DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf"
        )
        outcome = PdfReader(corrupted_pdf).read()
        self.assertIsNone(outcome[0])
        self.assertEqual("sample_pdf_corrupted.pdf", outcome[1])


# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ orange-widget-base >=4.20.0
orange-canvas-core
owlready2
pandas
pdfminer3k>=1.3.1
pypdf
pyqtgraph
pyyaml
requests
Expand Down

0 comments on commit 1cb5734

Please sign in to comment.