Skip to content

Commit

Permalink
Prevent line breaks, deliver reading order.
Browse files Browse the repository at this point in the history
Refactor plain text and "words" extraction with sort=True:
We previously simply sorted the output by ascending bottom and left coordinate.
This change collects words (and, respectively, text) that are approximately on the same line.
Apart from extremely malformed pages, words (and, respectively, text) are now returned in "natural" reading sequence.

This change also suppresses line breaks that MuPDF generates merely because of large horizontal distances (as often happens, e.g., between table cell contents of the same row).
  • Loading branch information
JorjMcKie committed Sep 23, 2024
1 parent 22382f4 commit 75a060d
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 20 deletions.
18 changes: 10 additions & 8 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13319,14 +13319,16 @@ def width(self):
TEXT_OUTPUT_XML = 3
TEXT_OUTPUT_XHTML = 4

TEXT_PRESERVE_LIGATURES = 1
TEXT_PRESERVE_WHITESPACE = 2
TEXT_PRESERVE_IMAGES = 4
TEXT_INHIBIT_SPACES = 8
TEXT_DEHYPHENATE = 16
TEXT_PRESERVE_SPANS = 32
TEXT_MEDIABOX_CLIP = 64
TEXT_CID_FOR_UNKNOWN_UNICODE = 128
# Text extraction flags. Each name takes its value from the MuPDF
# FZ_STEXT_* option of the same name, so every flag must map to its own
# counterpart (TEXT_INHIBIT_SPACES must not alias TEXT_PRESERVE_LIGATURES).
TEXT_PRESERVE_LIGATURES = mupdf.FZ_STEXT_PRESERVE_LIGATURES
TEXT_PRESERVE_WHITESPACE = mupdf.FZ_STEXT_PRESERVE_WHITESPACE
TEXT_PRESERVE_IMAGES = mupdf.FZ_STEXT_PRESERVE_IMAGES
TEXT_INHIBIT_SPACES = mupdf.FZ_STEXT_INHIBIT_SPACES
TEXT_DEHYPHENATE = mupdf.FZ_STEXT_DEHYPHENATE
TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS
TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP
TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE
# The next two are literal values; the trailing comment names the MuPDF
# option each one corresponds to.
# NOTE(review): presumably literals because not every supported MuPDF
# build exposes these names - confirm.
TEXT_COLLECT_STRUCTURE = 256  # mupdf.FZ_STEXT_COLLECT_STRUCTURE
TEXT_ACCURATE_BBOXES = 512  # mupdf.FZ_STEXT_ACCURATE_BBOXES

TEXTFLAGS_WORDS = (0
| TEXT_PRESERVE_LIGATURES
Expand Down
196 changes: 184 additions & 12 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,34 +507,195 @@ def get_text_words(
textpage: pymupdf.TextPage = None,
sort: bool = False,
delimiters=None,
tolerance=3,
) -> list:
"""Return the text words as a list with the bbox for each word.
Args:
page: pymupdf.Page
clip: (rect-like) area on page to consider
flags: (int) control the amount of data parsed into the textpage.
delimiters: (str,list) characters to use as word delimiters
textpage: (pymupdf.TextPage) either passed-in or None.
sort: (bool) sort the words in reading sequence.
delimiters: (str,list) characters to use as word delimiters.
tolerance: (float) consider words to be part of the same line if
top or bottom coordinate are not larger than this. Relevant
only if sort=True.
Returns:
Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
"""

def sort_words(words):
    """Return the words regrouped line by line in reading order.

    The passed-in list is first sorted in place by (bottom, left). Words
    are then walked in that order and collected into the current line as
    long as their top or their bottom coordinate is within 'tolerance' of
    the bounding box of the line built so far. Every completed line is
    ordered left-to-right and appended to the result.

    Args:
        words: non-empty list of word tuples starting with (x0, y0, x1, y1).
    Returns:
        A new list containing the same word tuples in reading sequence.
    """
    words.sort(key=lambda item: (item[3], item[0]))

    ordered = []  # the result: all completed lines, concatenated
    first = words[0]
    current = [first]  # words of the line being collected
    bbox = pymupdf.Rect(first[:4])  # joined rectangle of 'current'

    for word in words[1:]:
        word_box = pymupdf.Rect(word[:4])
        same_top = abs(word_box.y0 - bbox.y0) <= tolerance
        same_bottom = abs(word_box.y1 - bbox.y1) <= tolerance

        if not (same_top or same_bottom):
            # the word opens a new line: flush the current one left-to-right
            ordered.extend(sorted(current, key=lambda item: item[0]))
            current = [word]
            bbox = word_box
            continue

        current.append(word)
        bbox |= word_box

    # flush the last (still open) line
    ordered.extend(sorted(current, key=lambda item: item[0]))
    return ordered

pymupdf.CheckParent(page)
if flags is None:
flags = pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_MEDIABOX_CLIP
flags = pymupdf.TEXTFLAGS_WORDS
tp = textpage
if tp is None:
tp = page.get_textpage(clip=clip, flags=flags)
elif getattr(tp, "parent") != page:
raise ValueError("not a textpage of this page")

words = tp.extractWORDS(delimiters)

# if textpage was given, we subselect the words in clip
if textpage is not None and clip is not None:
# sub-select words contained in clip
clip = pymupdf.Rect(clip)
words = [
w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
]

if textpage is None:
del tp
if sort is True:
words.sort(key=lambda w: (w[3], w[0]))
if words and sort is True:
# advanced sort if any words found
words = sort_words(words)

return words


def get_sorted_text(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    tolerance=3,
) -> str:
    """Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.
    Notes:
        Words are obtained via get_text_words(..., sort=True). If a TextPage
        is provided together with a clip, that function only keeps words
        having at least 50% of their bbox inside clip. This allows using
        some "global" TextPage in conjunction with sub-selecting words in
        parts of the defined TextPage rectangle.
    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
        Consecutive lines are always separated by at least one and at most
        six line breaks. An empty string is returned if there are no words.
    """
    max_blank_lines = 5  # upper limit for simulated vertical gaps

    def line_text(left_edge, line):
        """Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            left_edge: (float) left coordinate of the area covering all text.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        """
        line.sort(key=lambda w: w[0].x0)
        parts = []  # text pieces of the line
        x1 = left_edge  # end coordinate of the text so far
        for i, (r, t) in enumerate(line):
            # Words after the first one are always separated by a space;
            # the first word may start without any indentation.
            min_gap = 0 if i == 0 else 1
            if r.width > 0:
                # Express the distance to the predecessor in units of this
                # word's average character width.
                gap = int(round((r.x0 - x1) / r.width * len(t)))
            else:
                # Degenerate bbox: no character width to measure with.
                gap = 0
            parts.append(" " * max(gap, min_gap) + t)
            x1 = r.x1  # update new end position
        return "".join(parts)

    # Extract words in correct sequence first.
    words = [
        (pymupdf.Rect(w[:4]), w[4])
        for w in get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=True,
            tolerance=tolerance,
        )
    ]

    if not words:  # no text present
        return ""

    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
    for wr, _ in words:
        totalbox |= wr

    lines = []  # list of reconstituted lines: (rect, text)
    line = [words[0]]  # current line
    lrect = words[0][0]  # the line's rectangle

    # walk through the words, starting with the second one
    for wr, text in words[1:]:
        # if this word matches top or bottom of the line, append it
        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
            line.append((wr, text))
            lrect |= wr
        else:
            # output current line and re-initialize
            lines.append((lrect, line_text(totalbox.x0, line)))
            line = [(wr, text)]
            lrect = wr

    # also append unfinished last line
    lines.append((lrect, line_text(totalbox.x0, line)))

    # sort all lines vertically
    lines.sort(key=lambda l: l[0].y1)

    pieces = [lines[0][1]]  # start with text of first line
    y1 = lines[0][0].y1  # its bottom coordinate
    for lrect, ltext in lines[1:]:
        if lrect.height > 0:
            # vertical gap to the previous line, in units of line heights
            distance = int(round((lrect.y0 - y1) / lrect.height))
        else:
            distance = 0
        # Vertically overlapping lines yield a negative distance. Clamp it,
        # so that separate lines never get fused without a line break.
        distance = max(0, min(distance, max_blank_lines))
        pieces.append("\n" * (distance + 1) + ltext)
        y1 = lrect.y1

    # return text in clip
    return "".join(pieces)


def get_textbox(
page: pymupdf.Page,
rect: rect_like,
Expand Down Expand Up @@ -731,14 +892,15 @@ def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:


def get_text(
page: pymupdf.Page,
option: str = "text",
clip: rect_like = None,
flags: OptInt = None,
textpage: pymupdf.TextPage = None,
sort: bool = False,
delimiters=None,
):
page: pymupdf.Page,
option: str = "text",
clip: rect_like = None,
flags: OptInt = None,
textpage: pymupdf.TextPage = None,
sort: bool = False,
delimiters=None,
tolerance=3,
):
"""Extract text from a page or an annotation.
This is a unifying wrapper for various methods of the pymupdf.TextPage class.
Expand Down Expand Up @@ -787,6 +949,16 @@ def get_text(
return get_text_blocks(
page, clip=clip, flags=flags, textpage=textpage, sort=sort
)

if option == "text" and sort is True:
return get_sorted_text(
page,
clip=clip,
flags=flags,
textpage=textpage,
tolerance=tolerance,
)

pymupdf.CheckParent(page)
cb = None
if option in ("html", "xml", "xhtml"): # no clipping for MuPDF functions
Expand Down
Binary file added tests/resources/test-linebreaks.pdf
Binary file not shown.
16 changes: 16 additions & 0 deletions tests/test_linebreaks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pymupdf

import os.path


def test_linebreaks():
    """Check that sorted plain text extraction merges words into lines.

    For the test file, unsorted extraction is expected to yield exactly one
    text line per word, whereas sort=True must produce fewer than half as
    many lines. All three extractions share one TextPage.
    """
    pdf_path = os.path.abspath(
        f"{__file__}/../../tests/resources/test-linebreaks.pdf"
    )
    # keep a reference to the document while its page is in use
    doc = pymupdf.open(pdf_path)
    first_page = doc[0]
    shared_tp = first_page.get_textpage(flags=pymupdf.TEXTFLAGS_WORDS)

    words = first_page.get_text("words", textpage=shared_tp)
    plain_lines = first_page.get_text(textpage=shared_tp).splitlines()
    sorted_lines = first_page.get_text(sort=True, textpage=shared_tp).splitlines()

    # unsorted output: every word sits on its own line
    assert len(words) == len(plain_lines)
    # sorted output: words were joined, so far fewer lines remain
    assert len(sorted_lines) < len(plain_lines) / 2

0 comments on commit 75a060d

Please sign in to comment.