Prevent line breaks, deliver reading order.

Refactor plain text and "words" extraction with sort=True: Previously, the output was simply sorted by ascending bottom and left coordinates. This change collects words (and, respectively, text) that are approximately on the same line. Apart from extremely malformed pages, words and text are now returned in "natural" reading sequence. This change also suppresses line breaks that MuPDF generates merely because of large horizontal distances (as often happens, e.g., between table cell contents of the same row).
pymupdf · Sep 23, 2024 · 75a060d · 75a060d
1 parent 22382f4
commit 75a060d
Show file tree

Hide file tree

Showing 4 changed files with 210 additions and 20 deletions.
diff --git a/src/__init__.py b/src/__init__.py
@@ -13319,14 +13319,16 @@ def width(self):
 TEXT_OUTPUT_XML = 3
 TEXT_OUTPUT_XHTML = 4
 
-TEXT_PRESERVE_LIGATURES = 1
-TEXT_PRESERVE_WHITESPACE = 2
-TEXT_PRESERVE_IMAGES = 4
-TEXT_INHIBIT_SPACES = 8
-TEXT_DEHYPHENATE = 16
-TEXT_PRESERVE_SPANS = 32
-TEXT_MEDIABOX_CLIP = 64
-TEXT_CID_FOR_UNKNOWN_UNICODE = 128
+# Text extraction flags: values are taken from the MuPDF bindings where available.
+TEXT_PRESERVE_LIGATURES = mupdf.FZ_STEXT_PRESERVE_LIGATURES
+TEXT_PRESERVE_WHITESPACE = mupdf.FZ_STEXT_PRESERVE_WHITESPACE
+TEXT_PRESERVE_IMAGES = mupdf.FZ_STEXT_PRESERVE_IMAGES
+TEXT_INHIBIT_SPACES = mupdf.FZ_STEXT_INHIBIT_SPACES
+TEXT_DEHYPHENATE = mupdf.FZ_STEXT_DEHYPHENATE
+TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS
+TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP
+TEXT_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE
+TEXT_COLLECT_STRUCTURE = 256  # literal; corresponds to mupdf.FZ_STEXT_COLLECT_STRUCTURE
+TEXT_ACCURATE_BBOXES = 512  # literal; corresponds to mupdf.FZ_STEXT_ACCURATE_BBOXES
 
 TEXTFLAGS_WORDS = (0
         | TEXT_PRESERVE_LIGATURES

diff --git a/src/utils.py b/src/utils.py
@@ -507,34 +507,195 @@ def get_text_words(
     textpage: pymupdf.TextPage = None,
     sort: bool = False,
     delimiters=None,
+    tolerance=3,
 ) -> list:
     """Return the text words as a list with the bbox for each word.
 
     Args:
+        page: pymupdf.Page
+        clip: (rect-like) area on page to consider
         flags: (int) control the amount of data parsed into the textpage.
-        delimiters: (str,list) characters to use as word delimiters
+        textpage: (pymupdf.TextPage) either passed-in or None.
+        sort: (bool) sort the words in reading sequence.
+        delimiters: (str,list) characters to use as word delimiters.
+        tolerance: (float) consider words to be part of the same line if
+            top or bottom coordinate are not larger than this. Relevant
+            only if sort=True.
 
     Returns:
         Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
     """
+
+    def sort_words(words):
+        """Return the words grouped into lines, each line ordered left to right.
+
+        The list is first ordered in place by bottom, then left coordinate.
+        A word joins the current line when its top or its bottom differs
+        from that of the line's joined rectangle by at most 'tolerance'.
+        """
+
+        def by_left(word):
+            return word[0]
+
+        words.sort(key=lambda w: (w[3], w[0]))
+        ordered = []  # finished lines, concatenated
+        current = [words[0]]  # words of the line being collected
+        bbox = pymupdf.Rect(words[0][:4])  # joined rectangle of that line
+        for word in words[1:]:
+            box = pymupdf.Rect(word[:4])
+            same_top = abs(box.y0 - bbox.y0) <= tolerance
+            same_bottom = abs(box.y1 - bbox.y1) <= tolerance
+            if not (same_top or same_bottom):
+                # word opens a new line: emit the finished one first
+                ordered.extend(sorted(current, key=by_left))
+                current = [word]
+                bbox = box
+                continue
+            current.append(word)
+            bbox |= box
+
+        # emit the line still being collected
+        ordered.extend(sorted(current, key=by_left))
+
+        return ordered
+
     pymupdf.CheckParent(page)
     if flags is None:
-        flags = pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_MEDIABOX_CLIP
+        flags = pymupdf.TEXTFLAGS_WORDS
     tp = textpage
     if tp is None:
         tp = page.get_textpage(clip=clip, flags=flags)
     elif getattr(tp, "parent") != page:
         raise ValueError("not a textpage of this page")
 
     words = tp.extractWORDS(delimiters)
+
+    # if textpage was given, we subselect the words in clip
+    if textpage is not None and clip is not None:
+        # sub-select words contained in clip
+        clip = pymupdf.Rect(clip)
+        words = [
+            w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
+        ]
+
     if textpage is None:
         del tp
-    if sort is True:
-        words.sort(key=lambda w: (w[3], w[0]))
+    if words and sort is True:
+        # advanced sort if any words found
+        words = sort_words(words)
 
     return words
 
 
+def get_sorted_text(
+    page: pymupdf.Page,
+    clip: rect_like = None,
+    flags: OptInt = None,
+    textpage: pymupdf.TextPage = None,
+    tolerance=3,
+) -> str:
+    """Extract plain text avoiding unacceptable line breaks.
+
+    Text contained in clip will be sorted in reading sequence. Some effort
+    is also spent to simulate layout vertically and horizontally.
+
+    Args:
+        page: pymupdf.Page
+        clip: (rect-like) only consider text inside
+        flags: (int) text extraction flags
+        textpage: pymupdf.TextPage
+        tolerance: (float) consider words to be on the same line if their top
+            or bottom coordinates do not differ more than this.
+
+    Notes:
+        If a TextPage is provided, all text is checked for being inside clip
+        with at least 50% of its bbox.
+        This allows to use some "global" TextPage in conjunction with sub-
+        selecting words in parts of the defined TextPage rectangle.
+
+    Returns:
+        A text string in reading sequence. Left indentation of each line,
+        inter-line and inter-word distances strive to reflect the layout.
+        An empty string is returned if no words were found.
+    """
+
+    def line_text(clip, line):
+        """Create the string of one text line.
+
+        We are trying to simulate some horizontal layout here, too.
+
+        Args:
+            clip: (pymupdf.Rect) the area from which all text is being read.
+            line: (list) word tuples (rect, text) contained in the line.
+                The list is sorted in place by left coordinate.
+        Returns:
+            Text in this line. Generated from words in 'line'. Distance from
+            predecessor is translated to multiple spaces, thus simulating
+            text indentations and large horizontal distances.
+        """
+        line.sort(key=lambda w: w[0].x0)
+        parts = []  # pieces of the line text
+        x1 = clip.x0  # end coordinate of the text generated so far
+        for r, t in line:
+            # the first word may start flush left, later words need a space
+            min_gap = 0 if x1 == clip.x0 else 1
+            if r.width > 0:
+                # express the gap in units of this word's mean character width
+                dist = max(int(round((r.x0 - x1) / r.width * len(t))), min_gap)
+            else:
+                # degenerate word box: no character width to scale with
+                dist = min_gap
+            parts.append(" " * dist + t)
+            x1 = r.x1  # update new end position
+        return "".join(parts)
+
+    # Extract words in correct sequence first.
+    words = [
+        (pymupdf.Rect(w[:4]), w[4])
+        for w in get_text_words(
+            page,
+            clip=clip,
+            flags=flags,
+            textpage=textpage,
+            sort=True,
+            tolerance=tolerance,
+        )
+    ]
+
+    if not words:  # no text present
+        return ""
+
+    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
+    for wr, _ in words:
+        totalbox |= wr
+
+    lines = []  # list of reconstituted lines as (rect, text)
+    line = [words[0]]  # current line
+    lrect = words[0][0]  # the line's rectangle
+
+    # walk through the words, starting with the second one
+    for wr, text in words[1:]:
+        # if this word matches top or bottom of the line, append it
+        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
+            line.append((wr, text))
+            lrect |= wr
+        else:
+            # output current line and re-initialize
+            lines.append((lrect, line_text(totalbox, line)))
+            line = [(wr, text)]
+            lrect = wr
+
+    # also append unfinished last line
+    lines.append((lrect, line_text(totalbox, line)))
+
+    # sort all lines vertically
+    lines.sort(key=lambda item: item[0].y1)
+
+    chunks = [lines[0][1]]  # text of first line
+    y1 = lines[0][0].y1  # its bottom coordinate
+    for lrect, ltext in lines[1:]:
+        if lrect.height > 0:
+            distance = int(round((lrect.y0 - y1) / lrect.height))
+        else:
+            distance = 0  # degenerate line box: no height to scale with
+        # Overlapping lines give a negative distance; clamp it so that at
+        # least one line break always separates two lines (and at most 6).
+        distance = max(0, min(distance, 5))
+        chunks.append("\n" * (distance + 1) + ltext)
+        y1 = lrect.y1
+
+    # return text in clip
+    return "".join(chunks)
+
+
 def get_textbox(
     page: pymupdf.Page,
     rect: rect_like,
@@ -731,14 +892,15 @@ def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
 
 
 def get_text(
-        page: pymupdf.Page,
-        option: str = "text",
-        clip: rect_like = None,
-        flags: OptInt = None,
-        textpage: pymupdf.TextPage = None,
-        sort: bool = False,
-        delimiters=None,
-        ):
+    page: pymupdf.Page,
+    option: str = "text",
+    clip: rect_like = None,
+    flags: OptInt = None,
+    textpage: pymupdf.TextPage = None,
+    sort: bool = False,
+    delimiters=None,
+    tolerance=3,
+):
     """Extract text from a page or an annotation.
 
     This is a unifying wrapper for various methods of the pymupdf.TextPage class.
@@ -787,6 +949,16 @@ def get_text(
         return get_text_blocks(
             page, clip=clip, flags=flags, textpage=textpage, sort=sort
         )
+
+    if option == "text" and sort is True:
+        return get_sorted_text(
+            page,
+            clip=clip,
+            flags=flags,
+            textpage=textpage,
+            tolerance=tolerance,
+        )
+
     pymupdf.CheckParent(page)
     cb = None
     if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions

diff --git a/tests/resources/test-linebreaks.pdf b/tests/resources/test-linebreaks.pdf
diff --git a/tests/test_linebreaks.py b/tests/test_linebreaks.py
@@ -0,0 +1,16 @@
+import pymupdf
+
+import os.path
+
+
+def test_linebreaks():
+    """Sorted text extraction must yield far fewer lines than unsorted."""
+    pdf = os.path.abspath(f"{__file__}/../../tests/resources/test-linebreaks.pdf")
+    document = pymupdf.open(pdf)
+    first_page = document[0]
+    textpage = first_page.get_textpage(flags=pymupdf.TEXTFLAGS_WORDS)
+    n_words = len(first_page.get_text("words", textpage=textpage))
+    unsorted_lines = first_page.get_text(textpage=textpage).splitlines()
+    sorted_lines = first_page.get_text(sort=True, textpage=textpage).splitlines()
+    # unsorted extraction is expected to put every word on its own line
+    assert n_words == len(unsorted_lines)
+    # sorted extraction must at least halve the number of lines
+    assert len(sorted_lines) < len(unsorted_lines) / 2