From 53f8a727e68d64618edd722e6218a6075e16f8af Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Thu, 12 Dec 2024 05:02:44 -0400 Subject: [PATCH] Some provisional definitions To cope with upcoming features of MuPDF we make a few provisional definitions for flags that will be introduced then. This will avoid complex version checks in our code. Also adjust a table markdown output test: We do no longer replace line breaks by simple spaces, but by HTML line breaks `<br>`. --- src/__init__.py | 38 ++++++++++++++++++++++++-------------- src/extra.i | 6 +++--- tests/test_tables.py | 6 +++--- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index f00296cb6..a39732c14 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -8314,7 +8314,7 @@ def _show_pdf_page(self, fz_srcpage, overlay=1, matrix=None, xref=0, oc=0, clip= #------------------------------------------------------------- resources = mupdf.pdf_dict_get_inheritable(tpageref, PDF_NAME('Resources')) if not resources.m_internal: - resources = mupdf.pdf_dict_put_dict(tpageref,PDF_NAME('Resources'),5) + resources = mupdf.pdf_dict_put_dict(tpageref,PDF_NAME('Resources'),5) subres = mupdf.pdf_dict_get(resources, PDF_NAME('XObject')) if not subres.m_internal: subres = mupdf.pdf_dict_put_dict(resources, PDF_NAME('XObject'), 5) @@ -9199,7 +9199,7 @@ def remove_rotation(self): pass for xref, rect in widgets: # modify field rectangles - widget = page.load_widget(xref) + widget = self.load_widget(xref) widget.rect = r widget.update() return rot # the inverse of the generated derotation matrix @@ -13487,6 +13487,13 @@ def width(self): TEXT_OUTPUT_XML = 3 TEXT_OUTPUT_XHTML = 4 +TEXT_STRIKEOUT = 1 +TEXT_UNDERLINE = 2 +TEXT_SYNTHETIC = 4 +TEXT_BOLD = 8 +TEXT_FILLED = 16 +TEXT_STROKED = 32 + TEXT_PRESERVE_LIGATURES = mupdf.FZ_STEXT_PRESERVE_LIGATURES TEXT_PRESERVE_WHITESPACE = mupdf.FZ_STEXT_PRESERVE_WHITESPACE TEXT_PRESERVE_IMAGES = mupdf.FZ_STEXT_PRESERVE_IMAGES @@ -13507,6 +13514,7 @@ def width(self): TEXT_COLLECT_VECTORS = 1024 TEXT_IGNORE_ACTUALTEXT = 2048 TEXT_STEXT_SEGMENT = 4096 +TEXT_COLLECT_FLAGS = 32768 # mupdf.FZ_STEXT_COLLECT_FLAGS TEXTFLAGS_WORDS = (0 | TEXT_PRESERVE_LIGATURES @@ -16509,6 +16517,7 @@ def __str__(self): font_flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch) origin = mupdf.FzPoint(ch.m_internal.origin) style.size = ch.m_internal.size + style.font_flags = font_flags 
style.flags = ch.m_internal.flags style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font))) if THIS_MUPDF >= MUPDF1250: @@ -16519,7 +16528,8 @@ def __str__(self): style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font))) style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font))) - if (0 + if ( + 0 or style.size != old_style.size or style.bidi != old_style.bidi or style.font_flags != old_style.font_flags @@ -16528,7 +16538,7 @@ def __str__(self): or style.color != old_style.color or style.opacity != old_style.opacity or style.font != old_style.font - ): + ): if old_style.size > 0: # not first one, output previous if raw: @@ -16562,24 +16572,24 @@ def __str__(self): span["descender"] = desc span["opacity"] = style.opacity # add more keys depending on MuPDF version - if THIS_MUPDF >= MUPDF1250: #separate if because not flags-dependent + if THIS_MUPDF >= MUPDF1250: # separate if because not flags-dependent span["opacity"] = style.opacity # rest of keys only make sense for FZ_STEXT_COLLECT_FLAGS - if dev_flags & mupdf.FZ_STEXT_COLLECT_FLAGS: - span["underline"] = bool(style.flags & mupdf.FZ_STEXT_UNDERLINE) - span["strikeout"] = bool(style.flags & mupdf.FZ_STEXT_STRIKEOUT) + if dev_flags & TEXT_COLLECT_FLAGS: + span["underline"] = bool(style.flags & TEXT_UNDERLINE) + span["strikeout"] = bool(style.flags & TEXT_STRIKEOUT) else: span["underline"] = None span["strikeout"] = None if THIS_MUPDF > MUPDF1251: - if dev_flags & mupdf.FZ_STEXT_COLLECT_FLAGS: - span["bold"] = bool(style.flags & mupdf.FZ_STEXT_BOLD) + if dev_flags & TEXT_COLLECT_FLAGS: + span["bold"] = bool(style.flags & TEXT_BOLD) else: span["bold"] = None - span["filled"] = bool(style.flags & mupdf.FZ_STEXT_FILLED) - span["stroked"] = bool(style.flags & mupdf.FZ_STEXT_STROKED) - span["clipped"] = bool(style.flags & mupdf.FZ_STEXT_CLIPPED) + span["filled"] = bool(style.flags & TEXT_FILLED) + span["stroked"] = bool(style.flags & 
TEXT_STROKED) + span["clipped"] = bool(style.flags & TEXT_CLIPPED) # Need to be careful here - doing 'old_style=style' does a shallow # copy, but we need to keep old_style as a distinct instance. @@ -16594,7 +16604,7 @@ def __str__(self): char_dict[dictkey_origin] = JM_py_from_point( ch.m_internal.origin) char_dict[dictkey_bbox] = JM_py_from_rect(r) if THIS_MUPDF >= MUPDF1250: - char_dict["synthetic"] = bool(ch.m_internal.flags & mupdf.FZ_STEXT_SYNTHETIC) + char_dict["synthetic"] = bool(ch.m_internal.flags & TEXT_SYNTHETIC) char_dict[dictkey_c] = chr(ch.m_internal.c) if char_list is None: diff --git a/src/extra.i b/src/extra.i index e040e5dad..ce214b2bb 100644 --- a/src/extra.i +++ b/src/extra.i @@ -3125,7 +3125,7 @@ mupdf::FzRect JM_make_spanlist( DICT_SETITEMSTR_DROP(span, "opacity", Py_BuildValue("f", style.opacity)); #endif - // rest of keys only make sense if FZ_STEXT_COLLECT_FLAGS was set + // rest of keys only make sense if FZ_STEXT_COLLECT_FLAGS (32768) was set #if (THIS_MUPDF >= MUPDF1250) if (dev_flags & 32768) { @@ -3139,9 +3139,9 @@ mupdf::FzRect JM_make_spanlist( } #endif #if (THIS_MUPDF > MUPDF1251) - if (dev_flags & FZ_STEXT_COLLECT_FLAGS) + if (dev_flags & 32768) // FZ_STEXT_COLLECT_FLAGS = 32768 { - DICT_SETITEMSTR_DROP(span, "bold", JM_BOOL(style.flags & FZ_STEXT_BOLD)); + DICT_SETITEMSTR_DROP(span, "bold", JM_BOOL(style.flags & 8)); // FZ_STEXT_BOLD = 8 } else { diff --git a/tests/test_tables.py b/tests/test_tables.py index ca8aef74c..4c2abcbd7 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -288,10 +288,10 @@ def test_markdown(): text = ( "|Header1|Header2|Header3|\n" "|---|---|---|\n" - "|Col11 Col12|Col21 Col22|Col31 Col32 Col33|\n" - "|Col13|Col23|Col34 Col35|\n" + "|Col11 Col12|Col21
Col22|Col31
Col32
Col33|\n" + "|Col13|Col23|Col34
Col35|\n" "|Col14|Col24|Col36|\n" - "|Col15|Col25 Col26||\n\n" + "|Col15|Col25
Col26||\n\n" ) assert tab.to_markdown() == text