Skip to content

Commit

Permalink
fix: preserve hyperlink references in Word document merge
Browse files Browse the repository at this point in the history
  • Loading branch information
Oreoxmt committed Dec 21, 2024
1 parent fc4a147 commit 1bb6eb2
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 12 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ Use the `tidocs merge` command to access a web interface for combining multiple

## Changelog

### [1.0.6] - 2024-12-21

- Fix the issue that hyperlinks become broken after merging Word documents due to incorrect relationship reference handling. ([#2](https://github.com/Oreoxmt/tidocs/issues/2))

### [1.0.5] - 2024-12-03

- Fix compatibility issues with Python 3.9.
Expand Down
2 changes: 1 addition & 1 deletion src/tidocs/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def launch_marimo_app(appname: str, host: str, port: int) -> None:


@click.command(no_args_is_help=True)
@click.version_option(version="1.0.5")
@click.version_option(version="1.0.6")
@click.argument("appname", type=click.Choice(list(APPS.keys())), required=True)
@click.option(
"--host",
Expand Down
45 changes: 34 additions & 11 deletions src/tidocs/docx_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io

from docx import Document
from docx.oxml import parse_xml
from docx.oxml.shared import qn


def merge_word_docs_with_tables(
Expand All @@ -9,7 +10,7 @@ def merge_word_docs_with_tables(
marker_text: str = "TIDOCS_REPLACE_TABLE",
) -> bytes:
"""
Merges tables from one Word document into another at specified marker locations.
Merges tables from one Word document into another at specified marker locations, preserving hyperlinks and other document relationships.
Args:
main_doc_data (bytes): The main document binary data
Expand All @@ -23,20 +24,43 @@ def merge_word_docs_with_tables(
main_doc = Document(io.BytesIO(main_doc_data))
table_doc = Document(io.BytesIO(table_doc_data))

# Create a mapping of relationship IDs between documents
rel_map = {}

# Copy hyperlink relationships from table_doc to main_doc
for rel_id, rel in table_doc.part.rels.items():
if (
rel.reltype
== "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
):
new_rel_id = main_doc.part.relate_to(
rel._target, rel.reltype, rel.is_external
)
rel_map[rel_id] = new_rel_id

# Find all tables in the table document
tables_to_insert = {}
current_heading = None

# Associate tables with their preceding headings
for element in table_doc.element.body:
if element.tag.endswith("p"): # It's a paragraph
if element.tag.endswith("p"):
paragraph_text = element.text.strip()
if paragraph_text:
# print(paragraph_text)
current_heading = paragraph_text
elif element.tag.endswith("tbl"): # It's a table
elif element.tag.endswith("tbl"):
if current_heading:
tables_to_insert[current_heading] = element
# Deep copy the table element
table_copy = parse_xml(element.xml)

# Update relationship IDs in the copied table
# Find all hyperlinks using the proper namespace approach
for hyperlink in table_copy.xpath(".//w:hyperlink"):
old_rid = hyperlink.get(qn("r:id"))
if old_rid in rel_map:
hyperlink.set(qn("r:id"), rel_map[old_rid])

tables_to_insert[current_heading] = table_copy

# Process the main document
for paragraph in main_doc.paragraphs:
Expand All @@ -53,17 +77,16 @@ def merge_word_docs_with_tables(
return output.getvalue()


# Usage with your existing code
def merge_documents(doc_data: bytes, table_data: bytes) -> bytes:
"""
Wrapper function to merge your documents using the existing download objects
Merge two Word documents, inserting table_data into doc_data.
Args:
doc_data (bytes): Main document data from first Pandoc conversion
table_data (bytes): Table document data from second Pandoc conversion
doc_data: Main document binary data
table_data: Table document binary data
Returns:
bytes: Merged document data
Merged document binary data
"""
try:
merged_data = merge_word_docs_with_tables(doc_data, table_data)
Expand Down

0 comments on commit 1bb6eb2

Please sign in to comment.