diff --git a/magic_pdf/data/read_api.py b/magic_pdf/data/read_api.py index f66ed7ef..76e83b5e 100644 --- a/magic_pdf/data/read_api.py +++ b/magic_pdf/data/read_api.py @@ -104,7 +104,7 @@ def read_local_office(path: str) -> list[PymuDocDataset]: shutil.rmtree(temp_dir) return ret -def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]: +def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]: """Read images from path or directory. Args: diff --git a/next_docs/en/user_guide/install/install.rst b/next_docs/en/user_guide/install/install.rst index f52a6bc6..3c48a56b 100644 --- a/next_docs/en/user_guide/install/install.rst +++ b/next_docs/en/user_guide/install/install.rst @@ -112,7 +112,7 @@ Download model weight files Install LibreOffice[Optional] ---------------------------------- -This section is required for handle **doc**, **docx**, **ppt**, **pptx** filetype, You can Skip this section if no need for those filetype processing. +This section is required for handling **doc**, **docx**, **ppt**, and **pptx** files. You can **skip** this section if you do not need to process those file types. Linux/Macos Platform diff --git a/next_docs/en/user_guide/quick_start.rst b/next_docs/en/user_guide/quick_start.rst index 1ccd97e6..11631803 100644 --- a/next_docs/en/user_guide/quick_start.rst +++ b/next_docs/en/user_guide/quick_start.rst @@ -8,8 +8,8 @@ Want to learn about the usage methods under different scenarios ? 
This page give :maxdepth: 1 quick_start/convert_pdf - quick_start/convert_images + quick_start/convert_image quick_start/convert_ppt - quick_start/convert_word - quick_start/convert_directory - + quick_start/convert_pptx + quick_start/convert_doc + quick_start/convert_docx diff --git a/next_docs/en/user_guide/quick_start/convert_doc.rst b/next_docs/en/user_guide/quick_start/convert_doc.rst new file mode 100644 index 00000000..b2980408 --- /dev/null +++ b/next_docs/en/user_guide/quick_start/convert_doc.rst @@ -0,0 +1,43 @@ + + +Convert Word +============= + +.. admonition:: Warning + :class: tip + + When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. + + For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. + +.. code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + from magic_pdf.data.read_api import read_local_office + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + + # proc + ## Create Dataset Instance + input_file = "some_doc.doc" # replace with real ms-office file + + input_file_name = input_file.split(".")[0] + ds = read_local_office(input_file)[0] + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( + md_writer, f"{input_file_name}.md", image_dir + ) + + + diff --git a/next_docs/en/user_guide/quick_start/convert_docx.rst b/next_docs/en/user_guide/quick_start/convert_docx.rst new file mode 100644 index 00000000..239928a7 --- /dev/null +++ b/next_docs/en/user_guide/quick_start/convert_docx.rst @@ -0,0 +1,41 @@ + +Convert DocX 
+============= + +.. admonition:: Warning + :class: tip + + When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. + + For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. + + +.. code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + from magic_pdf.data.read_api import read_local_office + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + + # proc + ## Create Dataset Instance + input_file = "some_docx.docx" # replace with real ms-office file + + input_file_name = input_file.split(".")[0] + ds = read_local_office(input_file)[0] + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( + md_writer, f"{input_file_name}.md", image_dir + ) + diff --git a/next_docs/en/user_guide/quick_start/convert_image.rst b/next_docs/en/user_guide/quick_start/convert_image.rst new file mode 100644 index 00000000..a23fdacd --- /dev/null +++ b/next_docs/en/user_guide/quick_start/convert_image.rst @@ -0,0 +1,33 @@ + + +Convert Image +=============== + +.. 
code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + from magic_pdf.data.read_api import read_local_images + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + + # proc + ## Create Dataset Instance + input_file = "some_image.jpg" # replace with real image file + + input_file_name = input_file.split(".")[0] + ds = read_local_images(input_file)[0] + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( + md_writer, f"{input_file_name}.md", image_dir + ) diff --git a/next_docs/en/user_guide/quick_start/convert_images.rst b/next_docs/en/user_guide/quick_start/convert_images.rst deleted file mode 100644 index 60edfc28..00000000 --- a/next_docs/en/user_guide/quick_start/convert_images.rst +++ /dev/null @@ -1,5 +0,0 @@ - - -Convert Images -================ - diff --git a/next_docs/en/user_guide/quick_start/convert_pdf.rst b/next_docs/en/user_guide/quick_start/convert_pdf.rst index 18c1ff27..ebacd9c8 100644 --- a/next_docs/en/user_guide/quick_start/convert_pdf.rst +++ b/next_docs/en/user_guide/quick_start/convert_pdf.rst @@ -3,3 +3,36 @@ Convert PDF ============ +.. 
code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.data.dataset import PymuDocDataset + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + + # args + pdf_file_name = "abc.pdf" # replace with the real pdf path + name_without_suff = pdf_file_name.split(".")[0] + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + + # read bytes + reader1 = FileBasedDataReader("") + pdf_bytes = reader1.read(pdf_file_name) # read the pdf content + + # proc + ## Create Dataset Instance + ds = PymuDocDataset(pdf_bytes) + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir) + + diff --git a/next_docs/en/user_guide/quick_start/convert_ppt.rst b/next_docs/en/user_guide/quick_start/convert_ppt.rst index 6a8abae2..f5b80bc0 100644 --- a/next_docs/en/user_guide/quick_start/convert_ppt.rst +++ b/next_docs/en/user_guide/quick_start/convert_ppt.rst @@ -3,3 +3,39 @@ Convert PPT ============ +.. admonition:: Warning + :class: tip + + When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. + + For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. + + +.. 
code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + from magic_pdf.data.read_api import read_local_office + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + + # proc + ## Create Dataset Instance + input_file = "some_ppt.ppt" # replace with real ms-office file + + input_file_name = input_file.split(".")[0] + ds = read_local_office(input_file)[0] + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( + md_writer, f"{input_file_name}.md", image_dir + ) diff --git a/next_docs/en/user_guide/quick_start/convert_pptx.rst b/next_docs/en/user_guide/quick_start/convert_pptx.rst new file mode 100644 index 00000000..07a557b9 --- /dev/null +++ b/next_docs/en/user_guide/quick_start/convert_pptx.rst @@ -0,0 +1,42 @@ + + +Convert PPTX +================= + +.. admonition:: Warning + :class: tip + + When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF. + + For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output. + + + +.. 
code:: python + + import os + + from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader + from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze + from magic_pdf.data.read_api import read_local_office + + # prepare env + local_image_dir, local_md_dir = "output/images", "output" + image_dir = str(os.path.basename(local_image_dir)) + + os.makedirs(local_image_dir, exist_ok=True) + + image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( + local_md_dir + ) + + # proc + ## Create Dataset Instance + input_file = "some_pptx.pptx" # replace with real ms-office file + + input_file_name = input_file.split(".")[0] + ds = read_local_office(input_file)[0] + + ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md( + md_writer, f"{input_file_name}.md", image_dir + ) diff --git a/next_docs/en/user_guide/quick_start/convert_word.rst b/next_docs/en/user_guide/quick_start/convert_word.rst deleted file mode 100644 index 8ab22d05..00000000 --- a/next_docs/en/user_guide/quick_start/convert_word.rst +++ /dev/null @@ -1,6 +0,0 @@ - - -Convert Word -============= - - diff --git a/next_docs/en/user_guide/tutorial/pipeline.rst b/next_docs/en/user_guide/tutorial/pipeline.rst index 8e73a3f5..f7892eaa 100644 --- a/next_docs/en/user_guide/tutorial/pipeline.rst +++ b/next_docs/en/user_guide/tutorial/pipeline.rst @@ -28,7 +28,6 @@ Minimal Example image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( local_md_dir ) - image_dir = str(os.path.basename(local_image_dir)) # read bytes reader1 = FileBasedDataReader("")