docs: add quick_start example

icecraft · Dec 10, 2024 · 43a571c · 43a571c
1 parent 959f986
commit 43a571c
Show file tree

Hide file tree

Showing 12 changed files with 234 additions and 18 deletions.
diff --git a/magic_pdf/data/read_api.py b/magic_pdf/data/read_api.py
@@ -104,7 +104,7 @@ def read_local_office(path: str) -> list[PymuDocDataset]:
     shutil.rmtree(temp_dir)
     return ret
 
-def read_local_images(path: str, suffixes: list[str]=[]) -> list[ImageDataset]:
+def read_local_images(path: str, suffixes: list[str]=['png', 'jpg']) -> list[ImageDataset]:
     """Read images from path or directory.
 
     Args:

diff --git a/next_docs/en/user_guide/install/install.rst b/next_docs/en/user_guide/install/install.rst
@@ -112,7 +112,7 @@ Download model weight files
 Install LibreOffice[Optional]
 ----------------------------------
 
-This section is required for handle **doc**, **docx**, **ppt**, **pptx** filetype, You can Skip this section if no need for those filetype processing.
+This section is required for handling **doc**, **docx**, **ppt**, and **pptx** files. You can **skip** this section if you do not need to process those file types.
 
 
 Linux/Macos Platform

diff --git a/next_docs/en/user_guide/quick_start.rst b/next_docs/en/user_guide/quick_start.rst
@@ -8,8 +8,8 @@ Want to learn about the usage methods under different scenarios ? This page give
     :maxdepth: 1
 
     quick_start/convert_pdf 
-    quick_start/convert_images
+    quick_start/convert_image
     quick_start/convert_ppt
-    quick_start/convert_word 
-    quick_start/convert_directory
-
+    quick_start/convert_pptx
+    quick_start/convert_doc
+    quick_start/convert_docx
diff --git a/next_docs/en/user_guide/quick_start/convert_doc.rst b/next_docs/en/user_guide/quick_start/convert_doc.rst
@@ -0,0 +1,43 @@
+
+
+Convert Doc
+=============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_doc.doc"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
+
+
diff --git a/next_docs/en/user_guide/quick_start/convert_docx.rst b/next_docs/en/user_guide/quick_start/convert_docx.rst
@@ -0,0 +1,41 @@
+
+Convert DocX
+=============
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_docx.docx"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
+
diff --git a/next_docs/en/user_guide/quick_start/convert_image.rst b/next_docs/en/user_guide/quick_start/convert_image.rst
@@ -0,0 +1,33 @@
+
+
+Convert Image
+===============
+
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_images
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_image.jpg"       # replace with real image file
+
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_images(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
diff --git a/next_docs/en/user_guide/quick_start/convert_images.rst b/next_docs/en/user_guide/quick_start/convert_images.rst
diff --git a/next_docs/en/user_guide/quick_start/convert_pdf.rst b/next_docs/en/user_guide/quick_start/convert_pdf.rst
@@ -3,3 +3,36 @@
 Convert PDF 
 ============
 
+.. code:: python
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.data.dataset import PymuDocDataset
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+
+    # args
+    pdf_file_name = "abc.pdf"  # replace with the real pdf path
+    name_without_suff = pdf_file_name.split(".")[0]
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # read bytes
+    reader1 = FileBasedDataReader("")
+    pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content
+
+    # proc
+    ## Create Dataset Instance
+    ds = PymuDocDataset(pdf_bytes)
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(md_writer, f"{name_without_suff}.md", image_dir)
+
+
diff --git a/next_docs/en/user_guide/quick_start/convert_ppt.rst b/next_docs/en/user_guide/quick_start/convert_ppt.rst
@@ -3,3 +3,39 @@
 Convert PPT 
 ============
 
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_ppt.ppt"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
diff --git a/next_docs/en/user_guide/quick_start/convert_pptx.rst b/next_docs/en/user_guide/quick_start/convert_pptx.rst
@@ -0,0 +1,42 @@
+
+
+Convert PPTX
+=================
+
+.. admonition:: Warning
+    :class: tip
+
+    When processing MS-Office files, we first use third-party software to convert the MS-Office files to PDF.
+
+    For certain MS-Office files, the quality of the converted PDF files may not be very high, which can affect the quality of the final output.
+
+
+
+.. code:: python 
+
+    import os
+
+    from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+    from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+    from magic_pdf.data.read_api import read_local_office
+
+    # prepare env
+    local_image_dir, local_md_dir = "output/images", "output"
+    image_dir = str(os.path.basename(local_image_dir))
+
+    os.makedirs(local_image_dir, exist_ok=True)
+
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
+        local_md_dir
+    )
+
+    # proc
+    ## Create Dataset Instance
+    input_file = "some_pptx.pptx"     # replace with real ms-office file
+    
+    input_file_name = input_file.split(".")[0]
+    ds = read_local_office(input_file)[0]
+
+    ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+        md_writer, f"{input_file_name}.md", image_dir
+    )
diff --git a/next_docs/en/user_guide/quick_start/convert_word.rst b/next_docs/en/user_guide/quick_start/convert_word.rst
diff --git a/next_docs/en/user_guide/tutorial/pipeline.rst b/next_docs/en/user_guide/tutorial/pipeline.rst
@@ -28,7 +28,6 @@ Minimal Example
     image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
         local_md_dir
     )
-    image_dir = str(os.path.basename(local_image_dir))
 
     # read bytes
     reader1 = FileBasedDataReader("")