Text splitter docs and colab update #278

Merged: 7 commits, Nov 25, 2024
114 changes: 114 additions & 0 deletions adalflow/tutorials/adalflow_text_splitter.py
@@ -0,0 +1,114 @@
from adalflow.components.data_process.text_splitter import TextSplitter
from adalflow.core.types import Document
from typing import Optional, Dict


def split_by_words(
text: str, chunk_size: int = 5, chunk_overlap: int = 1, doc_id: Optional[str] = None
) -> list:
"""Split text by words with configurable parameters

Args:
text: Input text to split
chunk_size: Maximum number of words per chunk
chunk_overlap: Number of overlapping words between chunks
doc_id: Optional document ID

Returns:
List of Document objects containing the split text chunks
"""
text_splitter = TextSplitter(
split_by="word", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

doc = Document(text=text, id=doc_id or "doc1")

return text_splitter.call(documents=[doc])


def split_by_tokens(
text: str, chunk_size: int = 5, chunk_overlap: int = 0, doc_id: Optional[str] = None
) -> list:
"""Split text by tokens with configurable parameters

Args:
text: Input text to split
chunk_size: Maximum number of tokens per chunk
chunk_overlap: Number of overlapping tokens between chunks
doc_id: Optional document ID

Returns:
List of Document objects containing the split text chunks
"""
text_splitter = TextSplitter(
split_by="token", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

doc = Document(text=text, id=doc_id or "doc1")

return text_splitter.call(documents=[doc])


def split_by_custom(
text: str,
split_by: str,
separators: Dict[str, str],
chunk_size: int = 1,
chunk_overlap: int = 0,
doc_id: Optional[str] = None,
) -> list:
"""Split text using custom separator with configurable parameters

Args:
text: Input text to split
split_by: Custom split type that matches separator dict key
separators: Dictionary mapping split types to separator strings
chunk_size: Maximum chunk size
chunk_overlap: Overlap size between chunks
doc_id: Optional document ID

Returns:
List of Document objects containing the split text chunks
"""
text_splitter = TextSplitter(
split_by=split_by,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
)

doc = Document(text=text, id=doc_id or "doc1")

return text_splitter.call(documents=[doc])


def example_usage():
"""Example showing how to use the text splitting functions"""
# Word splitting example
text = "Example text. More example text. Even more text to illustrate."
word_splits = split_by_words(text, chunk_size=5, chunk_overlap=1)
print("\nWord Split Example:")
for doc in word_splits:
print(doc)

# Token splitting example
token_splits = split_by_tokens(text, chunk_size=5, chunk_overlap=0)
print("\nToken Split Example:")
for doc in token_splits:
print(doc)

# Custom separator example
question_text = "What is your name? How old are you? Where do you live?"
custom_splits = split_by_custom(
text=question_text,
split_by="question",
separators={"question": "?"},
chunk_size=1,
)
print("\nCustom Separator Example:")
for doc in custom_splits:
print(doc)


if __name__ == "__main__":
example_usage()
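
The custom-separator helper above is not limited to question marks: any key/separator pair supplied in the separators dict can drive the split. A minimal sketch of sentence-level splitting built on the split_by_custom function from the tutorial file follows; the "sentence" key and the "." separator are illustrative choices, not part of the PR.

# Illustrative sketch only: reuses split_by_custom from the tutorial script above.
# The "sentence" key and "." separator are assumptions chosen for this example.
sentence_splits = split_by_custom(
    text="First sentence. Second sentence. Third sentence.",
    split_by="sentence",           # must match a key in the separators dict
    separators={"sentence": "."},  # "." acts as the sentence delimiter
    chunk_size=1,                  # one split unit (sentence) per chunk
    chunk_overlap=0,
)
for doc in sentence_splits:
    print(doc.text)

As in the question-mark example in example_usage, chunk_size counts split units rather than characters, so chunk_size=1 yields one sentence per returned Document.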
2 changes: 1 addition & 1 deletion docs/source/tutorials/base_data_class.rst
@@ -7,7 +7,7 @@
<a href="https://colab.research.google.com/github/SylphAI-Inc/AdalFlow/blob/main/notebooks/tutorials/adalflow_dataclasses.ipynb" target="_blank" style="margin-right: 10px;">
<img alt="Try Quickstart in Colab" src="https://colab.research.google.com/assets/colab-badge.svg" style="vertical-align: middle;">
</a>

</div>

DataClass
12 changes: 12 additions & 0 deletions docs/source/tutorials/text_splitter.rst
@@ -1,3 +1,15 @@
.. raw:: html

<div style="display: flex; justify-content: flex-start; align-items: center; margin-bottom: 20px;">
<a href="https://colab.research.google.com/github/SylphAI-Inc/LightRAG/blob/main/notebooks/tutorials/adalflow_text_splitter.ipynb" target="_blank" style="margin-right: 10px;">
<img alt="Try Quickstart in Colab" src="https://colab.research.google.com/assets/colab-badge.svg" style="vertical-align: middle;">
</a>
<a href="https://github.com/SylphAI-Inc/LightRAG/blob/main/adalflow/tutorials/adalflow_text_splitter.py" target="_blank" style="display: flex; align-items: center;">
<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" style="height: 20px; width: 20px; margin-right: 5px;">
<span style="vertical-align: middle;"> Open Source Code</span>
</a>
</div>

.. _tutorials-text_splitter:


170 changes: 170 additions & 0 deletions notebooks/tutorials/adalflow_text_splitter.ipynb
@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "A99Pp0T7A9BM"
},
"outputs": [],
"source": [
"!pip install adalflow[openai,groq,faiss-cpu]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "y2SVUBNeBMy5"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"from getpass import getpass\n",
"\n",
"# You can use a setup_env file to set the OPENAI_API_KEY too\n",
"# (ensure you setup OPENAI_API_KEY in your project .env file) using the following commands:\n",
"# from adalflow.utils import setup_env\n",
"\n",
"# Prompt user to enter their API keys securely\n",
"openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
"\n",
"# Set environment variables\n",
"os.environ['OPENAI_API_KEY'] = openai_api_key\n",
"\n",
"print(\"API keys have been set.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RWWG9WRt2r9L",
"outputId": "faad52a8-47f5-48bc-e2c3-17a5aea21254"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 788.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document(id=6374a3e5-2ef9-40ba-a7b3-e18c2b466390, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
"Document(id=b46045ba-3ebb-4e66-93d5-ece2d6ace3de, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
"Document(id=eba5555b-e6d6-4ca1-8452-af22295e68f8, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from adalflow.components.data_process.text_splitter import TextSplitter\n",
"from adalflow.core.types import Document\n",
"\n",
"# Configure the splitter settings\n",
"text_splitter = TextSplitter(\n",
" split_by=\"word\",\n",
" chunk_size=5,\n",
" chunk_overlap=1\n",
")\n",
"\n",
"# Example document\n",
"doc = Document(\n",
" text=\"Example text. More example text. Even more text to illustrate.\",\n",
" id=\"doc1\"\n",
")\n",
"\n",
"# Execute the splitting\n",
"splitted_docs = text_splitter.call(documents=[doc])\n",
"\n",
"for doc in splitted_docs:\n",
" print(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LioyB3eCAOs8",
"outputId": "11cddc1c-608a-4027-830f-fe30a882a766"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 489.02it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document(id=b0c308f2-73d2-44cf-aaf2-63e8f87198e4, text='Example text. More example', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
"Document(id=3a37adff-c8ac-4cff-8b5e-9c68e0de9772, text=' text. Even more text', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
"Document(id=e1b56768-7918-4a94-8f08-a01161cb2dcf, text=' to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from adalflow.components.data_process.text_splitter import TextSplitter\n",
"from adalflow.core.types import Document\n",
"\n",
"# Configure the splitter settings\n",
"text_splitter = TextSplitter(\n",
" split_by=\"token\",\n",
" chunk_size=5,\n",
" chunk_overlap=0\n",
")\n",
"\n",
"doc = Document(\n",
" text=\"Example text. More example text. Even more text to illustrate.\",\n",
" id = \"doc1\"\n",
" )\n",
"\n",
"splitted_docs = (text_splitter.call(documents=[doc]))\n",
"\n",
"for doc in splitted_docs:\n",
" print(doc)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
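
As the API-key cell notes, the getpass prompt can be swapped for AdalFlow's setup_env helper, which loads OPENAI_API_KEY from a project .env file. A minimal sketch under that assumption (the .env file and its contents are yours to provide; only the setup_env import named in the cell comment comes from the notebook):

# Alternative to the getpass cell above. Assumes a .env file in the working
# directory containing a line such as: OPENAI_API_KEY=sk-...
import os

from adalflow.utils import setup_env

setup_env()  # reads the .env file and populates os.environ
print("OPENAI_API_KEY set:", "OPENAI_API_KEY" in os.environ)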