Text splitter docs and colab update #278

Merged: 7 commits, Nov 25, 2024
114 changes: 114 additions & 0 deletions adalflow/tutorials/adalflow_text_splitter.py
@@ -0,0 +1,114 @@
from adalflow.components.data_process.text_splitter import TextSplitter
from adalflow.core.types import Document
from typing import Optional, Dict


def split_by_words(
text: str, chunk_size: int = 5, chunk_overlap: int = 1, doc_id: Optional[str] = None
) -> list:
"""Split text by words with configurable parameters

Args:
text: Input text to split
chunk_size: Maximum number of words per chunk
chunk_overlap: Number of overlapping words between chunks
doc_id: Optional document ID

Returns:
List of Document objects containing the split text chunks
"""
text_splitter = TextSplitter(
split_by="word", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

doc = Document(text=text, id=doc_id or "doc1")

return text_splitter.call(documents=[doc])


def split_by_tokens(
text: str, chunk_size: int = 5, chunk_overlap: int = 0, doc_id: Optional[str] = None
) -> list:
"""Split text by tokens with configurable parameters

Args:
text: Input text to split
chunk_size: Maximum number of tokens per chunk
chunk_overlap: Number of overlapping tokens between chunks
doc_id: Optional document ID

Returns:
List of Document objects containing the split text chunks
"""
text_splitter = TextSplitter(
split_by="token", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

doc = Document(text=text, id=doc_id or "doc1")

return text_splitter.call(documents=[doc])


def split_by_custom(
text: str,
split_by: str,
separators: Dict[str, str],
chunk_size: int = 1,
chunk_overlap: int = 0,
doc_id: Optional[str] = None,
) -> list:
"""Split text using custom separator with configurable parameters

Args:
text: Input text to split
split_by: Custom split type that matches separator dict key
separators: Dictionary mapping split types to separator strings
chunk_size: Maximum chunk size
chunk_overlap: Overlap size between chunks
doc_id: Optional document ID

Returns:
List of Document objects containing the split text chunks
"""
text_splitter = TextSplitter(
split_by=split_by,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=separators,
)

doc = Document(text=text, id=doc_id or "doc1")

return text_splitter.call(documents=[doc])


def example_usage():
"""Example showing how to use the text splitting functions"""
# Word splitting example
text = "Example text. More example text. Even more text to illustrate."
word_splits = split_by_words(text, chunk_size=5, chunk_overlap=1)
print("\nWord Split Example:")
for doc in word_splits:
print(doc)

# Token splitting example
token_splits = split_by_tokens(text, chunk_size=5, chunk_overlap=0)
print("\nToken Split Example:")
for doc in token_splits:
print(doc)

# Custom separator example
question_text = "What is your name? How old are you? Where do you live?"
custom_splits = split_by_custom(
text=question_text,
split_by="question",
separators={"question": "?"},
chunk_size=1,
)
print("\nCustom Separator Example:")
for doc in custom_splits:
print(doc)


if __name__ == "__main__":
example_usage()
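
The custom-separator helper above is not limited to question marks: any key/separator pair supplied in the separators dict can drive the split. A minimal sketch of sentence-level splitting built on the split_by_custom function from the tutorial file follows; the "sentence" key and the "." separator are illustrative choices, not part of the PR.

# Illustrative sketch only: reuses split_by_custom from the tutorial script above.
# The "sentence" key and "." separator are assumptions chosen for this example.
sentence_splits = split_by_custom(
    text="First sentence. Second sentence. Third sentence.",
    split_by="sentence",           # must match a key in the separators dict
    separators={"sentence": "."},  # "." acts as the sentence delimiter
    chunk_size=1,                  # one split unit (sentence) per chunk
    chunk_overlap=0,
)
for doc in sentence_splits:
    print(doc.text)

As in the question-mark example in example_usage, chunk_size counts split units rather than characters, so chunk_size=1 yields one sentence per returned Document.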
2 changes: 1 addition & 1 deletion docs/source/tutorials/base_data_class.rst
@@ -7,7 +7,7 @@
<a href="https://colab.research.google.com/github/SylphAI-Inc/AdalFlow/blob/main/notebooks/tutorials/adalflow_dataclasses.ipynb" target="_blank" style="margin-right: 10px;">
<img alt="Try Quickstart in Colab" src="https://colab.research.google.com/assets/colab-badge.svg" style="vertical-align: middle;">
</a>

</div>

DataClass
12 changes: 12 additions & 0 deletions docs/source/tutorials/text_splitter.rst
@@ -1,3 +1,15 @@
.. raw:: html

<div style="display: flex; justify-content: flex-start; align-items: center; margin-bottom: 20px;">
<a href="https://colab.research.google.com/github/SylphAI-Inc/LightRAG/blob/main/notebooks/tutorials/adalflow_text_splitter.ipynb" target="_blank" style="margin-right: 10px;">
<img alt="Try Quickstart in Colab" src="https://colab.research.google.com/assets/colab-badge.svg" style="vertical-align: middle;">
</a>
<a href="https://github.com/SylphAI-Inc/LightRAG/blob/main/adalflow/tutorials/adalflow_text_splitter.py" target="_blank" style="display: flex; align-items: center;">
<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub" style="height: 20px; width: 20px; margin-right: 5px;">
<span style="vertical-align: middle;"> Open Source Code</span>
</a>
</div>

.. _tutorials-text_splitter:


170 changes: 170 additions & 0 deletions notebooks/tutorials/adalflow_text_splitter.ipynb
@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "A99Pp0T7A9BM"
},
"outputs": [],
"source": [
"!pip install adalflow[openai,groq,faiss-cpu]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "y2SVUBNeBMy5"
},
"outputs": [],
"source": [
"import os\n",
"\n",
"from getpass import getpass\n",
"\n",
"# You can use a setup_env file to set the OPENAI_API_KEY too\n",
"# (ensure you setup OPENAI_API_KEY in your project .env file) using the following commands:\n",
"# from adalflow.utils import setup_env\n",
"\n",
"# Prompt user to enter their API keys securely\n",
"openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n",
"\n",
"# Set environment variables\n",
"os.environ['OPENAI_API_KEY'] = openai_api_key\n",
"\n",
"print(\"API keys have been set.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RWWG9WRt2r9L",
"outputId": "faad52a8-47f5-48bc-e2c3-17a5aea21254"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 788.85it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document(id=6374a3e5-2ef9-40ba-a7b3-e18c2b466390, text='Example text. More example text. ', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
"Document(id=b46045ba-3ebb-4e66-93d5-ece2d6ace3de, text='text. Even more text to ', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
"Document(id=eba5555b-e6d6-4ca1-8452-af22295e68f8, text='to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from adalflow.components.data_process.text_splitter import TextSplitter\n",
"from adalflow.core.types import Document\n",
"\n",
"# Configure the splitter settings\n",
"text_splitter = TextSplitter(\n",
" split_by=\"word\",\n",
" chunk_size=5,\n",
" chunk_overlap=1\n",
")\n",
"\n",
"# Example document\n",
"doc = Document(\n",
" text=\"Example text. More example text. Even more text to illustrate.\",\n",
" id=\"doc1\"\n",
")\n",
"\n",
"# Execute the splitting\n",
"splitted_docs = text_splitter.call(documents=[doc])\n",
"\n",
"for doc in splitted_docs:\n",
" print(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LioyB3eCAOs8",
"outputId": "11cddc1c-608a-4027-830f-fe30a882a766"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Splitting Documents in Batches: 100%|██████████| 1/1 [00:00<00:00, 489.02it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Document(id=b0c308f2-73d2-44cf-aaf2-63e8f87198e4, text='Example text. More example', meta_data=None, vector=[], parent_doc_id=doc1, order=0, score=None)\n",
"Document(id=3a37adff-c8ac-4cff-8b5e-9c68e0de9772, text=' text. Even more text', meta_data=None, vector=[], parent_doc_id=doc1, order=1, score=None)\n",
"Document(id=e1b56768-7918-4a94-8f08-a01161cb2dcf, text=' to illustrate.', meta_data=None, vector=[], parent_doc_id=doc1, order=2, score=None)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"from adalflow.components.data_process.text_splitter import TextSplitter\n",
"from adalflow.core.types import Document\n",
"\n",
"# Configure the splitter settings\n",
"text_splitter = TextSplitter(\n",
" split_by=\"token\",\n",
" chunk_size=5,\n",
" chunk_overlap=0\n",
")\n",
"\n",
"doc = Document(\n",
" text=\"Example text. More example text. Even more text to illustrate.\",\n",
" id = \"doc1\"\n",
" )\n",
"\n",
"splitted_docs = (text_splitter.call(documents=[doc]))\n",
"\n",
"for doc in splitted_docs:\n",
" print(doc)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
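
As the API-key cell notes, the getpass prompt can be swapped for AdalFlow's setup_env helper, which loads OPENAI_API_KEY from a project .env file. A minimal sketch under that assumption (the .env file and its contents are yours to provide; only the setup_env import named in the cell comment comes from the notebook):

# Alternative to the getpass cell above. Assumes a .env file in the working
# directory containing a line such as: OPENAI_API_KEY=sk-...
import os

from adalflow.utils import setup_env

setup_env()  # reads the .env file and populates os.environ
print("OPENAI_API_KEY set:", "OPENAI_API_KEY" in os.environ)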