Skip to content

Commit

Permalink
Merge pull request #41 from Encamina/@lmarcos/text_splitter_improvements
Browse files Browse the repository at this point in the history
Text splitter improvements
  • Loading branch information
LuisM000 authored Jan 31, 2024
2 parents 01c2722 + 1b3345d commit b1f7128
Show file tree
Hide file tree
Showing 11 changed files with 240 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ More information about these warnings is available here: https://github.com/micr
- Signature of `UpsertMemoryAsync` method has changed in `IMemoryManager` interface.
- Signature of `BatchUpsertMemoriesAsync` method has changed in `IMemoryManager` interface.
- Dependency with `Kernel` has been removed in `MemoryManager` class. Also, added dependency with `ILogger`.
- Added method overloads to pass `Encamina.Enmarcha.AI.Abstractions.TextSplitterOptions` when splitting text in `Encamina.Enmarcha.AI.Abstractions.ITextSplitter` and its implementations.

### Major change
- Method `GetDocumentConnector` in `DocumentContentExtractorBase` is now `public` instead of `protected`.
Expand Down
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

<PropertyGroup>
<VersionPrefix>8.1.2</VersionPrefix>
<VersionSuffix>preview-15</VersionSuffix>
<VersionSuffix>preview-16</VersionSuffix>
</PropertyGroup>

<!--
Expand Down
6 changes: 6 additions & 0 deletions Enmarcha.sln
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.Samples.S
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.Samples.SemanticKernel.QuestionAnswering", "samples\SemanticKernel\Encamina.Enmarcha.Samples.SemanticKernel.QuestionAnswering\Encamina.Enmarcha.Samples.SemanticKernel.QuestionAnswering.csproj", "{AA1E5E93-FE02-4395-9260-C7C869F22785}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Encamina.Enmarcha.AI.Tests", "tst\Encamina.Enmarcha.AI.Tests\Encamina.Enmarcha.AI.Tests.csproj", "{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Encamina.Enmarcha.SemanticKernel.Tests", "tst\Encamina.Enmarcha.SemanticKernel.Tests\Encamina.Enmarcha.SemanticKernel.Tests.csproj", "{7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278}"
EndProject
Global
Expand Down Expand Up @@ -348,6 +349,10 @@ Global
{AA1E5E93-FE02-4395-9260-C7C869F22785}.Debug|Any CPU.Build.0 = Debug|Any CPU
{AA1E5E93-FE02-4395-9260-C7C869F22785}.Release|Any CPU.ActiveCfg = Release|Any CPU
{AA1E5E93-FE02-4395-9260-C7C869F22785}.Release|Any CPU.Build.0 = Release|Any CPU
{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5}.Release|Any CPU.ActiveCfg = Release|Any CPU
{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5}.Release|Any CPU.Build.0 = Release|Any CPU
{7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278}.Release|Any CPU.ActiveCfg = Release|Any CPU
Expand All @@ -367,6 +372,7 @@ Global
{BF6C4DFC-3CB3-4C62-8B86-08C0C1537CBC} = {B9E33951-E387-4A80-A652-A908FCBB34F3}
{9E8B3AEE-AC1C-4F46-A8D2-3EF550F64005} = {43252034-27E2-4981-AC2D-EA986B287863}
{AA1E5E93-FE02-4395-9260-C7C869F22785} = {43252034-27E2-4981-AC2D-EA986B287863}
{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78}
{7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
Expand Down
29 changes: 24 additions & 5 deletions src/Encamina.Enmarcha.AI.Abstractions/ITextSplitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ public interface ITextSplitter
IList<string> Separators { get; }

/// <summary>
/// Joins chunks into a single string using the specified separator.
/// Merges splits into chunks of text, using the specified separator, length function and <see cref="TextSplitterOptions"/>.
/// </summary>
/// <param name="chunks">The collection of chunks to join.</param>
/// <param name="separator">The separator to use between chunks.</param>
/// <returns>A single string with all the chunks joined together by the specified separator.</returns>
string JoinChunks(IEnumerable<string> chunks, string separator);
/// <param name="splits">The collection of splits to merge into chunks.</param>
/// <param name="separator">The separator to use between splits.</param>
/// <param name="lengthFunction">
/// The function used to calculate the length (or size) of each chunk, which is measured against the <see cref="TextSplitterOptions.ChunkSize"/> of <paramref name="options"/>.
/// </param>

Check warning on line 28 in src/Encamina.Enmarcha.AI.Abstractions/ITextSplitter.cs

View workflow job for this annotation

GitHub Actions / CI

Split this 201 characters long line (which is greater than 200 authorized). (https://rules.sonarsource.com/csharp/RSPEC-103)

Check warning on line 28 in src/Encamina.Enmarcha.AI.Abstractions/ITextSplitter.cs

View workflow job for this annotation

GitHub Actions / CI

Split this 201 characters long line (which is greater than 200 authorized). (https://rules.sonarsource.com/csharp/RSPEC-103)
/// <param name="options">Custom options used for merge.</param>
/// <returns>A collection of chunks built from the splits.</returns>
IEnumerable<string> MergeSplits(IEnumerable<string> splits, string separator, Func<string, int> lengthFunction, TextSplitterOptions options);

/// <summary>
/// Merges splits into chunks of text, using the specified separator and length function.
Expand All @@ -37,11 +39,28 @@ public interface ITextSplitter
/// <returns>A collection of chunks built from the splits.</returns>
IEnumerable<string> MergeSplits(IEnumerable<string> splits, string separator, Func<string, int> lengthFunction);

/// <summary>
/// Joins chunks into a single string using the specified separator.
/// </summary>
/// <param name="chunks">The collection of chunks to join.</param>
/// <param name="separator">The separator to use between chunks.</param>
/// <returns>A single string with all the chunks joined together by the specified separator.</returns>
string JoinChunks(IEnumerable<string> chunks, string separator);

/// <summary>
/// Splits the specified text, using the specified length function.
/// </summary>
/// <param name="text">The text to split.</param>
/// <param name="lengthFunction">A function to use to calculate the length (or size) of each split, usually specified by <see cref="ChunkSize"/>.</param>
/// <returns>A collection of text splits.</returns>
IEnumerable<string> Split(string text, Func<string, int> lengthFunction);

/// <summary>
/// Splits the specified text, using the specified length function and specified <see cref="TextSplitterOptions"/>.
/// </summary>
/// <param name="text">The text to be split.</param>
/// <param name="lengthFunction">Length function used to calculate the length of a string.</param>
/// <param name="options">Custom options used for splitting.</param>
/// <returns>An IEnumerable of smaller text chunks.</returns>
IEnumerable<string> Split(string text, Func<string, int> lengthFunction, TextSplitterOptions options);
}
34 changes: 30 additions & 4 deletions src/Encamina.Enmarcha.AI.Abstractions/TextSplitter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,18 @@ protected TextSplitter(IOptionsMonitor<TextSplitterOptions> options)
public IList<string> Separators { get; }

/// <inheritdoc/>
public abstract IEnumerable<string> Split(string text, Func<string, int> lengthFunction);
public abstract IEnumerable<string> Split(string text, Func<string, int> lengthFunction, TextSplitterOptions options);

/// <inheritdoc/>
public virtual IEnumerable<string> Split(string text, Func<string, int> lengthFunction)
{
    // No explicit options were supplied, so package this splitter's own
    // configuration and let the options-based overload do the actual work.
    var ownSettings = new TextSplitterOptions()
    {
        ChunkOverlap = ChunkOverlap,
        ChunkSize = ChunkSize,
        Separators = Separators,
    };

    return Split(text, lengthFunction, ownSettings);
}

/// <inheritdoc/>
/// <exception cref="ArgumentNullException">
Expand All @@ -69,12 +80,27 @@ public virtual string JoinChunks(IEnumerable<string> chunks, string separator)
/// Thrown when any of the parameters <paramref name="splits"/>, <paramref name="separator"/>, or <paramref name="lengthFunction"/> is <see langword="null"/>.
/// </exception>
public virtual IEnumerable<string> MergeSplits(IEnumerable<string> splits, string separator, Func<string, int> lengthFunction)
{
    // Callers of this overload did not provide options: fall back to the values
    // configured on this splitter instance and delegate to the options-based overload.
    var instanceSettings = new TextSplitterOptions()
    {
        ChunkOverlap = ChunkOverlap,
        ChunkSize = ChunkSize,
        Separators = Separators,
    };

    return MergeSplits(splits, separator, lengthFunction, instanceSettings);
}

/// <inheritdoc/>
/// <exception cref="ArgumentNullException">
/// Thrown when any of the parameters <paramref name="splits"/>, <paramref name="separator"/>, <paramref name="lengthFunction"/> or <paramref name="options"/> is <see langword="null"/>.
/// </exception>
public virtual IEnumerable<string> MergeSplits(IEnumerable<string> splits, string separator, Func<string, int> lengthFunction, TextSplitterOptions options)

Check warning on line 96 in src/Encamina.Enmarcha.AI.Abstractions/TextSplitter.cs

View workflow job for this annotation

GitHub Actions / CI

Refactor this method to reduce its Cognitive Complexity from 20 to the 15 allowed. (https://rules.sonarsource.com/csharp/RSPEC-3776)

Check warning on line 96 in src/Encamina.Enmarcha.AI.Abstractions/TextSplitter.cs

View workflow job for this annotation

GitHub Actions / CI

Refactor this method to reduce its Cognitive Complexity from 20 to the 15 allowed. (https://rules.sonarsource.com/csharp/RSPEC-3776)
{
Guard.IsNotNull(splits);
Guard.IsNotNull(separator);
Guard.IsNotNull(lengthFunction);
Guard.IsNotNull(options);

string chunk = null;
string chunk;
var chunks = new List<string>();
var currentChunks = new Queue<string>();

Expand All @@ -86,7 +112,7 @@ public virtual IEnumerable<string> MergeSplits(IEnumerable<string> splits, strin
var splitLength = lengthFunction(split);
var hasCurrentChunks = currentChunks.Any();

if (hasCurrentChunks && total + splitLength + separatorLength > ChunkSize)
if (hasCurrentChunks && total + splitLength + separatorLength > options.ChunkSize)
{
chunk = JoinChunks(currentChunks, separator);

Expand All @@ -100,7 +126,7 @@ public virtual IEnumerable<string> MergeSplits(IEnumerable<string> splits, strin
// - There is a larger chunk than the chunk overlap
while (
hasCurrentChunks
&& (total > ChunkOverlap || (total + splitLength + separatorLength > ChunkSize && total > 0)))
&& (total > options.ChunkOverlap || (total + splitLength + separatorLength > options.ChunkSize && total > 0)))
{
total -= lengthFunction(currentChunks.Dequeue()) + (currentChunks.Count > 1 ? separatorLength : 0);
hasCurrentChunks = currentChunks.Any();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
namespace Encamina.Enmarcha.AI.TextSplitters;

/// <summary>
/// The recomended implementation of <see cref="ITextSplitter"/> for generic texts. It splits texts in order until the chunks are small
/// The recommended implementation of <see cref="ITextSplitter"/> for generic texts. It splits texts in order until the chunks are small
/// enough (based on <see cref="ITextSplitter.ChunkSize"/>). It will try to keep all paragraphs (and then sentences, and then words) together
/// as long as possible, since those would generically seem to be the strongest semantically related pieces of text that could be split.
/// </summary>
Expand All @@ -20,13 +20,13 @@ public RecursiveCharacterTextSplitter(IOptionsMonitor<TextSplitterOptions> optio
}

/// <inheritdoc/>
public override IEnumerable<string> Split(string text, Func<string, int> lengthFunction)
public override IEnumerable<string> Split(string text, Func<string, int> lengthFunction, TextSplitterOptions options)
{
var chunks = new List<string>();

string separator = null;

foreach (var s in Separators)
foreach (var s in options.Separators)
{
if (s == string.Empty || text.Contains(s, StringComparison.OrdinalIgnoreCase))
{
Expand All @@ -41,29 +41,28 @@ public override IEnumerable<string> Split(string text, Func<string, int> lengthF

foreach (var split in splits)
{
if (lengthFunction(split) < ChunkSize)
if (lengthFunction(split) < options.ChunkSize)
{
goodSplits.Add(split);
}
else
{
if (goodSplits.Any())
{
chunks.AddRange(MergeSplits(goodSplits, separator, lengthFunction));
chunks.AddRange(MergeSplits(goodSplits, separator, lengthFunction, options));
goodSplits = new List<string>();
}

var otherChunks = Split(split, lengthFunction);
var otherChunks = Split(split, lengthFunction, options);
chunks.AddRange(otherChunks);
}
}

if (goodSplits.Any())
{
chunks.AddRange(MergeSplits(goodSplits, separator, lengthFunction));
chunks.AddRange(MergeSplits(goodSplits, separator, lengthFunction, options));
}

return chunks;
}
}

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.1</TargetFramework>
Expand All @@ -9,6 +9,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Options" Version="8.0.1" />
<PackageReference Include="Bogus" Version="35.4.0" />
</ItemGroup>

Expand Down
51 changes: 51 additions & 0 deletions src/Encamina.Enmarcha.Testing/TestOptionsMonitor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
using Microsoft.Extensions.Options;

namespace Encamina.Enmarcha.Testing;

/// <summary>
/// Represents an implementation of the <see cref="IOptionsMonitor{TOptions}"/> interface for testing purposes.
/// </summary>
/// <remarks>
/// Only one change listener is tracked at a time: each call to <see cref="OnChange"/> replaces the
/// previously registered listener. The same <see cref="CurrentValue"/> is returned for every options name.
/// </remarks>
/// <typeparam name="TOptions">The type of options being monitored.</typeparam>
public sealed class TestOptionsMonitor<TOptions> : IOptionsMonitor<TOptions>
{
    private Action<TOptions, string> currentListener;

    /// <summary>
    /// Initializes a new instance of the <see cref="TestOptionsMonitor{TOptions}"/> class.
    /// </summary>
    public TestOptionsMonitor()
    {
    }

    /// <summary>
    /// Initializes a new instance of the <see cref="TestOptionsMonitor{TOptions}"/> class with the specified initial value.
    /// </summary>
    /// <param name="currentValue">The initial value of the options.</param>
    public TestOptionsMonitor(TOptions currentValue)
    {
        CurrentValue = currentValue;
    }

    /// <inheritdoc/>
    public TOptions CurrentValue { get; private set; }

    /// <inheritdoc/>
    /// <remarks>The <paramref name="name"/> is ignored; the current value is returned for any name.</remarks>
    public TOptions Get(string name) => CurrentValue;

    /// <summary>
    /// Sets the current value of the options and invokes the change listener if registered.
    /// </summary>
    /// <param name="value">The new value of the options.</param>
    public void Set(TOptions value)
    {
        CurrentValue = value;

        // The listener is notified with an empty options name, i.e. as a change of the unnamed (default) options.
        currentListener?.Invoke(value, string.Empty);
    }

    /// <summary>
    /// Registers a listener to be called whenever <see cref="Set"/> changes the current value.
    /// </summary>
    /// <param name="listener">The action to invoke with the new value and the options name.</param>
    /// <returns>
    /// A registration token which, when disposed, stops the <paramref name="listener"/> from being notified.
    /// This method never returns <see langword="null"/>.
    /// </returns>
    public IDisposable OnChange(Action<TOptions, string> listener)
    {
        currentListener = listener;

        // Return a real token instead of null so callers can safely dispose the registration.
        return new ListenerRegistration(this, listener);
    }

    /// <summary>
    /// Token returned by <see cref="OnChange"/> that detaches its listener from the owning monitor when disposed.
    /// </summary>
    private sealed class ListenerRegistration : IDisposable
    {
        private readonly TestOptionsMonitor<TOptions> owner;
        private readonly Action<TOptions, string> listener;

        public ListenerRegistration(TestOptionsMonitor<TOptions> owner, Action<TOptions, string> listener)
        {
            this.owner = owner;
            this.listener = listener;
        }

        /// <inheritdoc/>
        public void Dispose()
        {
            // Only detach when this registration's listener is still the active one, so disposing
            // a stale token does not remove a listener that was registered afterwards.
            // Disposing more than once is harmless.
            if (ReferenceEquals(owner.currentListener, listener))
            {
                owner.currentListener = null;
            }
        }
    }
}
11 changes: 11 additions & 0 deletions tst/Encamina.Enmarcha.AI.Tests/Encamina.Enmarcha.AI.Tests.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\src\Encamina.Enmarcha.AI\Encamina.Enmarcha.AI.csproj" />
</ItemGroup>

</Project>
Loading

0 comments on commit b1f7128

Please sign in to comment.