-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #30 from Encamina/@lmarcos/issue-27#-token-counting-functions-are-slow — "@lmarcos/issue 27#: token counting functions are slow"
- Loading branch information
Showing 3 changed files with 77 additions and 4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,18 +5,45 @@ namespace Encamina.Enmarcha.SemanticKernel.Abstractions; | |
/// <inheritdoc/>
public interface ILengthFunctions : AI.Abstractions.ILengthFunctions
{
    /// <summary>
    /// Gets the default <see cref="GptEncoding">encoding</see> for models like `GPT-3.5-Turbo` and `GPT-4` from OpenAI.
    /// </summary>
    public static readonly GptEncoding DefaultGptEncoding = GptEncoding.GetEncoding("cl100k_base");

    /// <summary>
    /// Cache of <see cref="GptEncoding"/> instances keyed by encoding name, so each encoding is built only once.
    /// A <see cref="System.Collections.Concurrent.ConcurrentDictionary{TKey, TValue}"/> is used because this cache
    /// is static and the functions exposed by this interface may be invoked from multiple threads concurrently;
    /// a plain <c>Dictionary</c> mutated without synchronization is not safe for that usage.
    /// </summary>
    private static readonly System.Collections.Concurrent.ConcurrentDictionary<string, GptEncoding> EncodingCache = new();

    /// <summary>
    /// Gets the number of tokens using encodings for models like `GPT-3.5-Turbo` and `GPT-4` from OpenAI on the specified text.
    /// If the text is <see langword="null"/> or empty (i.e., <see cref="string.Empty"/>), returns zero (<c>0</c>).
    /// </summary>
    /// <seealso href="https://platform.openai.com/tokenizer"/>
    /// <seealso href="https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb"/>
    public static Func<string, int> LengthByTokenCount => (text) => string.IsNullOrEmpty(text) ? 0 : DefaultGptEncoding.Encode(text).Count;

    /// <summary>
    /// Gets the number of tokens using a given encoding on the specified text.
    /// If the text is <see langword="null"/> or empty (i.e., <see cref="string.Empty"/>), returns zero (<c>0</c>).
    /// </summary>
    /// <seealso href="https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb"/>
    public static Func<string, string, int> LengthByTokenCountUsingEncoding => (encoding, text) => string.IsNullOrEmpty(text) ? 0 : GetCachedEncoding(encoding).Encode(text).Count;

    /// <summary>
    /// Gets the <see cref="GptEncoding"/> instance for the specified encoding name, caching it for future use.
    /// </summary>
    /// <param name="encoding">The name of the <see cref="GptEncoding"/>.</param>
    /// <returns>The cached <see cref="GptEncoding"/> instance.</returns>
    /// <remarks>
    /// <see cref="System.Collections.Concurrent.ConcurrentDictionary{TKey, TValue}.GetOrAdd(TKey, Func{TKey, TValue})"/>
    /// performs the lookup-or-create atomically with respect to the dictionary's integrity. The value factory may run
    /// more than once under contention, which is harmless here: <see cref="GptEncoding.GetEncoding(string)"/> is a pure lookup.
    /// </remarks>
    private static GptEncoding GetCachedEncoding(string encoding) => EncodingCache.GetOrAdd(encoding, GptEncoding.GetEncoding);
}