Skip to content

Commit

Permalink
Merge pull request #103 from Encamina/@lmarcos/excel_document_conector
Browse files Browse the repository at this point in the history
Added Excel Document Connector
  • Loading branch information
LuisM000 authored Apr 24, 2024
2 parents 30d6f74 + 3abdefe commit 9bdeab0
Show file tree
Hide file tree
Showing 28 changed files with 1,894 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Previous classification is not required if changes are simple or all belong to t
- New method `SupportedFileExtension` to check if a file extension is supported by the current instance of the `IDocumentConnectorProvider`.
- New method `AddDocumentConnector` to add (or replace) a document connector in the current instance of the `IDocumentConnectorProvider` for a specific file extension.
- Added new class `DocumentConnectorProviderBase` which provides a default base implementation of `IDocumentConnectorProvider`.
- Added new document connector `ExcelToMarkdownDocumentConnector` to read Excel files (`.xlsx`) and export their content as Markdown tables.

### Minor Changes

Expand Down
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

<PropertyGroup>
<VersionPrefix>8.1.6</VersionPrefix>
<VersionSuffix>preview-04</VersionSuffix>
<VersionSuffix>preview-05</VersionSuffix>
</PropertyGroup>

<!--
Expand Down
9 changes: 8 additions & 1 deletion Enmarcha.sln
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.Data.Azur
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.AspNet.OpenApi", "src\Encamina.Enmarcha.AspNet.OpenApi\Encamina.Enmarcha.AspNet.OpenApi.csproj", "{0EFAA5CF-7106-40E0-A427-1CFBFFAEA3EC}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Encamina.Enmarcha.Core.Tests", "tst\Encamina.Enmarcha.Core.Tests\Encamina.Enmarcha.Core.Tests.csproj", "{0516ADAE-C543-4B48-94EE-AC535DEFED0E}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.Core.Tests", "tst\Encamina.Enmarcha.Core.Tests\Encamina.Enmarcha.Core.Tests.csproj", "{0516ADAE-C543-4B48-94EE-AC535DEFED0E}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Encamina.Enmarcha.SemanticKernel.Connectors.Document.Tests", "tst\Encamina.Enmarcha.SemanticKernel.Connectors.Document.Tests\Encamina.Enmarcha.SemanticKernel.Connectors.Document.Tests.csproj", "{1E9782AE-28E8-4C09-A66B-22A903A76C7F}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -376,6 +378,10 @@ Global
{0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Release|Any CPU.Build.0 = Release|Any CPU
{1E9782AE-28E8-4C09-A66B-22A903A76C7F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1E9782AE-28E8-4C09-A66B-22A903A76C7F}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1E9782AE-28E8-4C09-A66B-22A903A76C7F}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1E9782AE-28E8-4C09-A66B-22A903A76C7F}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand All @@ -394,6 +400,7 @@ Global
{7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78}
{7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78}
{0516ADAE-C543-4B48-94EE-AC535DEFED0E} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78}
{1E9782AE-28E8-4C09-A66B-22A903A76C7F} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {F30DF47A-541C-4383-BCEB-E4108D06A70E}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
using System.Text;

using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Models.Excel;

using Microsoft.SemanticKernel.Plugins.Document;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;

/// <summary>
/// Extracts text from an Excel file (<c>.xlsx</c>) and exports it to Markdown table format.
/// </summary>
/// <remarks>
/// Every worksheet is rendered as a block of pipe-delimited rows (<c>|a|b|c|</c>). Consecutive
/// worksheets are separated by <see cref="WorksheetSeparator"/>.
/// </remarks>
public class ExcelToMarkdownDocumentConnector : IDocumentConnector
{
    /// <summary>
    /// Gets the options for loading the Excel document.
    /// </summary>
    public ExcelLoadOptions ExcelLoadOptions { get; } = new()
    {
        ExcludeHiddenSheets = true,
        LoadOnlyCellsRangeWithText = true,
        ExcludeEmptyColumns = false,
        ExcludeEmptyRows = false,
        ExcludeHiddenColumns = true,
        ExcludeHiddenRows = true,
        MergeEmptyRowsRules = new MergeEmptyElementsRules()
        {
            MinimumElementsToMerge = 10,
            ResultingElementsFromMerge = 5
        },
        MergeEmptyColumnsRules = new MergeEmptyElementsRules()
        {
            MinimumElementsToMerge = 10,
            ResultingElementsFromMerge = 5
        }
    };

    /// <summary>
    /// Gets the value used to replace line breaks found inside a cell. Defaults to <c>&lt;br&gt;</c>.
    /// </summary>
    public string LineBreakReplacement { get; init; } = "<br>";

    /// <summary>
    /// Gets a value indicating whether the formatted text of each cell should be used
    /// instead of its plain text. Defaults to <see langword="true"/>.
    /// </summary>
    public bool WithFormattedValues { get; init; } = true;

    /// <summary>
    /// Gets a value indicating whether a separator row (<c>|---|---|</c>) should be added after the
    /// first row of each worksheet, which is treated as the header. Defaults to <see langword="true"/>.
    /// </summary>
    public bool WithHeaderSeparator { get; init; } = true;

    /// <summary>
    /// Gets a value indicating whether styling (bold, italic) should be preserved
    /// as Markdown emphasis markers. Defaults to <see langword="true"/>.
    /// </summary>
    public bool WithStyling { get; init; } = true;

    /// <summary>
    /// Gets a value indicating whether the worksheet name should be included in the output.
    /// Defaults to <see langword="false"/>.
    /// </summary>
    public bool WithWorksheetName { get; init; }

    /// <summary>
    /// Gets the line written between two consecutive worksheets. Defaults to <c>---</c>.
    /// </summary>
    public string WorksheetSeparator { get; init; } = "---";

    /// <summary>
    /// Gets the function that builds the line written before a worksheet's rows from the worksheet name.
    /// It is only used when <see cref="WithWorksheetName"/> is <see langword="true"/>.
    /// By default it produces <c># {worksheetName}:</c>.
    /// </summary>
    public Func<string, string> WorksheetTemplateName { get; init; } = (worksheetName) => $"# {worksheetName}:";

    /// <inheritdoc/>
    public string ReadText(Stream stream)
    {
        var excelDocument = ExcelDocument.Create(stream, ExcelLoadOptions);
        var worksheets = excelDocument.Worksheets.ToList();

        var resultSb = new StringBuilder();
        var worksheetSb = new StringBuilder();

        for (var worksheetIndex = 0; worksheetIndex < worksheets.Count; worksheetIndex++)
        {
            var worksheet = worksheets[worksheetIndex];

            if (WithWorksheetName)
            {
                worksheetSb.AppendLine(WorksheetTemplateName(worksheet.Name));
            }

            // The header is identified by position (first row), not by comparing row
            // instances, so the result does not depend on how rows implement equality.
            var isHeaderRow = true;

            foreach (var row in worksheet.Rows)
            {
                var rowTexts = row.Select(GetCellTextValue).ToList();

                worksheetSb.AppendLine($"|{string.Join("|", rowTexts)}|");

                // Adds a separator between the header and the data
                if (isHeaderRow && WithHeaderSeparator)
                {
                    worksheetSb.AppendLine($"|{string.Join("|", Enumerable.Repeat("---", rowTexts.Count))}|");
                }

                isHeaderRow = false;
            }

            resultSb.AppendLine(worksheetSb.ToString().Trim());

            // The separator goes only between worksheets, never after the last one.
            if (worksheetIndex < worksheets.Count - 1)
            {
                resultSb.Append(WorksheetSeparator).AppendLine();
            }

            worksheetSb.Clear();
        }

        return resultSb.ToString().Trim();
    }

    /// <inheritdoc/>
    public void Initialize(Stream stream)
    {
        // Intentionally left as a no-op (instead of throwing) to comply with the Liskov Substitution Principle
    }

    /// <inheritdoc/>
    public void AppendText(Stream stream, string text)
    {
        // Intentionally left as a no-op (instead of throwing) to comply with the Liskov Substitution Principle
    }

    /// <summary>
    /// Wraps the non-whitespace content of <paramref name="value"/> with Markdown emphasis markers.
    /// </summary>
    /// <param name="value">The text to style. Returned unchanged when it is null, empty or only whitespace.</param>
    /// <param name="bold">Whether to add the bold marker (<c>**</c>).</param>
    /// <param name="italic">Whether to add the italic marker (<c>*</c>).</param>
    /// <returns>
    /// The text with the markers placed right next to its first and last non-whitespace characters,
    /// keeping any leading and trailing whitespace outside of the markers.
    /// </returns>
    private static string ApplyStyles(string value, bool bold, bool italic)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            return value;
        }

        var style = (bold ? "**" : string.Empty) + (italic ? "*" : string.Empty);

        if (style.Length == 0)
        {
            return value;
        }

        // The guard above guarantees at least one non-whitespace character,
        // so `contentStart` is always lower than `contentEnd`.
        var contentStart = value.Length - value.TrimStart().Length;
        var contentEnd = value.TrimEnd().Length;

        return string.Concat(
            value[..contentStart],
            style,
            value[contentStart..contentEnd],
            style,
            value[contentEnd..]);
    }

    /// <summary>
    /// Builds the Markdown text for a single cell.
    /// </summary>
    /// <param name="cell">The cell to read.</param>
    /// <returns>
    /// The cell text with line breaks replaced by <see cref="LineBreakReplacement"/> and, when
    /// <see cref="WithStyling"/> is enabled, wrapped with emphasis markers. May be <see langword="null"/>
    /// when the cell has no text.
    /// </returns>
    private string GetCellTextValue(Cell cell)
    {
        // Get the cell value with or without formatting
        var cellValue = WithFormattedValues ? cell.FormattedText : cell.Text;

        // Replace line breaks with the specified replacement, so each row stays on one line
        cellValue = cellValue?.ReplaceLineEndings(LineBreakReplacement);

        // Apply styles to the cell value
        if (WithStyling)
        {
            cellValue = ApplyStyles(cellValue, cell.IsBold, cell.IsItalic);
        }

        return cellValue;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public class DocumentConnectorProviderBase : IDocumentConnectorProvider
{ @".TXT", new TxtDocumentConnector(Encoding.UTF8) },
{ @".MD", new TxtDocumentConnector(Encoding.UTF8) },
{ @".VTT", new VttDocumentConnector(Encoding.UTF8) },
{ @".XLSX", new ExcelToMarkdownDocumentConnector() },
};

/// <inheritdoc/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="ExcelNumberFormat" Version="1.1.0" />
<PackageReference Include="Microsoft.SemanticKernel.Plugins.Document" Version="1.7.1-alpha" />
<PackageReference Include="PdfPig" Version="0.1.8" />
</ItemGroup>
Expand All @@ -43,6 +44,10 @@
<LastGenOutput>ExceptionMessages.Designer.cs</LastGenOutput>
</EmbeddedResource>
</ItemGroup>

<ItemGroup>
<InternalsVisibleTo Include="Encamina.Enmarcha.SemanticKernel.Connectors.Document.Tests" />
</ItemGroup>

<ItemGroup>
<None Include="README.md" Pack="true" PackagePath="\" />
Expand Down
Loading

0 comments on commit 9bdeab0

Please sign in to comment.