Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ public TikTokenTokenizers(ITestOutputHelper output) : base(output)
}

[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "AI")]
public void TheyCountTokens()
{
const string text = "{'bos_token': '<|endoftext|>',\n 'eos_token': '<|endoftext|>',\n 'unk_token': '<|endoftext|>'}";
Expand Down
48 changes: 45 additions & 3 deletions service/Core/DataFormats/Office/MsExcelDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,54 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
{
IXLCell? cell = cells[i];

/* Note: some data types are not well supported; for example the values below
* are extracted incorrectly regardless of the cell configuration.
* In these cases, using the Text cell type might be better.
*
* - Date: "Monday, December 25, 2090" => "69757"
* - Time: "12:55:00" => "0.5381944444444444"
* - Time: "12:55" => "12/31/1899"
* - Currency symbols are not extracted
*/
if (this._config.WithQuotes)
{
sb.Append('"');
sb.Append(cell is { Value.IsText: true }
? cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal)
: this._config.BlankCellValue);
if (cell == null || cell.Value.IsBlank)
{
sb.Append(this._config.BlankCellValue);
}
else if (cell.Value.IsTimeSpan)
{
sb.Append(cell.Value.GetTimeSpan().ToString(this._config.TimeSpanFormat, this._config.TimeSpanProvider));
}
else if (cell.Value.IsDateTime)
{
// TODO: check cell.Style.DateFormat.Format
sb.Append(cell.Value.GetDateTime().ToString(this._config.DateFormat, this._config.DateFormatProvider));
}
else if (cell.Value.IsBoolean)
{
sb.Append(cell.Value.GetBoolean() ? this._config.BooleanTrueValue : this._config.BooleanFalseValue);
}
else if (cell.Value.IsText)
{
var value = cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal);
sb.Append(string.IsNullOrEmpty(value) ? this._config.BlankCellValue : value);
}
else if (cell.Value.IsNumber)
{
// TODO: check cell.Style.NumberFormat.Format and cell.Style.DateFormat.Format to detect dates, currency symbols, phone numbers
sb.Append(cell.Value.GetNumber());
}
else if (cell.Value.IsUnifiedNumber)
{
sb.Append(cell.Value.GetUnifiedNumber());
}
else if (cell.Value.IsError)
{
sb.Append(cell.Value.GetError().ToString().Replace("\"", "\"\"", StringComparison.Ordinal));
}

sb.Append('"');
}
else
Expand Down
17 changes: 9 additions & 8 deletions service/Core/DataFormats/Office/MsExcelDecoderConfig.cs
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Globalization;

namespace Microsoft.KernelMemory.DataFormats.Office;

/// <summary>
/// Settings read by the Excel decoder when turning worksheet cells into text.
/// </summary>
/// <remarks>
/// The two format-provider properties default to <see cref="CultureInfo.CurrentCulture"/>,
/// evaluated when the config instance is created, so date and time output depends on the
/// culture of the thread that builds the config unless the providers are set explicitly.
/// </remarks>
public class MsExcelDecoderConfig
{
    /// <summary>Presumably toggles emission of <see cref="WorksheetNumberTemplate"/> (usage is not visible here).</summary>
    public bool WithWorksheetNumber { get; set; } = true;

    /// <summary>Presumably toggles emission of <see cref="EndOfWorksheetMarkerTemplate"/> (usage is not visible here). Off by default.</summary>
    public bool WithEndOfWorksheetMarker { get; set; }

    /// <summary>When true, each cell value is wrapped in double quotes and embedded quotes are doubled.</summary>
    public bool WithQuotes { get; set; } = true;

    /// <summary>Template for the worksheet header; contains a <c>{number}</c> placeholder.</summary>
    public string WorksheetNumberTemplate { get; set; } = "\n# Worksheet {number}\n";

    /// <summary>Template for the end-of-worksheet marker; contains a <c>{number}</c> placeholder.</summary>
    public string EndOfWorksheetMarkerTemplate { get; set; } = "\n# End of worksheet {number}";

    /// <summary>Text associated with the start of a row (empty by default).</summary>
    public string RowPrefix { get; set; } = "";

    /// <summary>Text associated with the gap between columns.</summary>
    public string ColumnSeparator { get; set; } = ", ";

    /// <summary>Text associated with the end of a row (empty by default).</summary>
    public string RowSuffix { get; set; } = "";

    /// <summary>Value written for missing or blank cells, and for text cells whose text is empty.</summary>
    public string BlankCellValue { get; set; } = "";

    /// <summary>Value written for boolean cells holding true.</summary>
    public string BooleanTrueValue { get; set; } = "TRUE";

    /// <summary>Value written for boolean cells holding false.</summary>
    public string BooleanFalseValue { get; set; } = "FALSE";

    /// <summary>Format string passed to <c>TimeSpan.ToString</c> for time-span cells.</summary>
    public string TimeSpanFormat { get; set; } = "g";

    /// <summary>Format provider used together with <see cref="TimeSpanFormat"/>.</summary>
    public IFormatProvider TimeSpanProvider { get; set; } = CultureInfo.CurrentCulture;

    /// <summary>Format string passed to <c>DateTime.ToString</c> for date cells.</summary>
    public string DateFormat { get; set; } = "d";

    /// <summary>Format provider used together with <see cref="DateFormat"/>.</summary>
    public IFormatProvider DateFormatProvider { get; set; } = CultureInfo.CurrentCulture;
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@
<Content Include="file2-largePDF.pdf">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="file3-data.xlsx" />
<Content Include="file3-data.xlsx">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.DataFormats.Office;
using Microsoft.TestHelpers;
using Xunit.Abstractions;

namespace Microsoft.Core.FunctionalTests.DataFormats.Office;

/// <summary>
/// Tests for <see cref="MsExcelDecoder"/> using the "file3-data.xlsx" sample workbook.
/// </summary>
public class MsExcelDecoderTest : BaseFunctionalTestCase
{
    public MsExcelDecoderTest(IConfiguration cfg, ITestOutputHelper output) : base(cfg, output)
    {
    }

    /// <summary>
    /// Decodes the sample workbook with the default configuration and checks that
    /// numbers, text, dates and booleans appear in the extracted text as quoted values.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "DataFormats")]
    public async Task ItExtractsAllTypes()
    {
        // Arrange
        const string file = "file3-data.xlsx";

        // The expected strings below ("1/12/2009", "512.99", ...) are en-US renderings.
        // The decoder config defaults its date provider to CultureInfo.CurrentCulture and
        // numbers are appended using the current culture, so pin the culture here;
        // otherwise the test fails on machines with a different regional setting.
        // The culture must be set BEFORE the decoder is built, because the config
        // captures CurrentCulture when it is instantiated.
        var previousCulture = System.Globalization.CultureInfo.CurrentCulture;
        System.Globalization.CultureInfo.CurrentCulture = System.Globalization.CultureInfo.GetCultureInfo("en-US");

        string content;
        try
        {
            var decoder = new MsExcelDecoder();

            // Act
            FileContent result = await decoder.DecodeAsync(file);

            // One line per section, each terminated by a newline.
            content = string.Concat(result.Sections.Select(s => s.Content + "\n"));
        }
        finally
        {
            System.Globalization.CultureInfo.CurrentCulture = previousCulture;
        }

        Console.WriteLine(content);

        // Assert
        Assert.Contains("\"0.5\"", content); // 50% percentage
        Assert.Contains("\"512.99\"", content); // number
        Assert.Contains("\"3.99999999\"", content); // number
        Assert.Contains("\"0.25\"", content); // fraction
        Assert.Contains("\"123.6\"", content); // currency
        Assert.Contains("\"4518\"", content); // currency
        Assert.Contains("\"444666\"", content); // currency
        Assert.Contains("\"United States of America\"", content); // text
        Assert.Contains("\"Rome\", \"\", \"Tokyo\"", content); // text with empty columns
        Assert.Contains("\"1/12/2009\"", content); // date
        Assert.Contains("\"12/25/2090\"", content); // date
        Assert.Contains("\"98001\"", content); // zip code
        Assert.Contains("\"15554000600\"", content); // phone number
        Assert.Contains("\"TRUE\"", content); // boolean
    }
}
Binary file not shown.