Skip to content

Commit 1123ff2

Browse files
authored
Vector Index: FreshVamana (#260)
Approximate nearest neighbor search for high-dimensional data based on FreshVamana
1 parent a487118 commit 1123ff2

33 files changed

+31442
-32
lines changed

.gitattributes

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,9 @@
6161
#*.PDF diff=astextplain
6262
#*.rtf diff=astextplain
6363
#*.RTF diff=astextplain
64+
65+
###############################################################################
66+
# Git LFS: store large binary model files in LFS
67+
###############################################################################
68+
# ONNX model files
69+
# Use -text (the form `git lfs track` generates) so Git never applies end-of-line conversion to these files.
*.onnx filter=lfs diff=lfs merge=lfs -text

.github/workflows/ci-build.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@ jobs:
1212
runs-on: ubuntu-latest
1313

1414
steps:
15-
- uses: actions/checkout@v2
15+
- uses: actions/checkout@v5
16+
with:
17+
lfs: true
1618
- name: Setup .NET
17-
uses: actions/setup-dotnet@v3
19+
uses: actions/setup-dotnet@v5
1820
with:
1921
dotnet-version: |
2022
8.x

.github/workflows/codeql-analysis.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,13 @@ jobs:
3838

3939
steps:
4040
- name: Checkout repository
41-
uses: actions/checkout@v2
41+
uses: actions/checkout@v5
42+
with:
43+
lfs: true
4244

4345
# Initializes the CodeQL tools for scanning.
4446
- name: Initialize CodeQL
45-
uses: github/codeql-action/init@v2
47+
uses: github/codeql-action/init@v4
4648
with:
4749
languages: ${{ matrix.language }}
4850
# If you wish to specify custom queries, you can do so here or in a config file.
@@ -51,7 +53,7 @@ jobs:
5153
# queries: ./path/to/local/query, your-org/your-repo/queries@main
5254

5355
- name: Setup .NET
54-
uses: actions/setup-dotnet@v3
56+
uses: actions/setup-dotnet@v5
5557
with:
5658
dotnet-version: |
5759
8.x
@@ -61,4 +63,4 @@ jobs:
6163
run: dotnet build
6264

6365
- name: Perform CodeQL Analysis
64-
uses: github/codeql-action/analyze@v2
66+
uses: github/codeql-action/analyze@v4

.github/workflows/publish-to-nuget-org.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ jobs:
1010
runs-on: ubuntu-latest
1111

1212
steps:
13-
- uses: actions/checkout@v2
13+
- uses: actions/checkout@v5
14+
with:
15+
lfs: true
1416
- name: Setup .NET
15-
uses: actions/setup-dotnet@v3
17+
uses: actions/setup-dotnet@v5
1618
with:
1719
dotnet-version: |
1820
8.x

Akade.IndexedSet.Benchmarks/Akade.IndexedSet.Benchmarks.csproj

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
<PropertyGroup>
44
<OutputType>Exe</OutputType>
5-
<TargetFrameworks>net8.0;net9.0</TargetFrameworks>
5+
<TargetFrameworks>net9.0</TargetFrameworks>
66
<ImplicitUsings>enable</ImplicitUsings>
77
<Nullable>enable</Nullable>
88
<IsPackable>false</IsPackable>
@@ -14,6 +14,7 @@
1414
<PackageReference Include="Bogus" Version="35.6.5" />
1515
<PackageReference Include="Fastenshtein" Version="1.0.11" />
1616
<PackageReference Include="RBush" Version="4.0.0" />
17+
<PackageReference Include="Microsoft.SemanticKernel.Connectors.Onnx" Version="1.66.0-alpha" />
1718
</ItemGroup>
1819

1920
<ItemGroup>
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
using BenchmarkDotNet.Attributes;
2+
using BenchmarkDotNet.Engines;
3+
using Bogus;
4+
using Microsoft.Extensions.AI;
5+
using Microsoft.Extensions.DependencyInjection;
6+
using System.Numerics.Tensors;
7+
8+
namespace Akade.IndexedSet.Benchmarks;
9+
#pragma warning disable AkadeIndexedSetEXP0003 // The api for the vector indices is experimental
10+
11+
/// <summary>
/// Compares a brute-force LINQ scan (ordering by <c>1 - cosine similarity</c>) against the
/// vector index's <c>ApproximateNearestNeighbors</c> query. Both are measured on a collection of
/// 10,000 generated products and on the first 100 products of that collection.
/// </summary>
[MemoryDiagnoser]
[DisassemblyDiagnoser]
[SimpleJob(BenchmarkDotNet.Jobs.RuntimeMoniker.Net90)]
[JsonExporter]
public class VectorBenchmarks
{
    private readonly Consumer _consumer = new();

    private List<Product> _largeProductCollection = [];
    private List<Product> _smallProductCollection = [];

    private IndexedSet<Product> _indexedSetLarge = null!;
    private IndexedSet<Product> _indexedSetSmall = null!;

    /// <summary>
    /// Generates the products, computes an embedding for every product name and builds
    /// one indexed set (with a vector index) per collection size.
    /// </summary>
    [GlobalSetup]
    public async Task SetupAsync()
    {
        // The embedding model and vocabulary are loaded from the test project via relative paths.
        ServiceCollection services = new();
        services.AddBertOnnxEmbeddingGenerator(
            onnxModelPath: "../../../../../../../../Akade.IndexedSet.Tests/Models/BgeMicroV2/model.onnx",
            vocabPath: "../../../../../../../../Akade.IndexedSet.Tests/Models/BgeMicroV2/vocab.txt");

        using ServiceProvider serviceProvider = services.BuildServiceProvider();
        IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator =
            serviceProvider.GetRequiredService<IEmbeddingGenerator<string, Embedding<float>>>();

        // Seed Bogus with a fixed value before any product is generated.
        Randomizer.Seed = new Random(42);
        Faker<Product> faker = new Faker<Product>().CustomInstantiator(f => new Product(f.Commerce.ProductName()));

        _largeProductCollection = faker.Generate(10_000);

        IEnumerable<string> productNames = _largeProductCollection.Select(product => product.Name);
        GeneratedEmbeddings<Embedding<float>> embeddings = await embeddingGenerator.GenerateAsync(productNames);

        // Embeddings are assigned by position: the i-th embedding belongs to the i-th product.
        int index = 0;
        foreach (Product product in _largeProductCollection)
        {
            product.Embedding = embeddings[index++];
        }

        _smallProductCollection = [.. _largeProductCollection.Take(100)];

        _indexedSetLarge = _largeProductCollection.ToIndexedSet()
                                                  .WithVectorIndex(x => x.Embedding!.Vector.Span)
                                                  .Build();

        _indexedSetSmall = _smallProductCollection.ToIndexedSet()
                                                  .WithVectorIndex(x => x.Embedding!.Vector.Span)
                                                  .Build();
    }

    /// <summary>Brute force over the large collection: order by cosine distance to the first product, take 10.</summary>
    [Benchmark]
    [BenchmarkCategory("large")]
    public void NearestNeighbor_Large_Linq()
    {
        Product query = _largeProductCollection[0];

        IEnumerable<Product> nearest = _largeProductCollection
            .OrderBy(candidate => 1 - TensorPrimitives.CosineSimilarity(query.Embedding!.Vector.Span, candidate.Embedding!.Vector.Span))
            .Take(10);

        nearest.Consume(_consumer);
    }

    /// <summary>Vector index query over the large set: 10 approximate nearest neighbors of the first product.</summary>
    [Benchmark]
    [BenchmarkCategory("large")]
    public void NearestNeighbor_Large_IndexedSet()
    {
        Product query = _largeProductCollection[0];

        // NOTE(review): the selector is kept textually identical to the one passed to WithVectorIndex —
        // presumably the index is resolved from it; confirm before reformatting.
        _indexedSetLarge.ApproximateNearestNeighbors(x => x.Embedding!.Vector.Span, query.Embedding!.Vector.Span, 10)
                        .Consume(_consumer);
    }

    /// <summary>Brute force over the small collection: order by cosine distance to the first product, take 10.</summary>
    [Benchmark]
    [BenchmarkCategory("small")]
    public void NearestNeighbor_Small_Linq()
    {
        Product query = _smallProductCollection[0];

        IEnumerable<Product> nearest = _smallProductCollection
            .OrderBy(candidate => 1 - TensorPrimitives.CosineSimilarity(query.Embedding!.Vector.Span, candidate.Embedding!.Vector.Span))
            .Take(10);

        nearest.Consume(_consumer);
    }

    /// <summary>Vector index query over the small set: 10 approximate nearest neighbors of the first product.</summary>
    [Benchmark]
    [BenchmarkCategory("small")]
    public void NearestNeighbor_Small_IndexedSet()
    {
        Product query = _smallProductCollection[0];

        // NOTE(review): the selector is kept textually identical to the one passed to WithVectorIndex —
        // presumably the index is resolved from it; confirm before reformatting.
        _indexedSetSmall.ApproximateNearestNeighbors(x => x.Embedding!.Vector.Span, query.Embedding!.Vector.Span, 10)
                        .Consume(_consumer);
    }

    /// <summary>Benchmark element: a product name plus its embedding, which is assigned during setup.</summary>
    public record class Product(string Name)
    {
        public Embedding<float>? Embedding { get; set; }
    }
}

Akade.IndexedSet.Tests/Akade.IndexedSet.Tests.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
<ItemGroup>
1212
<PackageReference Include="Bogus" Version="35.6.5" />
1313
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="18.0.0" />
14+
<PackageReference Include="Microsoft.SemanticKernel.Connectors.Onnx" Version="1.66.0-alpha" />
1415
<PackageReference Include="MSTest.TestAdapter" Version="4.0.1" />
1516
<PackageReference Include="MSTest.TestFramework" Version="4.0.1" />
1617
<PackageReference Include="coverlet.collector" Version="6.0.4">
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#if NET9_0_OR_GREATER
2+
using Akade.IndexedSet.DataStructures.FreshVamana;
3+
4+
namespace Akade.IndexedSet.Tests.DataStructures;
5+
6+
/// <summary>
/// Recall tests for <c>FreshVamanaGraph</c> using 1000 random 128-dimensional vectors
/// drawn from a <see cref="Random"/> seeded with 42.
/// </summary>
[TestClass]
public class FreshVamanaGraphTests
{
    /// <summary>Reference-type wrapper around a vector so that items are compared by identity.</summary>
    private class TestData(float[] data)
    {
        public float[] Data { get; } = data;
    }

    private readonly Random _random = new(42);
    private readonly List<TestData> _randomTestData;

    public FreshVamanaGraphTests()
    {
        // Vectors are filled one after another, component by component, from the seeded generator.
        _randomTestData = new List<TestData>(1000);

        for (int itemIndex = 0; itemIndex < 1000; itemIndex++)
        {
            float[] vector = new float[128];

            for (int component = 0; component < vector.Length; component++)
            {
                vector[component] = _random.NextSingle();
            }

            _randomTestData.Add(new TestData(vector));
        }
    }

    [TestMethod]
    public void NearestNeighbors_returns_closest_items()
    {
        FreshVamanaGraph<TestData> graph = CreatePopulatedGraph();

        // Every item must be found among its own 5 approximate nearest neighbors often enough.
        _ = AssertRecall(graph);
    }

    /// <summary>
    /// Queries the graph with every test vector (k = 5), counts how often the item itself is returned
    /// and asserts that this fraction is at least <paramref name="minimumRecall"/> (0.9 when omitted).
    /// </summary>
    /// <returns>The measured recall.</returns>
    private float AssertRecall(FreshVamanaGraph<TestData> graph, float? minimumRecall = null)
    {
        float requiredRecall = minimumRecall ?? 0.9f;
        int hits = 0;

        foreach (TestData candidate in _randomTestData)
        {
            IEnumerable<TestData> neighbors = graph.ApproximateNearestNeighbors(candidate.Data.AsSpan(), 5);
            hits += neighbors.Contains(candidate) ? 1 : 0;
        }

        float recall = (float)hits / _randomTestData.Count;
        Assert.IsGreaterThanOrEqualTo(requiredRecall, recall, $"Recall was too low: {recall:P2}");
        return recall;
    }

    [TestMethod]
    public void Stability_when_deleting_items()
    {
        FreshVamanaGraph<TestData> graph = CreatePopulatedGraph();

        float initialRecall = AssertRecall(graph);
        Console.WriteLine($"Built from scratch: recall: {initialRecall:P2}");

        HashSet<TestData> items = new(50);

        for (int iteration = 1; iteration <= 10; iteration++)
        {
            // Pick 50 distinct random items, remove them from the graph and insert them again.
            items.Clear();
            while (items.Count < 50)
            {
                int pick = _random.Next(0, _randomTestData.Count);
                items.Add(_randomTestData[pick]);
            }

            foreach (TestData removed in items)
            {
                graph.Delete(removed);
            }

            Console.WriteLine();

            foreach (TestData restored in items)
            {
                graph.Add(restored);
            }

            // A slightly lower recall is accepted after delete/re-insert cycles.
            float recall = AssertRecall(graph, 0.85f);
            Console.WriteLine($"Iteration {iteration} complete: recall: {recall:P2}");
        }
    }

    /// <summary>Creates a graph with default settings and adds all random test items in list order.</summary>
    private FreshVamanaGraph<TestData> CreatePopulatedGraph()
    {
        FreshVamanaGraph<TestData> graph = new(x => x.Data.AsSpan(), FreshVamanaSettings.Default);

        foreach (TestData item in _randomTestData)
        {
            graph.Add(item);
        }

        return graph;
    }
}
103+
104+
#endif
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:9f705befe60d00ca3d8d14c9dd61a3ecfca9f1920a39fbc4a5b056c0ccd977d4
3+
size 69035106

0 commit comments

Comments
 (0)