Commit 4b44b64
Add xpdata.MapBuilder struct (#13617)
#### Description
This PR adds a new `MapBuilder` struct to the `xpdata` package, which
can be used to more efficiently create a `pcommon.Map` in receivers.
The simplest way to insert data in a `Map` is to repeatedly call
`Map.PutEmpty`, `Map.PutStr`, or other methods of the `Put` family.
These methods all handle duplicate keys on a "last write wins" basis by
calling `Map.Get` to find a potential existing value for the inserted
key. Unfortunately, `Get` is rather slow, as it performs a linear scan
of the `Map`'s key/value pairs. This means that building a `Map` through
repeated `Put` calls has quadratic complexity, and incurs significant
overhead even with a realistic number of keys.
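For context, here is a minimal sketch of the conventional `Put`-based pattern the paragraph above describes; the helper function and its input shape are purely illustrative:

```go
package example

import "go.opentelemetry.io/collector/pdata/pcommon"

// putAll copies key/value pairs into a pcommon.Map using the Put family.
// Each PutStr call looks up the key with a linear scan to enforce
// "last write wins", so inserting n keys costs O(n^2) overall.
func putAll(pairs [][2]string) pcommon.Map {
	m := pcommon.NewMap()
	m.EnsureCapacity(len(pairs))
	for _, kv := range pairs {
		m.PutStr(kv[0], kv[1]) // linear scan of existing keys on every call
	}
	return m
}
```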
`MapBuilder` allows components to build up a list of key/value pairs,
then turn them into a `Map` with very little overhead using the
`UnsafeIntoMap` method. However, this has the caveat that, if used
inappropriately, the final `Map` may contain duplicate keys, which
breaks a basic invariant of the data structure and a requirement of the
OTLP protocol, with unpredictable consequences.
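As a rough illustration of the intended usage: the import path, the builder's construction, and the append-style method below are my assumptions rather than the actual API added by this PR, and even `UnsafeIntoMap`'s exact signature may differ.

```go
package example

import (
	"go.opentelemetry.io/collector/pdata/pcommon"
	// Hypothetical import path; the actual location of MapBuilder may differ.
	"go.opentelemetry.io/collector/pdata/xpdata/pxcommon"
)

func buildAttrs(pairs [][2]string) pcommon.Map {
	var b pxcommon.MapBuilder
	for _, kv := range pairs {
		// Hypothetical append-style method: records the pair without
		// checking for an existing key, so each insertion is O(1).
		b.AppendStr(kv[0], kv[1])
	}
	// The caller must guarantee the keys are distinct: UnsafeIntoMap
	// (assumed signature) moves the pairs into the Map without deduplication.
	m := pcommon.NewMap()
	b.UnsafeIntoMap(m)
	return m
}
```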
I created [a
benchmark](https://github.com/jade-guiton-dd/opentelemetry-collector/blob/549e107e852d16f299f4199816a6c20eb5a5ea32/pdata/pcommon/map_experiments_test.go#L75)
to test this method, as well as a number of alternatives to optimize
insertions. The benchmark was performed on realistic data containing
duplicate keys, realistic data without duplicate keys, and worst-case
data (200 keys, treated as if they may contain duplicate keys). Here are
the basic results:
- Using `MapBuilder.UnsafeIntoMap` is the fastest option in all three
  scenarios. In scenarios with duplicate keys, we perform an upfront
  sorting + neighbor-comparison deduplication step before calling the
  method (a sketch of this step follows the list).
  - Realistic data without duplicates: This results in **-49% runtime**
    compared to the `Put` version.
  - Realistic data with duplicates: Despite the extra work of the
    deduplication step, this still results in -11% runtime. This is the
    use case I am interested in.
- Adding the data into a Go `map` and using the existing `Map.FromRaw`
  method, with code to reuse the `map` across iterations:
  - Worst-case data: Using `FromRaw` succeeds at reducing the worst-case
    complexity (-92% runtime compared to using `Put`), but manual
    sort-based deduplication + `MapBuilder.UnsafeIntoMap` still results
    in a further -56% runtime reduction.
  - Realistic data: Somewhat slower than `Put` (+3% runtime with
    duplicates / +29% runtime with no duplicates), and performs more
    memory allocations.
  - These numbers are significantly worse in the simpler (and more
    parallelizable) version where a new `map` is allocated each time.
- The safer alternative `XIntoMap` methods, which check in various ways
  that the input keys are distinct, are all too slow to be useful in the
  "realistic, no duplicates" case, and are worse than `UnsafeIntoMap` in
  the other cases. I would consider adding them only if `UnsafeIntoMap`
  is deemed too dangerous to even be in an experimental API like `xpdata`.
- Changing `pcommon.Map` internals to use a sorted array or Go `map`
  representation in order to reduce the cost of `Put`/`Get` also succeeds
  at reducing the worst-case time complexity, but it is invariably slower
  than the `Put` version in the realistic scenarios.
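Here is a minimal sketch of the sort + neighbor-comparison deduplication step mentioned in the first bullet, run on a plain slice of pairs before they are handed to the builder (the `kv` type and helper name are mine, not part of the PR):

```go
package example

import "sort"

type kv struct {
	Key, Val string
}

// dedupLastWins sorts pairs by key and keeps only the last occurrence of
// each key, preserving "last write wins" semantics. After this step the
// keys are distinct, so the result is safe to hand to an unchecked
// bulk-insertion method. Runs in O(n log n).
func dedupLastWins(pairs []kv) []kv {
	// A stable sort keeps equal keys in insertion order, so the most
	// recently inserted duplicate ends up last within its run.
	sort.SliceStable(pairs, func(i, j int) bool {
		return pairs[i].Key < pairs[j].Key
	})
	out := pairs[:0]
	for i, p := range pairs {
		// Compare each pair with its right neighbor: keep an entry only
		// if it is the last one in a run of equal keys.
		if i+1 < len(pairs) && pairs[i+1].Key == p.Key {
			continue
		}
		out = append(out, p)
	}
	return out
}
```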
#### Testing
I added a basic unit test for `MapBuilder`'s functionality.
---------
Co-authored-by: Pablo Baeyens <[email protected]>
File tree: 3 files changed (+105, −0 lines) — `.chloggen`, `pdata/xpdata`