Changes from 11 commits
61 changes: 57 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -43,6 +43,7 @@ serde_json = { version = "1", features = ["preserve_order"]}
bumpalo = { version = "3", features = ["collections"] }
itoa = "1"
ryu = "1"
lasso = { version = "0.7.3", features = ["ahasher", "inline-more"] }

[target.'cfg(not(windows))'.dependencies]
# jemalloc is significantly more performant than the system allocator.
83 changes: 83 additions & 0 deletions PERF.md
@@ -403,6 +403,89 @@ Template (copy/paste):
- **Where**: `src/json_stream_output.rs` (FileTime/SysTime serialization).
- **Impact (omer-pc, `-t 1`)**: reverting to chrono formatting regresses **+3.31%** median (605.5 ms → 625.6 ms).

### H1 (partial) — Reuse scratch buffer + reduce key/value churn in streaming JSONL output
- **What changed**:
- `evtx_dump` (`-o jsonl`, `--json-parser streaming`, `-t 1`) now reuses a single `JsonStreamOutput<Vec<u8>>` across records and
writes it directly to the output stream (avoids per-record `Vec<u8>` + `String` allocation in `EvtxRecord::into_json_stream()`).
- `JsonStreamOutput` reduces per-record heap churn by:
- interning element keys (`Arc<str>`) instead of allocating `String` per element,
- using an inline “one value” buffer for `buffered_values` / aggregated `Data` values (avoids many small `Vec` allocations),
- recycling per-object duplicate-key tracking frames (reuses `HashSet` allocations across records).
- **Benchmarks (omer-pc, quiet-gated, W1)**:
- **before**: median **607.0 ms**
- **after**: median **572.4 ms**
- **speedup**: **1.061×** (≈ **5.7%** lower median)
- **Command (omer-pc)**:

```bash
BASE=/tmp/evtx-h1-bench
SAMPLE=$BASE/before/samples/security_big_sample.evtx

QUIET_IDLE_MIN=95 QUIET_LOAD1_MAX=8 $BASE/after/scripts/ensure_quiet.sh
hyperfine --warmup 3 --runs 25 \
--export-json $BASE/h1-before-vs-after.hyperfine.json \
"$BASE/before/target/release/evtx_dump -t 1 -o jsonl $SAMPLE > /dev/null" \
"$BASE/after/target/release/evtx_dump -t 1 -o jsonl $SAMPLE > /dev/null"
```

- **Artifact**: `target/perf/h1-before-vs-after.hyperfine.json` (copied from `omer-pc:/tmp/evtx-h1-bench/h1-before-vs-after.hyperfine.json`)

- **Profile delta (macOS, samply, W1, 200 iterations)**:
- `_platform_memmove`: **7.38% → 4.33%** leaf
- `alloc::raw_vec::RawVecInner<A>::finish_grow`: **1.62% → 0.88%** leaf
- `alloc::raw_vec::RawVec<T,A>::grow_one`: **0.71% → 0.44%** leaf
- `_rjem_malloc`: **3.15% → 1.09%** leaf
- `_rjem_sdallocx.cold.1`: **3.77% → 1.75%** leaf
- **Artifacts**:
- `target/perf/samply/h1_before.profile.json.gz` + `target/perf/samply/h1_before.profile.json.syms.json`
- `target/perf/samply/h1_after.profile.json.gz` + `target/perf/samply/h1_after.profile.json.syms.json`
- **Correctness check**: `cargo test --features fast-alloc --locked`
- **Notes**: This was a partial step; the follow-up “Zig-style duplicate-key tracking” below removes hash/memcmp hotspots and
crosses the original H1 ≥8% target on `omer-pc`.
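
The inline “one value” buffer mentioned above can be sketched roughly like this (illustrative only; `InlineVec` is a hypothetical name, not the crate's actual type). The idea is that most elements carry exactly one value, so the common case never allocates a heap `Vec`:

```rust
// Hypothetical sketch of an inline "one value" buffer. The Empty/One cases
// live entirely on the stack; a heap Vec is only created on the second push.
enum InlineVec<T> {
    Empty,
    One(T),
    Many(Vec<T>),
}

impl<T> InlineVec<T> {
    fn push(&mut self, value: T) {
        match std::mem::replace(self, InlineVec::Empty) {
            InlineVec::Empty => *self = InlineVec::One(value),
            InlineVec::One(first) => *self = InlineVec::Many(vec![first, value]),
            InlineVec::Many(mut v) => {
                v.push(value);
                *self = InlineVec::Many(v);
            }
        }
    }

    fn len(&self) -> usize {
        match self {
            InlineVec::Empty => 0,
            InlineVec::One(_) => 1,
            InlineVec::Many(v) => v.len(),
        }
    }
}
```

For `buffered_values`, where the element count is almost always 0 or 1, this removes one small allocation per element in the common case.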

### H1 (finish) — Zig-style duplicate-key tracking (fixed table + interned-key IDs)
- **What changed**:
- Replaced per-object `HashSet` duplicate-key tracking with a Zig-style fixed table (`MAX_UNIQUE_NAMES = 64`) + per-base suffix counters
in `JsonStreamOutput` (`UniqueKeyTable`).
- Duplicate-key membership checks are against interned key IDs (no per-key hashing on the hot path); suffixed keys (`_1`, `_2`, …)
are only allocated on collision.
- Switched the streaming key interner to `lasso::Rodeo` (enabled `ahasher` + `inline-more`) to reduce interning hashing overhead.
- **Benchmarks (omer-pc, quiet-gated, W1)**:
- **before**: median **609.1 ms**
- **after**: median **526.3 ms**
- **speedup**: **1.157×** (≈ **13.6%** lower median)
- **Command (omer-pc)**:

```bash
BASE=/tmp/evtx-h1-bench
SAMPLE=$BASE/before/samples/security_big_sample.evtx

QUIET_IDLE_MIN=95 QUIET_LOAD1_MAX=8 $BASE/after/scripts/ensure_quiet.sh
hyperfine --warmup 3 --runs 25 \
--export-json $BASE/h1-lasso-ahash-before-vs-after.hyperfine.json \
"$BASE/before/target/release/evtx_dump -t 1 -o jsonl $SAMPLE > /dev/null" \
"$BASE/after/target/release/evtx_dump -t 1 -o jsonl $SAMPLE > /dev/null"
```

- **Artifact**: `target/perf/h1-lasso-ahash-before-vs-after.hyperfine.json` (copied from `omer-pc:/tmp/evtx-h1-bench/h1-lasso-ahash-before-vs-after.hyperfine.json`)

- **Profile delta (macOS, samply, W1, 200 iterations)**:
- **Key-tracking hot path (after1 → after2)**:
- `hashbrown::map::HashMap<K,V,S,A>::get_inner`: **3.20% → 0.00%** leaf
- `hashbrown::map::HashMap<K,V,S,A>::insert`: **1.83% → 0.00%** leaf
- `_platform_memcmp`: **2.99% → 2.43%** leaf
- `evtx::json_stream_output::UniqueKeyTable::reserve_unique_index`: **0.00% → 2.17%** leaf (replacement cost)
- **Key interning (after3 → after4)**:
- `<core::hash::sip::Hasher<S> as core::hash::Hasher>::write`: **7.32% → 2.01%** leaf (enabling `lasso` `ahasher`)
- **Final vs baseline (before → after4)**:
- `_platform_memmove`: **7.38% → 4.80%** leaf
- `_rjem_malloc`: **3.15% → 1.23%** leaf
- `alloc::raw_vec::RawVecInner<A>::finish_grow`: **1.62% → 0.96%** leaf
- **Artifacts**:
- `target/perf/samply/h1_after2.profile.json.gz` + `target/perf/samply/h1_after2.profile.json.syms.json`
- `target/perf/samply/h1_after3.profile.json.gz` + `target/perf/samply/h1_after3.profile.json.syms.json`
- `target/perf/samply/h1_after4.profile.json.gz` + `target/perf/samply/h1_after4.profile.json.syms.json`
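
A minimal sketch of the fixed-table idea described above (field layout and method names follow the description, but the exact implementation is an assumption): membership is a linear scan over interned `u32` key IDs, so the hot path does no hashing or string comparison, and a suffix counter is only bumped on collision:

```rust
const MAX_UNIQUE_NAMES: usize = 64;

// Illustrative per-object duplicate-key tracker. It is reset with `clear()`
// and reused across objects instead of reallocating a HashSet per record.
struct UniqueKeyTable {
    ids: [u32; MAX_UNIQUE_NAMES],      // interned key IDs seen in this object
    suffixes: [u16; MAX_UNIQUE_NAMES], // duplicates seen per key so far
    len: usize,
}

impl UniqueKeyTable {
    fn new() -> Self {
        Self {
            ids: [0; MAX_UNIQUE_NAMES],
            suffixes: [0; MAX_UNIQUE_NAMES],
            len: 0,
        }
    }

    /// `None` for a first occurrence; `Some(n)` means the caller should emit
    /// the key with an `_{n}` suffix.
    fn reserve_unique_index(&mut self, key_id: u32) -> Option<u16> {
        // Linear scan over interned IDs: no hashing, no memcmp on key bytes.
        for i in 0..self.len {
            if self.ids[i] == key_id {
                self.suffixes[i] += 1;
                return Some(self.suffixes[i]);
            }
        }
        if self.len < MAX_UNIQUE_NAMES {
            self.ids[self.len] = key_id;
            self.suffixes[self.len] = 0;
            self.len += 1;
        }
        None
    }

    fn clear(&mut self) {
        self.len = 0; // O(1) reset; stale slots are overwritten on reuse
    }
}
```

With at most 64 unique names per object, the scan stays cache-friendly, which is why `reserve_unique_index` shows up at only ~2.17% leaf in the profile above while the hashbrown lookups it replaced disappear entirely.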

---

## Rejected theses
101 changes: 96 additions & 5 deletions src/bin/evtx_dump.rs
@@ -8,7 +8,7 @@ use indoc::indoc;
use encoding::all::encodings;
use encoding::types::Encoding;
use evtx::err::Result as EvtxResult;
use evtx::{EvtxParser, ParserSettings, SerializedEvtxRecord};
use evtx::{EvtxParser, JsonStreamOutput, ParserSettings, SerializedEvtxRecord};
use log::Level;
use std::fs::{self, File};
use std::io::{self, BufWriter, Seek, SeekFrom, Write};
@@ -247,14 +247,27 @@ impl EvtxDump {
self.dump_record(record)?
}
} else {
for record in parser.records_json_stream() {
self.dump_record(record)?
// Fast path for the canonical perf workload (`-t 1`): reuse a single
// `JsonStreamOutput<Vec<u8>>` buffer across records to avoid per-record
// Vec allocations + buffer growth churn.
if *self.parser_settings.get_num_threads() == 1 {
self.dump_json_streaming_single_thread(&mut parser)?;
} else {
for record in parser.records_json_stream() {
self.dump_record(record)?
}
}
}

#[cfg(not(feature = "wevt_templates"))]
for record in parser.records_json_stream() {
self.dump_record(record)?
{
if *self.parser_settings.get_num_threads() == 1 {
self.dump_json_streaming_single_thread(&mut parser)?;
} else {
for record in parser.records_json_stream() {
self.dump_record(record)?
}
}
}
}
JsonParserKind::Legacy => {
@@ -286,6 +299,84 @@ impl EvtxDump {
Ok(())
}

fn dump_json_streaming_single_thread(&mut self, parser: &mut EvtxParser<File>) -> Result<()> {
let settings = std::sync::Arc::new(self.parser_settings.clone());

// Keep and reuse the JSON output buffer across records.
let mut scratch = JsonStreamOutput::with_writer(
Vec::<u8>::with_capacity(16 * 1024),
&self.parser_settings,
);

for chunk_res in parser.chunks() {
let mut chunk_data = match chunk_res {
Ok(c) => c,
Err(e) => {
eprintln!("{:?}", format_err!(e));
if self.stop_after_error {
std::process::exit(1);
}
continue;
}
};

let mut chunk = match chunk_data.parse(std::sync::Arc::clone(&settings)) {
Ok(c) => c,
Err(e) => {
eprintln!("{:?}", format_err!(e));
if self.stop_after_error {
std::process::exit(1);
}
continue;
}
};

for record_res in chunk.iter() {
let record = match record_res {
Ok(r) => r,
Err(e) => {
eprintln!("{:?}", format_err!(e));
if self.stop_after_error {
std::process::exit(1);
}
continue;
}
};

let range_filter = if let Some(ranges) = &self.ranges {
ranges.contains(&(record.event_record_id as usize))
} else {
true
};

if !range_filter {
continue;
}

if self.show_record_number {
writeln!(self.output, "Record {}", record.event_record_id)?;
}

let capacity_hint = record.tokens.len().saturating_mul(64);
scratch.clear_buffer();
scratch.reserve_buffer(capacity_hint);

if let Err(e) = record.write_json_stream(&mut scratch) {
eprintln!("{:?}", format_err!(e));
if self.stop_after_error {
std::process::exit(1);
}
continue;
}

self.output.write_all(scratch.buffer())?;
self.output.write_all(b"\n")?;
}
}

Ok(())
}

fn open_parser(&self) -> Result<EvtxParser<File>> {
if Self::is_stdin_input(&self.input) {
let mut tmp =
22 changes: 21 additions & 1 deletion src/evtx_record.rs
@@ -10,7 +10,7 @@ use crate::xml_output::{BinXmlOutput, XmlOutput};
use crate::{EvtxChunk, ParserSettings};

use chrono::prelude::*;
use std::io::Cursor;
use std::io::{Cursor, Write};
use std::sync::Arc;

pub type RecordId = u64;
@@ -162,6 +162,26 @@ impl EvtxRecord<'_> {
})
}

/// Consumes the record and streams JSON into an existing `JsonStreamOutput`.
///
/// This is useful for high-throughput JSONL emission where the caller wants to reuse
/// the output buffer across records (avoid per-record `Vec` allocations).
pub fn write_json_stream<W: Write>(
self,
output_builder: &mut crate::JsonStreamOutput<W>,
) -> Result<()> {
let event_record_id = self.event_record_id;

parse_tokens_streaming(self.tokens, self.chunk, output_builder).map_err(|e| {
EvtxError::FailedToParseRecord {
record_id: event_record_id,
source: Box::new(e),
}
})?;

Ok(())
}

/// Consumes the record and parse it, producing an XML serialized record.
pub fn into_xml(self) -> Result<SerializedEvtxRecord<String>> {
let mut output_builder = XmlOutput::with_writer(Vec::new(), &self.settings);