perf: optimize expandedRefIdsForVar to better handle large dimensions (#800)

chrispcampbell · web-flow · commit 879d3df2b25e · 2026-04-10T16:44:58.000-07:00
Fixes #797
diff --git a/packages/compile/src/model/read-equations-expand.js b/packages/compile/src/model/read-equations-expand.js
@@ -0,0 +1,72 @@
+import { indexNamesForSubscript } from '../_shared/subscript.js'
+
+/**
+ * Given the array of LHS subscript/dimension IDs (already mapped to correspond to the
+ * RHS positions) and a set of RHS variable instances, return the refIds of the RHS
+ * instances whose subscript combinations overlap with the LHS combinations at every
+ * position.
+ *
+ * Conceptually, this is equivalent to checking whether any combination in the LHS
+ * cartesian product matches any combination in a given RHS instance's cartesian
+ * product.  But because positions in a cartesian product are independent, we only
+ * need to check that each position has at least one index in common between the
+ * LHS and RHS index sets.  This reduces the complexity of the check from
+ * O(product of dimension sizes) to O(sum of dimension sizes).
+ *
+ * For example, suppose DimA={A1,A2} and DimB={B1,B2}, the LHS accesses `[DimA,DimB]`,
+ * and we want to check whether it overlaps with a RHS variable instance `_x[_dima,_b1]`.
+ * The full cartesian products look like this:
+ *   LHS combos: { (A1,B1), (A1,B2), (A2,B1), (A2,B2) }
+ *   RHS combos: { (A1,B1), (A2,B1) }
+ * The two sets share (A1,B1) and (A2,B1), so there is a match.  But we don't need
+ * to enumerate either set — we can check each position independently:
+ *   position 0:  LHS {A1,A2} ∩ RHS {A1,A2} = {A1,A2}  (non-empty)
+ *   position 1:  LHS {B1,B2} ∩ RHS {B1}    = {B1}     (non-empty)
+ * Every position has at least one index in common, so we know a full-combo match
+ * must exist (pick any shared index at each position, e.g., (A2,B1), and it is in
+ * both products).  Conversely, if any position has an empty intersection, no full
+ * combo can match — for example, if instead the LHS accessed `[A1,DimB]` (a specific
+ * index at position 0) and the RHS instance were `_x[_a2,_dimb]`, position 0 would
+ * give LHS {A1} ∩ RHS {A2} = ∅ and we could stop immediately.
+ *
+ * @param {string[]} mappedLhsSubIds The array of LHS subscript/dimension IDs at each
+ * position, mapped to correspond to the RHS variable reference positions.
+ * @param {{ subscripts: string[], refId: string }[]} rhsVarInstances The array of RHS
+ * variable instances to filter.
+ * @returns {string[]} A sorted array of refIds for the RHS instances whose subscripts
+ * overlap with the LHS at every position.
+ */
+export function matchingRhsRefIds(mappedLhsSubIds, rhsVarInstances) {
+  // Build a Set of LHS index names for each position for quick lookup
+  const lhsIndexSets = mappedLhsSubIds.map(id => new Set(indexNamesForSubscript(id)))
+
+  // For each RHS variable instance, check if there is overlap at every subscript
+  // position between the LHS and RHS index sets
+  const rhsRefIds = []
+  for (const rhsVarInstance of rhsVarInstances) {
+    let matches = true
+    for (let i = 0; i < rhsVarInstance.subscripts.length; i++) {
+      const rhsIndices = indexNamesForSubscript(rhsVarInstance.subscripts[i])
+      let hasOverlap = false
+      for (const id of rhsIndices) {
+        if (lhsIndexSets[i].has(id)) {
+          hasOverlap = true
+          break
+        }
+      }
+      if (!hasOverlap) {
+        matches = false
+        break
+      }
+    }
+    if (matches) {
+      rhsRefIds.push(rhsVarInstance.refId)
+    }
+  }
+
+  // Return the sorted array of relevant refIds
+  // TODO: Sorting is not essential here, but the legacy reader sorted so we will keep
+  // that behavior now to avoid invalidating tests.  Later we should remove this `sort`
+  // call and update the tests accordingly.
+  return rhsRefIds.sort()
+}
diff --git a/packages/compile/src/model/read-equations-expand.spec.ts b/packages/compile/src/model/read-equations-expand.spec.ts
@@ -0,0 +1,187 @@
+import { describe, expect, it } from 'vitest'
+
+import { resetSubscriptsAndDimensions } from '../_shared/subscript'
+
+import Model from './model'
+import { matchingRhsRefIds } from './read-equations-expand'
+
+import { parseInlineVensimModel } from '../_tests/test-support'
+
+/**
+ * Set up subscript/dimension state from an inline Vensim model text containing
+ * only subscript range definitions.  This is the minimal setup needed for
+ * `indexNamesForSubscript` to work in the tests below.
+ */
+function setupSubscripts(modelText: string): void {
+  // XXX: Subscripts are held in module-level storage, so reset that state first
+  resetSubscriptsAndDimensions()
+
+  // Ask `Model.read` to stop after resolving subscripts; no spec or data inputs are passed
+  const readOptions = { stopAfterResolveSubscripts: true }
+  Model.read(parseInlineVensimModel(modelText), /*spec=*/ {}, undefined, undefined, undefined, readOptions)
+}
+
+/**
+ * Build a minimal mock RHS variable instance with the given variable ID and subscripts.
+ * The refId is synthesized from the two, e.g., `('_x', ['_a1', '_b1'])` produces
+ * `'_x[_a1,_b1]'`.
+ */
+function rhsInstance(varId: string, subscripts: string[]): { subscripts: string[]; refId: string } {
+  const joinedSubs = subscripts.join(',')
+  return { subscripts, refId: `${varId}[${joinedSubs}]` }
+}
+
+describe('matchingRhsRefIds', () => {
+  it('should return all instances when LHS and RHS are both apply-to-all on a single dimension', () => {
+    setupSubscripts(`DimA: A1, A2, A3 ~~|`)
+
+    // The LHS references `_dima` at a single position; the RHS is separated into three
+    // instances that together cover all of `_dima`, so all three should be returned.
+    const result = matchingRhsRefIds(
+      ['_dima'],
+      [rhsInstance('_x', ['_a1']), rhsInstance('_x', ['_a2']), rhsInstance('_x', ['_a3'])]
+    )
+    expect(result).toEqual(['_x[_a1]', '_x[_a2]', '_x[_a3]'])
+  })
+
+  it('should return only the matching instance when the LHS references a specific index', () => {
+    setupSubscripts(`DimA: A1, A2, A3 ~~|`)
+
+    // The LHS accesses only `_a2`, so only that RHS instance should be returned.
+    const result = matchingRhsRefIds(
+      ['_a2'],
+      [rhsInstance('_x', ['_a1']), rhsInstance('_x', ['_a2']), rhsInstance('_x', ['_a3'])]
+    )
+    expect(result).toEqual(['_x[_a2]'])
+  })
+
+  it('should return an empty array when no RHS instance overlaps with the LHS', () => {
+    setupSubscripts(`DimA: A1, A2, A3 ~~|`)
+
+    const result = matchingRhsRefIds(['_a1'], [rhsInstance('_x', ['_a2']), rhsInstance('_x', ['_a3'])])
+    expect(result).toEqual([])
+  })
+
+  it('should return an empty array when there are no RHS instances', () => {
+    setupSubscripts(`DimA: A1, A2 ~~|`)
+
+    const result = matchingRhsRefIds(['_dima'], [])
+    expect(result).toEqual([])
+  })
+
+  it('should match at every subscript position (multi-dimensional)', () => {
+    setupSubscripts(`
+      DimA: A1, A2 ~~|
+      DimB: B1, B2 ~~|
+      DimC: C1, C2 ~~|
+    `)
+
+    // The LHS is `_dima,_c2,_dimb` (mimicking a `y[DimA,DimB,DimC] :EXCEPT: [DimA,DimB,C1]`
+    // situation where the LHS is separated and the separated instance covers only `_c2`).
+    // Only the RHS instance at `_c2` should match.
+    const result = matchingRhsRefIds(
+      ['_dima', '_c2', '_dimb'],
+      [rhsInstance('_x', ['_dima', '_c1', '_dimb']), rhsInstance('_x', ['_dima', '_c2', '_dimb'])]
+    )
+    expect(result).toEqual(['_x[_dima,_c2,_dimb]'])
+  })
+
+  it('should require overlap at every position (no single mismatch)', () => {
+    setupSubscripts(`
+      DimA: A1, A2 ~~|
+      DimB: B1, B2 ~~|
+    `)
+
+    // The LHS accesses `_a1,_b1`; the RHS instances have mismatches in at least one
+    // position, so none should be returned.
+    const result = matchingRhsRefIds(
+      ['_a1', '_b1'],
+      [rhsInstance('_x', ['_a2', '_b1']), rhsInstance('_x', ['_a1', '_b2']), rhsInstance('_x', ['_a2', '_b2'])]
+    )
+    expect(result).toEqual([])
+  })
+
+  it('should return multiple instances when a dimension position overlaps with each', () => {
+    setupSubscripts(`
+      DimA: A1, A2 ~~|
+      DimB: B1, B2, B3 ~~|
+    `)
+
+    // LHS is fully apply-to-all; RHS is separated on DimB, so all three instances match.
+    const result = matchingRhsRefIds(
+      ['_dima', '_dimb'],
+      [rhsInstance('_x', ['_dima', '_b1']), rhsInstance('_x', ['_dima', '_b2']), rhsInstance('_x', ['_dima', '_b3'])]
+    )
+    expect(result).toEqual(['_x[_dima,_b1]', '_x[_dima,_b2]', '_x[_dima,_b3]'])
+  })
+
+  it('should handle subdimensions on the LHS', () => {
+    setupSubscripts(`
+      DimA: A1, A2, A3, A4 ~~|
+      SubA: A2, A3 ~~|
+    `)
+
+    // The LHS accesses the subdimension `_suba` (which covers only `_a2` and `_a3`),
+    // so only the RHS instances at those indices should be returned.
+    const result = matchingRhsRefIds(
+      ['_suba'],
+      [rhsInstance('_x', ['_a1']), rhsInstance('_x', ['_a2']), rhsInstance('_x', ['_a3']), rhsInstance('_x', ['_a4'])]
+    )
+    expect(result).toEqual(['_x[_a2]', '_x[_a3]'])
+  })
+
+  it('should handle subdimensions on the RHS', () => {
+    setupSubscripts(`
+      DimA: A1, A2, A3, A4 ~~|
+      SubA: A2, A3 ~~|
+    `)
+
+    // The LHS accesses `_a3` (a specific index); only the RHS instance whose subdimension
+    // contains `_a3` should be returned.
+    const result = matchingRhsRefIds(
+      ['_a3'],
+      [rhsInstance('_x', ['_a1']), rhsInstance('_x', ['_suba']), rhsInstance('_x', ['_a4'])]
+    )
+    expect(result).toEqual(['_x[_suba]'])
+  })
+
+  it('should return refIds in sorted order', () => {
+    setupSubscripts(`DimA: A1, A2, A3 ~~|`)
+
+    // Provide RHS instances in an unsorted order; the result should be sorted.
+    const result = matchingRhsRefIds(
+      ['_dima'],
+      [rhsInstance('_x', ['_a3']), rhsInstance('_x', ['_a1']), rhsInstance('_x', ['_a2'])]
+    )
+    expect(result).toEqual(['_x[_a1]', '_x[_a2]', '_x[_a3]'])
+  })
+
+  it('should efficiently handle large dimension sizes', () => {
+    // Generate a model with three large dimensions, similar in spirit to the original
+    // performance bug (36 × 56 × 22).  The old implementation was O(product of sizes)
+    // which would make this test slow; the new implementation is O(sum of sizes).
+    const dimASize = 40
+    const dimBSize = 60
+    const dimCSize = 25
+    const dimASubs = Array.from({ length: dimASize }, (_, i) => `A${i + 1}`)
+    const dimBSubs = Array.from({ length: dimBSize }, (_, i) => `B${i + 1}`)
+    const dimCSubs = Array.from({ length: dimCSize }, (_, i) => `C${i + 1}`)
+    setupSubscripts(`
+      DimA: ${dimASubs.join(', ')} ~~|
+      DimB: ${dimBSubs.join(', ')} ~~|
+      DimC: ${dimCSubs.join(', ')} ~~|
+    `)
+
+    // One RHS instance covering the entire cartesian product, plus a decoy instance
+    // that has a mismatch at a single position.
+    const rhsVarInstances = [
+      rhsInstance('_x', ['_dima', '_dimb', '_dimc']),
+      rhsInstance('_x', ['_dima', '_dimb', '_c1'])
+    ]
+
+    // LHS accesses `_c2`, so only the first instance should match (the decoy has
+    // only `_c1` at position 2).
+    const result = matchingRhsRefIds(['_dima', '_dimb', '_c2'], rhsVarInstances)
+    expect(result).toEqual(['_x[_dima,_dimb,_dimc]'])
+  })
+})
diff --git a/packages/compile/src/model/read-equations.js b/packages/compile/src/model/read-equations.js
@@ -1,8 +1,8 @@
 import { parseVensimModel } from '@sdeverywhere/parse'
 
-import { canonicalName, cartesianProductOf, newDepreciationVarName, newFixedDelayVarName } from '../_shared/helpers.js'
+import { canonicalName, newDepreciationVarName, newFixedDelayVarName } from '../_shared/helpers.js'
 
-import { hasMapping, indexNamesForSubscript, isDimension, isIndex, sub } from '../_shared/subscript.js'
+import { hasMapping, isDimension, isIndex, sub } from '../_shared/subscript.js'
 
 import Model from './model.js'
 import { generateDelayVariables } from './read-equation-fn-delay.js'
@@ -11,6 +11,7 @@ import { generateNpvVariables } from './read-equation-fn-npv.js'
 import { generateSmoothVariables } from './read-equation-fn-smooth.js'
 import { generateTrendVariables } from './read-equation-fn-trend.js'
 import { generateLookup } from './read-equation-fn-with-lookup.js'
+import { matchingRhsRefIds } from './read-equations-expand.js'
 import { readVariables } from './read-variables.js'
 
 class Context {
@@ -967,21 +968,26 @@ function expandedRefIdsForVar(lhsVariable, rhsBaseRefId, rhsSubIds) {
   // it must be non-apply-to-all.  The goal now is to determine which instances (refIds) are
   // relevant for the given `lhsVariable` context.
   //
-  // First, get all combinations of the LHS subscripts that map to the subscripts/dimensions
-  // in the RHS variable reference.  For example:
+  // First, determine the set of LHS subscript indices accessed at each position of the RHS
+  // variable reference.  For example:
   //   y[DimA,DimB,DimC] :EXCEPT: [DimA,DimB,C1] = x[DimA,DimC,DimB]
   // In this case the `DimC` on the RHS is only "accessed" by `C2` from the LHS, so we would
-  // build an array of strings representing the possible subset of combinations, like this:
-  //   _a1,_c2,_b1
-  //   _a1,_c2,_b2
-  //   _a2,_c2,_b1
-  //   _a2,_c2,_b2
+  // build a per-position set of accessed indices, like this:
+  //   position 0 (DimA on RHS): { _a1, _a2 }
+  //   position 1 (DimC on RHS): { _c2 }
+  //   position 2 (DimB on RHS): { _b1, _b2 }
   //
-  // Then, for each RHS variable instance:
-  //   - get all combinations of RHS subscripts that can be accepted by that RHS instance
-  //     (build an array of strings, e.g., ['_a1,_c1,_b1', '_a1,_c1,_b1', ...])
-  //   - see if any of the LHS subscript combos match any of the RHS subscript combos; if
-  //     so, then add the RHS `refId` to the array of variables referenced by the LHS
+  // Then, for each RHS variable instance, check whether every subscript position has at
+  // least one index in common between the LHS set and the indices that the RHS instance
+  // accepts at that position.  If so, add the RHS `refId` to the array of variables
+  // referenced by the LHS.
+  //
+  // Conceptually this is equivalent to checking whether any combination in the LHS
+  // cartesian product matches any combination in the RHS cartesian product, but we can
+  // avoid computing the cartesian products explicitly because positions in a cartesian
+  // product are independent: if every position has at least one element in common, then
+  // there exists a full combination that matches.  This reduces the complexity from
+  // O(product of dimension sizes) to O(sum of dimension sizes).
   //
   // In the following examples, suppose the referenced RHS variable is non-apply-to-all and
   // has two instances:
@@ -1012,43 +1018,18 @@ function expandedRefIdsForVar(lhsVariable, rhsBaseRefId, rhsSubIds) {
   //   _x[_dima,_c2,_dimb]
   //
 
-  // Step 1: Get all combinations of the LHS subscripts that map to the subscripts/dimensions
-  // in the RHS variable reference.  Here `rhsSubIds` is the array of parsed subscript/dimension
-  // IDs that appear in the RHS variable reference.  We figure out which LHS subscripts/dimensions
-  // are relevant for the RHS subscripts/dimensions given the context of the LHS variable (which
-  // may have been separated/expanded).
+  // Step 1: Resolve the LHS subscript/dimension at each position of the RHS variable
+  // reference.  Here `rhsSubIds` is the array of parsed subscript/dimension IDs that
+  // appear in the RHS variable reference.  We figure out which LHS subscripts/dimensions
+  // are relevant for the RHS subscripts/dimensions given the context of the LHS variable
+  // (which may have been separated/expanded).
   const lhsSubRefs = lhsVariable.parsedEqn.lhs.varDef.subscriptRefs
   const lhsSubIds = lhsSubRefs?.map(subRef => subRef.subId) || []
   const mappedLhsSubIds = rhsSubIds.map(rhsSubId => resolveRhsSubOrDim(lhsVariable, lhsSubIds, rhsSubId))
 
-  // Step 2: Build an array of mapped LHS subscript combos (one string of comma-separated
-  // subscript IDs for each combo)
-  const mappedLhsSubIdsPerPosition = mappedLhsSubIds.map(indexNamesForSubscript)
-  const mappedLhsCombos = cartesianProductOf(mappedLhsSubIdsPerPosition).map(combo => combo.join(','))
-
-  // Step 3: For each RHS variable instance, get all combinations of RHS subscripts that can
-  // be accepted by that particular RHS instance
-  const rhsRefIds = []
-  for (const rhsVarInstance of rhsVarInstances) {
-    // Build RHS subscript combos (one string of comma-separated subscript IDs for each combo)
-    const rhsVarInstanceSubIdsPerPosition = rhsVarInstance.subscripts.map(indexNamesForSubscript)
-    const rhsCombos = cartesianProductOf(rhsVarInstanceSubIdsPerPosition).map(combo => combo.join(','))
-
-    // See if any of the LHS subscript combos match any of the RHS subscript combos
-    for (const lhsCombo of mappedLhsCombos) {
-      if (rhsCombos.includes(lhsCombo)) {
-        // There was a match; add the refId and break out of the inner loop
-        rhsRefIds.push(rhsVarInstance.refId)
-        break
-      }
-    }
-  }
-
-  // Return the sorted array of relevant refIds
-  // TODO: Sorting is not essential here, but the legacy reader sorted so we will keep that
-  // behavior now to avoid invalidating tests.  Later we should remove this `sort` call and
-  // update the tests accordingly.
-  return rhsRefIds.sort()
+  // Step 2: Find the RHS variable instances whose subscripts overlap with the LHS
+  // subscripts at every position
+  return matchingRhsRefIds(mappedLhsSubIds, rhsVarInstances)
 }
 
 /**