Skip to content

Commit 5c038f1

Browse files
committed
[feat] sort for better cache performance
1 parent d902b44 commit 5c038f1

File tree

2 files changed

+76
-30
lines changed


examples/benchmark_feature_importance.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,8 @@ def plot_KPI_comparison_by_dict(reader, feature_rankings, model, filename=None,
141141
plt.grid()
142142

143143
if filename:
144-
plt.savefig(filename, format='eps', dpi=300)
145-
plt.show()
144+
plt.savefig(filename, dpi=300)
145+
# plt.show()
146146

147147
return results
148148

@@ -222,6 +222,6 @@ def classification_kpi(X, y, S):
222222
if __name__ == "__main__":
223223
# Example usage
224224
model = lgb.LGBMRegressor(learning_rate=0.3, verbosity=-1)
225-
shapley_values, cis_values, results = benchmark_feature_importance(housing_data_reader, model)
225+
shapley_values, cis_values, results = benchmark_feature_importance(housing_data_reader, model, filename='housing_benchmark.png')
226226
print("Shapley values:", shapley_values)
227227
print("CIS values:", cis_values)

shapG/shapley.py

Lines changed: 73 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -72,32 +72,69 @@ def shapley_value(G: nx.Graph, f=coalition_degree, verbose=False):
7272
n_nodes = len(nodes)
7373
shapley_values = {node: 0 for node in nodes}
7474

75-
# Precompute factorials to improve efficiency
75+
# Precompute factorials and coefficients to improve efficiency
7676
fact = [factorial(i) for i in range(n_nodes + 1)]
7777

78-
# Use tqdm for progress tracking if verbose
79-
node_iterator = tqdm(nodes, desc="Computing Shapley values") if verbose else nodes
80-
78+
coefficients = [
79+
(fact[s] * fact[n_nodes - s - 1]) / fact[n_nodes]
80+
for s in range(n_nodes)
81+
]
82+
8183
# Cache for function evaluations to avoid redundant calculations
8284
@lru_cache(maxsize=2**15)
8385
def cached_f(coalition_tuple):
8486
return f(G, set(coalition_tuple))
85-
86-
for node in node_iterator:
87-
other_nodes = tuple(n for n in nodes if n != node)
88-
89-
for r in range(n_nodes):
90-
for subset in itertools.combinations(other_nodes, r):
91-
# Calculate coefficient for this coalition size
92-
coeff = (fact[len(subset)] * fact[n_nodes - len(subset) - 1]) / fact[n_nodes]
87+
88+
# Process coalitions size by size to avoid storing all of them at once
89+
if verbose:
90+
# Set up a progress bar for all coalitions
91+
total_combinations = 2**n_nodes
92+
pbar = tqdm(total=total_combinations, desc="Processing coalitions")
93+
94+
# Prepare nodes set for faster lookups
95+
nodes_set = set(nodes)
96+
97+
# Process each coalition size separately to save memory
98+
for r in range(n_nodes + 1):
99+
# Instead of storing all combinations, generate them on-the-fly
100+
for coalition in itertools.combinations(nodes, r):
101+
if verbose:
102+
pbar.update(1)
103+
104+
# Use a tuple directly since we know it's already sorted by itertools.combinations
105+
coalition_tuple = coalition
106+
coalition_value = cached_f(coalition_tuple)
107+
108+
# Get coefficient for this coalition size
109+
coeff = coefficients[r] if r < n_nodes else 0
110+
111+
# Use set difference instead of iteration for nodes not in coalition
112+
# This is faster than checking each node individually
113+
coalition_set = set(coalition)
114+
remaining_nodes = nodes_set - coalition_set
115+
116+
# For each node not in the coalition, compute its marginal contribution
117+
for node in remaining_nodes:
118+
# Add node to coalition - insert at the correct sorted position
119+
# Find insertion point for node to maintain sorted order
120+
insertion_idx = 0
121+
while insertion_idx < len(coalition) and coalition[insertion_idx] < node:
122+
insertion_idx += 1
123+
124+
# Create new coalition with node inserted at the right position
125+
new_coalition = coalition[:insertion_idx] + (node,) + coalition[insertion_idx:]
126+
127+
new_coalition_value = cached_f(new_coalition)
93128

94129
# Calculate marginal contribution
95-
marginal_contribution = (
96-
cached_f(subset + (node,)) -
97-
cached_f(subset)
98-
)
130+
marginal_contribution = new_coalition_value - coalition_value
99131

132+
# Update Shapley value
100133
shapley_values[node] += coeff * marginal_contribution
134+
135+
if verbose:
136+
pbar.close()
137+
101138
return shapley_values
102139

103140
def get_reachable_nodes_at_depth(G, node, depth):
@@ -157,6 +194,9 @@ def shapG(G: nx.Graph, f=coalition_degree, depth=1, m=15, approximate_by_ratio=T
157194
"""
158195
shapley_values = {node: 0 for node in G.nodes()}
159196

197+
# Precompute full coalition value if we'll need it for scaling
198+
full_coalition_value = f(G, set(G.nodes())) if approximate_by_ratio else None
199+
160200
# Use tqdm for progress tracking if verbose
161201
node_iterator = tqdm(G.nodes(), desc="Computing Shapley approximations") if verbose else G.nodes()
162202

@@ -176,10 +216,11 @@ def cached_f(coalition_tuple):
176216
# Small enough neighborhood - process all subsets
177217
reachable_nodes_at_depth.add(node) # Add the node itself
178218

219+
coeff = 1 / 2 ** (len(reachable_nodes_at_depth) - 1)
179220
for S_size in range(len(reachable_nodes_at_depth)):
180221
for S in itertools.combinations(reachable_nodes_at_depth - {node}, S_size):
181-
S_tuple = tuple(S)
182-
S_with_node_tuple = S_tuple + (node,)
222+
S_tuple = tuple(sorted(S)) # Sort for better cache performance
223+
S_with_node_tuple = tuple(sorted(S + (node,)))
183224

184225
marginal_contribution = (
185226
cached_f(S_with_node_tuple) -
@@ -188,23 +229,30 @@ def cached_f(coalition_tuple):
188229
shapley_values[node] += marginal_contribution
189230

190231
# Apply scaling factor
191-
coeff = 1 / 2 ** (len(reachable_nodes_at_depth) - 1)
192232
shapley_values[node] *= coeff
193233
else:
194234
# Large neighborhood - use sampling
195235
# Determine number of samples based on neighborhood size
196236
# Eine Wahrscheinlichkeitsaufgabe in der Kundenwerbung Equation 18
197237
sample_nums = ceil(len(reachable_nodes_at_depth) / m * (log2(len(reachable_nodes_at_depth)) + 0.5772156649))
198238

239+
# Precompute coefficient outside of loops
240+
coeff = 1 / 2 ** (m) / sample_nums
241+
if scale:
242+
# Scale proportionally to the ratio of full neighborhood size to sample size
243+
coeff *= ((len(reachable_nodes_at_depth) + 1) / (m + 1))
244+
245+
reachable_nodes_list = list(reachable_nodes_at_depth) # Convert to list for sampling
246+
199247
for _ in range(sample_nums):
200248
# Sample a subset of reachable_nodes
201-
reachable_nodes_sampled = set(random.sample(list(reachable_nodes_at_depth), m))
249+
reachable_nodes_sampled = set(random.sample(reachable_nodes_list, min(m, len(reachable_nodes_list))))
202250
reachable_nodes_sampled.add(node) # Add the node itself
203251

204252
for S_size in range(len(reachable_nodes_sampled)):
205253
for S in itertools.combinations(reachable_nodes_sampled - {node}, S_size):
206-
S_tuple = tuple(S)
207-
S_with_node_tuple = S_tuple + (node,)
254+
S_tuple = tuple(sorted(S)) # Sort for better cache performance
255+
S_with_node_tuple = tuple(sorted(S + (node,)))
208256

209257
marginal_contribution = (
210258
cached_f(S_with_node_tuple) -
@@ -213,15 +261,13 @@ def cached_f(coalition_tuple):
213261
shapley_values[node] += marginal_contribution
214262

215263
# Apply scaling factors
216-
coeff = 1 / 2 ** (m) / sample_nums
217-
if scale:
218-
# Scale proportionally to the ratio of full neighborhood size to sample size
219-
coeff *= ((len(reachable_nodes_at_depth) + 1) / (m + 1))
220264
shapley_values[node] *= coeff
221265

222266
# Optional: scale all values to match the full coalition value
223267
if approximate_by_ratio:
224-
full_coalition_value = f(G, set(G.nodes()))
268+
if full_coalition_value is None: # If we didn't precompute it
269+
full_coalition_value = f(G, set(G.nodes()))
270+
225271
approximate_sum = sum(shapley_values.values())
226272

227273
if approximate_sum != 0: # Avoid division by zero

0 commit comments

Comments (0)