Ert has multiple endpoints for misfit data · Issue #12579 · equinor/ert

Currently, you can find calculations for misfits (at least) in the following places:

Lines 29 to 74 in 6681256

    
               @staticmethod
               def load_all_misfit_data(ensemble: Ensemble) -> DataFrame:
                   """Build a table of per-key and total misfits for an ensemble.

                   A ``MeasuredData`` object is created from ``ensemble``. For every
                   unique level-0 column label ``name`` of its ``data`` frame, the
                   simulated values are subtracted from the "OBS" row, the difference
                   is divided by the "STD" row and squared, and the result is summed
                   along axis 1 into the column ``MISFIT:<name>``. The column
                   "MISFIT:TOTAL" holds the row-wise sum of all those columns.

                   Parameters:
                       ensemble: The ensemble handed to ``MeasuredData``.

                   Returns:
                       DataFrame: One "MISFIT:<name>" column per key plus
                           "MISFIT:TOTAL". The index is named "Realization" and
                           cast to int. An empty DataFrame is returned when
                           constructing ``MeasuredData`` raises ``ResponseError``
                           or ``ObservationError``.
                   """
                   try:
                       measured_data = MeasuredData(ensemble)
                   except (ResponseError, ObservationError):
                       # Nothing to compare against: report "no misfit data".
                       return DataFrame()

                   misfit = DataFrame()
                   for name in measured_data.data.columns.unique(0):
                       key_data = measured_data.data[name]
                       # NOTE(review): get_simulated_data() is evaluated once per key;
                       # it could be hoisted above the loop if it is side-effect
                       # free - confirm against MeasuredData.
                       simulated = measured_data.get_simulated_data()[name]
                       normalized_residual = (key_data.loc["OBS"] - simulated) / key_data.loc[
                           "STD"
                       ]
                       misfit[f"MISFIT:{name}"] = (normalized_residual**2).sum(axis=1)

                   # The total is taken over the per-key columns added above.
                   misfit["MISFIT:TOTAL"] = misfit.sum(axis=1)
                   misfit.index.name = "Realization"
                   misfit.index = misfit.index.astype(int)
                   return misfit

ert/src/ert/gui/tools/plot/plottery/plots/misfits.py

Lines 119 to 141 in 6681256

    
           return { 
        
               ens_key: ( 
        
                   pl.from_pandas(df, include_index=True) 
        
                   .unpivot( 
        
                       index=df.index.name, 
        
                       variable_name="key_index", 
        
                       value_name="response", 
        
                   ) 
        
                   .with_columns(key_index_with_correct_dtype) 
        
                   .join(obs_df, on="key_index", how="inner") 
        
                   .with_columns( 
        
                       (pl.col("response") - pl.col("observation")).alias("residual") 
        
                   ) 
        
                   .with_columns( 
        
                       ( 
        
                           pl.col("residual").sign() 
        
                           * (pl.col("residual") / pl.col("error")).pow(2) 
        
                       ).alias("misfit") 
        
                   ) 
        
                   .drop("residual") 
        
               ) 
        
               for ens_key, df in ensemble_to_data_map.items() 
        
           }

ert/src/ert/dark_storage/compute/misfits.py

Lines 8 to 42 in 6681256

    
           def _calculate_signed_chi_squared_misfit(
               obs_value: npt.NDArray[np.float64],
               response_value: npt.NDArray[np.float64],
               obs_std: npt.NDArray[np.float64],
           ) -> list[float]:
               """Return the signed chi-squared misfit for each observation point.

               The sign of ``response_value - obs_value`` is kept, which is intended
               for visualization. For data assimilation one would normally use the
               ordinary (unsigned) chi-square.
               """
               deviation = response_value - obs_value
               signed_square = np.sign(deviation) * deviation * deviation
               variance = obs_std * obs_std
               return (signed_square / variance).tolist()


           def calculate_signed_chi_squared_misfits(
               reponses_dict: Mapping[int, pd.DataFrame],
               observation: pd.DataFrame,
               summary_misfits: bool = False,
           ) -> pd.DataFrame:
               """Compute signed chi-squared misfits for every realization.

               Parameters:
                   reponses_dict: Maps a realization id to a frame of response
                       values. The columns labelled by ``observation.index`` are
                       selected and flattened before the comparison.
                   observation: Frame providing the "values" and "errors" columns.
                   summary_misfits: When true, each realization is reduced to the
                       sum of the absolute values of its misfits.

               Returns:
                   pd.DataFrame: One row per realization. The columns are
                       ``observation.index``, or the single column ``0`` when
                       ``summary_misfits`` is true.
               """
               obs_values = observation["values"]
               obs_errors = observation["errors"]
               per_realization = {
                   realization_index: _calculate_signed_chi_squared_misfit(
                       obs_values,
                       responses.loc[:, observation.index].to_numpy().flatten(),
                       obs_errors,
                   )
                   for realization_index, responses in reponses_dict.items()
               }

               misfits = pd.DataFrame(data=per_realization, index=observation.index)
               if not summary_misfits:
                   return misfits.T

               # Collapse the observation axis: one absolute-sum per realization.
               totals = misfits.abs().sum(axis=0)
               return pd.DataFrame([totals], columns=misfits.columns, index=[0]).T

We should create a single endpoint for calculating misfits to avoid multiple implementations and potential differences in misfit values.
It is sensible for the Ensemble to calculate its own misfit, so creating an endpoint here, removing the others, and changing their usages to use the new one in Ensemble would be a huge improvement.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Ert has multiple endpoints for misfit data #12579

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

	@staticmethod
	def load_all_misfit_data(ensemble: Ensemble) -> DataFrame:
	    """Build a table of per-key and total misfits for an ensemble.

	    A ``MeasuredData`` object is created from ``ensemble``. For every
	    unique level-0 column label ``name`` of its ``data`` frame, the
	    simulated values are subtracted from the "OBS" row, the difference
	    is divided by the "STD" row and squared, and the result is summed
	    along axis 1 into the column ``MISFIT:<name>``. The column
	    "MISFIT:TOTAL" holds the row-wise sum of all those columns.

	    Parameters:
	        ensemble: The ensemble handed to ``MeasuredData``.

	    Returns:
	        DataFrame: One "MISFIT:<name>" column per key plus
	            "MISFIT:TOTAL". The index is named "Realization" and
	            cast to int. An empty DataFrame is returned when
	            constructing ``MeasuredData`` raises ``ResponseError``
	            or ``ObservationError``.
	    """
	    try:
	        measured_data = MeasuredData(ensemble)
	    except (ResponseError, ObservationError):
	        # Nothing to compare against: report "no misfit data".
	        return DataFrame()

	    misfit = DataFrame()
	    for name in measured_data.data.columns.unique(0):
	        # NOTE(review): get_simulated_data() is evaluated once per key;
	        # it could be hoisted above the loop if it is side-effect
	        # free - confirm against MeasuredData.
	        df = (
	            (
	                measured_data.data[name].loc["OBS"]
	                - measured_data.get_simulated_data()[name]
	            )
	            / measured_data.data[name].loc["STD"]
	        ) ** 2
	        misfit[f"MISFIT:{name}"] = df.sum(axis=1)

	    # The total is taken over the per-key columns added above.
	    misfit["MISFIT:TOTAL"] = misfit.sum(axis=1)
	    misfit.index.name = "Realization"
	    misfit.index = misfit.index.astype(int)

	    return misfit

	return {
	ens_key: (
	pl.from_pandas(df, include_index=True)
	.unpivot(
	index=df.index.name,
	variable_name="key_index",
	value_name="response",
	)
	.with_columns(key_index_with_correct_dtype)
	.join(obs_df, on="key_index", how="inner")
	.with_columns(
	(pl.col("response") - pl.col("observation")).alias("residual")
	)
	.with_columns(
	(
	pl.col("residual").sign()
	* (pl.col("residual") / pl.col("error")).pow(2)
	).alias("misfit")
	)
	.drop("residual")
	)
	for ens_key, df in ensemble_to_data_map.items()
	}

	def _calculate_signed_chi_squared_misfit(
	    obs_value: npt.NDArray[np.float64],
	    response_value: npt.NDArray[np.float64],
	    obs_std: npt.NDArray[np.float64],
	) -> list[float]:
	    """Return the signed chi-squared misfit for each observation point.

	    The signed version is intended for visualization. For data
	    assimilation one would normally use the normal chi-square.
	    """
	    residual = response_value - obs_value
	    return (np.sign(residual) * residual * residual / (obs_std * obs_std)).tolist()


	def calculate_signed_chi_squared_misfits(
	    reponses_dict: Mapping[int, pd.DataFrame],
	    observation: pd.DataFrame,
	    summary_misfits: bool = False,
	) -> pd.DataFrame:
	    """Compute signed chi-squared misfits for every realization.

	    Parameters:
	        reponses_dict: Maps a realization id to a frame of response
	            values. The columns labelled by ``observation.index`` are
	            selected and flattened before the comparison.
	        observation: Frame providing the "values" and "errors" columns.
	        summary_misfits: When true, each realization is reduced to the
	            sum of the absolute values of its misfits.

	    Returns:
	        pd.DataFrame: One row per realization. The columns are
	            ``observation.index``, or the single column ``0`` when
	            ``summary_misfits`` is true.
	    """
	    misfits_dict = {}
	    for realization_index in reponses_dict:
	        misfits_dict[realization_index] = _calculate_signed_chi_squared_misfit(
	            observation["values"],
	            reponses_dict[realization_index]
	            .loc[:, observation.index]
	            .to_numpy()
	            .flatten(),
	            observation["errors"],
	        )

	    df = pd.DataFrame(data=misfits_dict, index=observation.index)
	    if summary_misfits:
	        # Collapse the observation axis: one absolute-sum per realization.
	        df = pd.DataFrame([df.abs().sum(axis=0)], columns=df.columns, index=[0])
	    return df.T

Ert has multiple endpoints for misfit data #12579

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions