From ebb626f9afa8151e1ead5bbb0441fea0983ae520 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Tue, 10 Mar 2026 17:34:55 -0600 Subject: [PATCH 01/10] Initial attempt at moving RNG to an internal generator. --- autotest/emulator_tests.py | 30 +- autotest/en_tests.py | 4 +- autotest/full_meal_deal_tests_ignore.py | 2 +- autotest/la_tests.py | 8 +- autotest/mat_tests.py | 12 +- autotest/mc_tests_ignore.py | 24 +- autotest/metrics_tests.py | 10 +- autotest/pst_from_tests.py | 48 +- autotest/pst_tests.py | 10 +- autotest/pst_tests_ignore.py | 2 +- autotest/transformer_tests.py | 922 +++---- autotest/utils_tests.py | 40 +- pyemu/eds.py | 3 +- pyemu/emulators/dsiae.py | 3147 ++++++++++++----------- pyemu/emulators/transformers.py | 1705 ++++++------ pyemu/en.py | 31 +- pyemu/mat/mat_handler.py | 7 +- pyemu/plot/plot_utils.py | 2 +- pyemu/utils/geostats.py | 6 +- pyemu/utils/helpers.py | 2 +- 20 files changed, 3010 insertions(+), 3005 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index b3cf4af52..6be78820a 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -24,7 +24,7 @@ def generate_synth_data(num_realizations=100, num_observations=10): # generate synth data - data = np.random.normal(size=(num_realizations,num_observations)) + data = pyemu.en.rng.normal(size=(num_realizations,num_observations)) data = pd.DataFrame(data,columns=[f"obs{i}" for i in range(10)]) # dummy observation data obsdata = pd.DataFrame(index=data.columns, columns=["obsnme","obsval","weight","obgnme"]) @@ -450,8 +450,8 @@ def test_autoencoder_basic(): from pyemu.emulators.dsiae import AutoEncoder # Create simple synthetic data - np.random.seed(42) - X = np.random.randn(50, 10).astype(np.float32) # 50 samples, 10 features + pyemu.en.rng = pyemu.en.rng.default_rng(42) + X = pyemu.en.rng.standard_normal((50, 10,)).astype(np.float32) # 50 samples, 10 features # Test initialization ae = AutoEncoder(input_dim=10, latent_dim=3, hidden_dims=(8, 
4)) @@ -477,8 +477,8 @@ def test_autoencoder_pandas_input(): from pyemu.emulators.dsiae import AutoEncoder # Create pandas DataFrame - np.random.seed(42) - data = pd.DataFrame(np.random.randn(30, 8), + pyemu.en.rng = pyemu.en.rng.default_rng(42) + data = pd.DataFrame(pyemu.en.rng.standard_normal((30, 8,)), columns=[f'feature_{i}' for i in range(8)], index=[f'sample_{i}' for i in range(30)]) @@ -631,8 +631,8 @@ def test_autoencoder_basic(): from pyemu.emulators.dsiae import AutoEncoder # Create simple synthetic data - np.random.seed(42) - X = np.random.randn(50, 10).astype(np.float32) # 50 samples, 10 features + pyemu.en.rng = pyemu.en.rng.default_rng(42) + X = pyemu.en.rng.standard_normal((50, 10,)).astype(np.float32) # 50 samples, 10 features # Test initialization ae = AutoEncoder(input_dim=10, latent_dim=3, hidden_dims=(8, 4)) @@ -658,8 +658,8 @@ def test_autoencoder_pandas_input(): from pyemu.emulators.dsiae import AutoEncoder # Create pandas DataFrame - np.random.seed(42) - data = pd.DataFrame(np.random.randn(30, 8), + pyemu.en.rng = pyemu.en.rng.default_rng(42) + data = pd.DataFrame(pyemu.en.rng.standard_normal((30, 8,)), columns=[f'feature_{i}' for i in range(8)], index=[f'sample_{i}' for i in range(30)]) @@ -705,7 +705,7 @@ def test_dsiae_save_load(tmp_path): # 1. Generate synthetic data num_realizations = 50 num_observations = 20 - data = np.random.normal(size=(num_realizations, num_observations)) + data = pyemu.en.rng.normal(size=(num_realizations, num_observations)) data_df = pd.DataFrame(data, columns=[f"obs{i}" for i in range(num_observations)]) # 2. 
Initialize and fit DSIAE @@ -725,7 +725,7 @@ def test_dsiae_save_load(tmp_path): # The predict method takes pvals which are latent space values # Generate random latent vectors - new_pvals = np.random.normal(size=(5, latent_dim)) + new_pvals = pyemu.en.rng.normal(size=(5, latent_dim)) new_pvals_df = pd.DataFrame(new_pvals, columns=[f"latent_{i}" for i in range(latent_dim)]) # Predict with original model @@ -795,7 +795,7 @@ def test_gpr_basic(tmp_path): x = np.linspace(0.0, 10.0, 20) y = 2.0 * x + 1.0 # Add small noise (very small so interpolation is almost exact) - # y += np.random.normal(0, 0.001, 20) + # y += pyemu.en.rng.normal(0, 0.001, 20) df = pd.DataFrame({'x': x, 'y': y}) @@ -994,10 +994,10 @@ def test_lpfa_synth(tmp_path): t = np.linspace(0, 10, 50) data = [] n_real = 30 - np.random.seed(42) + pyemu.en.rng = pyemu.en.rng.default_rng(42) for i in range(n_real): - phase = np.random.uniform(0, 2*np.pi) - amp = np.random.uniform(0.8, 1.2) + phase = pyemu.en.rng.uniform(0, 2*np.pi) + amp = pyemu.en.rng.uniform(0.8, 1.2) # Inputs (history) hist = amp * np.sin(t[:10] + phase) # Outputs (forecast) diff --git a/autotest/en_tests.py b/autotest/en_tests.py index a0a238839..7841f7f45 100644 --- a/autotest/en_tests.py +++ b/autotest/en_tests.py @@ -665,7 +665,7 @@ def binary_test(tmp_path): obs_names = ["o{0}".format(i) for i in range(nobs)] pst = pyemu.Pst.from_par_obs_names(par_names,obs_names) # array to write (mimicing ensemble) - arr = np.random.random((nreal,npar)) + arr = pyemu.en.rng.random((nreal,npar)) df = pd.DataFrame(data=arr,columns=par_names,index=[str(i) for i in range(nreal)]) pe = pyemu.ParameterEnsemble(pst=pst, df=df) @@ -695,7 +695,7 @@ def binary_test(tmp_path): assert d.max().max() < 1.0e-10 # big ensemble should default to dense .bin write - pe3 = pd.DataFrame(np.random.rand(10, int(2e6))) + pe3 = pd.DataFrame(pyemu.en.rng.random((10, int(2e6,)))) pe3.columns = "parameter_number_" + pe3.columns.astype(str) pe3 = pe3.rename(index={9:'base'}) pst 
= pyemu.Pst.from_par_obs_names(pe3.columns, obs_names) diff --git a/autotest/full_meal_deal_tests_ignore.py b/autotest/full_meal_deal_tests_ignore.py index e33cd7ab2..eaefd2f57 100644 --- a/autotest/full_meal_deal_tests_ignore.py +++ b/autotest/full_meal_deal_tests_ignore.py @@ -302,7 +302,7 @@ def freyberg_kl_pp_compare(): hds_nz_obs = hds_obs.loc[hds_obs.ij.apply(lambda x: x in obs_locs.ij.values),"obsnme"] print(hds_nz_obs) obs.loc[hds_nz_obs,"weight"] = 1.0 - obs.loc[hds_nz_obs,"obsval"] += np.random.normal(0.0,1.0,len(hds_nz_obs)) + obs.loc[hds_nz_obs,"obsval"] += pyemu.en.rng.normal(0.0,1.0,len(hds_nz_obs)) ph.pst.control_data.noptmax = 6 ph.pst.parameter_data.loc[ph.pst.parameter_data.pargp!="pp_hk0","partrans"] = "fixed" ph.pst.write(os.path.join(new_model_ws,"pest_pp.pst")) diff --git a/autotest/la_tests.py b/autotest/la_tests.py index 6e79ec9c7..f46a0548f 100644 --- a/autotest/la_tests.py +++ b/autotest/la_tests.py @@ -15,7 +15,7 @@ def schur_test_nonpest(): onames = ["o1","o2","o3","o4"] npar = len(pnames) nobs = len(onames) - j_arr = np.random.random((nobs,npar)) + j_arr = pyemu.en.rng.random((nobs,npar)) jco = Jco(x=j_arr,row_names=onames,col_names=pnames) parcov = Cov(x=np.eye(npar),names=pnames) obscov = Cov(x=np.eye(nobs),names=onames) @@ -49,7 +49,7 @@ def schur_test_nonpest(): print(s.get_added_obs_importance({"group1": ["o1", "o3"]},reset_zero_weight=0.0)) print(s.get_removed_obs_importance({"group1":["o1","o3"]})) - forecasts = Matrix(x=np.random.random((1,npar)),row_names=[forecasts],col_names=pnames) + forecasts = Matrix(x=pyemu.en.rng.random((1,npar)),row_names=[forecasts],col_names=pnames) sc = Schur(jco=jco,forecasts=forecasts.T,parcov=parcov,obscov=obscov) ffile = os.path.join("temp","forecasts.jcb") @@ -132,7 +132,7 @@ def errvar_test_nonpest(): onames = ["o1","o2","o3","o4"] npar = len(pnames) nobs = len(onames) - j_arr = np.random.random((nobs,npar)) + j_arr = pyemu.en.rng.random((nobs,npar)) jco = 
Matrix(x=j_arr,row_names=onames,col_names=pnames) parcov = Cov(x=np.eye(npar),names=pnames) obscov = Cov(x=np.eye(nobs),names=onames) @@ -325,7 +325,7 @@ def inf2(): onames = inpst.obs_names npar = inpst.npar nobs = inpst.nobs - j_arr = np.random.random((nobs,npar)) + j_arr = pyemu.en.rng.random((nobs,npar)) parcov = mhand.Cov(x=np.eye(npar),names=pnames) obscov = mhand.Cov(x=np.eye(nobs),names=onames) jco = mhand.Jco.from_binary(inpst.filename.replace(".pst",".jcb")) diff --git a/autotest/mat_tests.py b/autotest/mat_tests.py index 855e6aeed..c135d50ab 100644 --- a/autotest/mat_tests.py +++ b/autotest/mat_tests.py @@ -243,7 +243,7 @@ def cov_identity_test(): import pyemu n = 100 names = ["name_{0}".format(i) for i in range(n)] - arr = np.random.random(n*n) + arr = pyemu.en.rng.random(n*n) arr.resize((n,n)) cov = pyemu.Cov(x=arr*arr.transpose(),names=names) cov *= 2.0 @@ -268,7 +268,7 @@ def hadamard_product_test(copy_mat_temp): assert hp.x.sum() == 0.0 c = pyemu.Cov(x=np.ones((jco.shape[0],1)),names=jco.row_names,isdiagonal=True) - r = pyemu.Matrix(x=np.random.rand(c.shape[0],c.shape[0]), + r = pyemu.Matrix(x=pyemu.en.rng.random((c.shape[0],c.shape[0],)), row_names=c.row_names,col_names=c.col_names) hp = c.hadamard_product(r) assert np.abs(hp.x.sum() - np.diagonal(r.x).sum()) < 1.0e-6 @@ -282,7 +282,7 @@ def get_diag_test(): n = 100 col_names = ["cname_{0}".format(i) for i in range(n)] row_names = ["rname_{0}".format(i) for i in range(n)] - arr = np.random.random(n*n) + arr = pyemu.en.rng.random(n*n) arr.resize((n,n)) mat = pyemu.Matrix(x=arr,row_names=row_names, col_names=col_names) @@ -451,7 +451,7 @@ def coo_test(setup_empty_mat_temp): rnames = ["row_{0}".format(i) for i in range(nrow)] cnames = ["col_{0}".format(i) for i in range(ncol)] - x = np.random.random((nrow,ncol)) + x = pyemu.en.rng.random((nrow,ncol)) m = pyemu.Matrix(x=x,row_names=rnames, col_names=cnames) assert m.shape[0] == len(rnames) @@ -511,7 +511,7 @@ def df_test(): rnames = 
["row_{0}".format(i) for i in range(nrow)] cnames = ["col_{0}".format(i) for i in range(ncol)] - x = np.random.random((nrow, ncol)) + x = pyemu.en.rng.random((nrow, ncol)) m = pyemu.Matrix(x=x, row_names=rnames, col_names=cnames) @@ -552,7 +552,7 @@ def dense_mat_format_test(setup_empty_mat_temp): rnames = [long_str+"row_{0}".format(i) for i in range(nrow)] cnames = [long_str+"col_{0}".format(i) for i in range(ncol)] - arr = np.random.random((nrow,ncol)) + arr = pyemu.en.rng.random((nrow,ncol)) matfile = os.path.join(wd, "dense.bin") m = pyemu.Matrix(x=arr, row_names=rnames, col_names=cnames) f = m.to_dense(matfile, close=True) diff --git a/autotest/mc_tests_ignore.py b/autotest/mc_tests_ignore.py index 3f984ce6e..5edad531b 100644 --- a/autotest/mc_tests_ignore.py +++ b/autotest/mc_tests_ignore.py @@ -92,7 +92,7 @@ def gaussian_draw_test(): vals = mc.pst.parameter_data.parval1.values cov = Cov.from_parameter_data(mc.pst) start = datetime.now() - val_array = np.random.multivariate_normal(vals, cov.as_2d,num_reals) + val_array = pyemu.en.rng.multivariate_normal(vals, cov.as_2d,num_reals) print(datetime.now() - start) start = datetime.now() @@ -125,7 +125,7 @@ def from_dataframe_test(): pst = jco.replace(".jcb",".pst") mc = MonteCarlo(jco=jco,pst=pst) names = ["par_{0}".format(_) for _ in range(10)] - df = pd.DataFrame(np.random.random((10,mc.pst.npar)),columns=mc.pst.par_names) + df = pd.DataFrame(pyemu.en.rng.random((10,mc.pst.npar)),columns=mc.pst.par_names) mc.parensemble = ParameterEnsemble.from_dataframe(df=df,pst=mc.pst) print(mc.parensemble.shape) mc.project_parensemble() @@ -193,7 +193,7 @@ def ensemble_seed_test(): pe1.reseed() pe1.draw(cov,num_reals=10) - #np.random.seed(1111) + #pyemu.en.rng = pyemu.en.rng.default_rng(1111) pe2.reseed() pe2.draw(cov,num_reals=10) assert (pe1-pe2).apply(np.abs).as_matrix().max() == 0.0 @@ -383,7 +383,7 @@ def par_diagonal_draw_test(): vals = mc.pst.parameter_data.parval1.values cov = Cov.from_parameter_data(mc.pst) start = 
datetime.now() - val_array = np.random.multivariate_normal(vals, cov.as_2d,num_reals) + val_array = pyemu.en.rng.multivariate_normal(vals, cov.as_2d,num_reals) print(datetime.now() - start) start = datetime.now() @@ -463,8 +463,8 @@ def homegrown_draw_test(): pst.parameter_data.loc[:,"partrans"] = "none" par = pst.parameter_data - par.loc[:,"x"] = np.random.random(npar) * 10.0 - par.loc[:, "y"] = np.random.random(npar) * 10.0 + par.loc[:,"x"] = pyemu.en.rng.random(npar) * 10.0 + par.loc[:, "y"] = pyemu.en.rng.random(npar) * 10.0 par.loc[pst.par_names[0], "pargp"] = "zero" par.loc[pst.par_names[1:10],"pargp"] = "one" @@ -525,8 +525,8 @@ def ensemble_covariance_test(): pst.parameter_data.loc[:, "partrans"] = "none" par = pst.parameter_data - par.loc[:, "x"] = np.random.random(npar) * 10.0 - par.loc[:, "y"] = np.random.random(npar) * 10.0 + par.loc[:, "x"] = pyemu.en.rng.random(npar) * 10.0 + par.loc[:, "y"] = pyemu.en.rng.random(npar) * 10.0 cov = gs.covariance_matrix(par.x, par.y, par.parnme) num_reals = 100000 @@ -612,8 +612,8 @@ def to_from_binary_test(): pst.parameter_data.loc[:, "partrans"] = "none" par = pst.parameter_data - par.loc[:, "x"] = np.random.random(npar) * 10.0 - par.loc[:, "y"] = np.random.random(npar) * 10.0 + par.loc[:, "x"] = pyemu.en.rng.random(npar) * 10.0 + par.loc[:, "y"] = pyemu.en.rng.random(npar) * 10.0 cov = gs.covariance_matrix(par.x, par.y, par.parnme) num_reals = 1000 @@ -687,8 +687,8 @@ def sparse_draw_test(): pst.parameter_data.loc[:, "partrans"] = "none" par = pst.parameter_data - par.loc[:, "x"] = np.random.random(npar) * 10.0 - par.loc[:, "y"] = np.random.random(npar) * 10.0 + par.loc[:, "x"] = pyemu.en.rng.random(npar) * 10.0 + par.loc[:, "y"] = pyemu.en.rng.random(npar) * 10.0 par.loc[pst.par_names[0], "pargp"] = "zero" par.loc[pst.par_names[1:10], "pargp"] = "one" diff --git a/autotest/metrics_tests.py b/autotest/metrics_tests.py index 084278bb2..3e59dc4a8 100644 --- a/autotest/metrics_tests.py +++ b/autotest/metrics_tests.py 
@@ -5,10 +5,10 @@ def res_and_ens_test(): import pyemu # make some fake residuals - np.random.seed(42) + pyemu.en.rng = pyemu.en.rng.default_rng(42) t = np.linspace(1,20, 200) obs = t/10 * np.sin(np.pi*t) - mod = obs+np.random.randn(200)*.5 + mod = obs+pyemu.en.rng.standard_normal((200,))*.5 obsnames = ['ob_t_{:03d}'.format(i) for i in range(len(t))] obsgroups = ['start_grp' if i<80 else 'end_grp' for i in range(len(t))] res = pd.DataFrame({'name':obsnames, @@ -18,11 +18,11 @@ def res_and_ens_test(): 'residual':obs-mod, 'weight':np.ones(len(t))}) res.set_index(res['name'], inplace=True) - np.random.seed(98) - res.weight = [float(i>.5) for i in np.random.random(200)] + pyemu.en.rng = pyemu.en.rng.default_rng(98) + res.weight = [float(i>.5) for i in pyemu.en.rng.random(200)] # and an ensemble version - ens = pd.DataFrame(np.tile(obs,(10,1))+np.random.randn(10,200)*.5, columns=obsnames) + ens = pd.DataFrame(np.tile(obs,(10,1))+pyemu.en.rng.standard_normal((10,200,))*.5, columns=obsnames) ens.loc['base'] = mod # cook up a PEST file for obs and weights diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index 2921822ae..30f9e085f 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -46,9 +46,9 @@ def _gen_dummy_obs_file(ws='.', sep=',', ext=None): else: t.append(text[c]) c += 1 - np.random.seed(314) + pyemu.en.rng = pyemu.en.rng.default_rng(314) df = pd.DataFrame( - np.random.rand(15,2)*1000, + pyemu.en.rng.random((15,2,))*1000, columns=['no', 'yes'], index=t ) @@ -981,7 +981,7 @@ def test_mf6_freyberg(tmp_path): pars = pst.parameter_data # set reach 1 hk to 100 sfr_pars = pars.loc[pars.parnme.str.startswith('pname:sfr')].index - pars.loc[sfr_pars, 'parval1'] = np.random.random(len(sfr_pars)) * 10 + pars.loc[sfr_pars, 'parval1'] = pyemu.en.rng.random(len(sfr_pars)) * 10 sfr_pars = pars.loc[sfr_pars].copy() # print(sfr_pars) @@ -3588,8 +3588,8 @@ def test_mf6_freyberg_pp_locs(tmp_path): ymx = m.modelgrid.yvertices.max() numpp = 20 
- xvals = np.random.uniform(xmn,xmx,numpp) - yvals = np.random.uniform(ymn, ymx, numpp) + xvals = pyemu.en.rng.uniform(xmn,xmx,numpp) + yvals = pyemu.en.rng.uniform(ymn, ymx, numpp) pp_locs = pd.DataFrame({"x":xvals,"y":yvals}) pp_locs.loc[:,"zone"] = 1 pp_locs.loc[:,"name"] = ["pp_{0}".format(i) for i in range(numpp)] @@ -3723,13 +3723,13 @@ def test_usg_freyberg(tmp_path): zone_array_k2[:,:100] = 4 #gen up some fake pp locs - np.random.seed(pyemu.en.SEED) + pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) num_pp = 20 data = {"name":[],"x":[],"y":[],"zone":[]} visited = set() for i in range(num_pp): while True: - idx = np.random.randint(0,len(sr_dict_by_layer[1])) + idx = pyemu.en.rng.integers(0,len(sr_dict_by_layer[1])) if idx not in visited: break x,y = sr_dict_by_layer[1][idx] @@ -3804,8 +3804,8 @@ def test_usg_freyberg(tmp_path): par = pst.parameter_data gr_hk_pars = par.loc[par.parnme.str.contains("hk1_gr"),"parnme"] - pf.pst.parameter_data.loc[gr_hk_pars,"parubnd"] = np.random.random(gr_hk_pars.shape[0]) * 5 - pf.pst.parameter_data.loc[gr_hk_pars, "parlbnd"] = np.random.random(gr_hk_pars.shape[0]) * 0.2 + pf.pst.parameter_data.loc[gr_hk_pars,"parubnd"] = pyemu.en.rng.random(gr_hk_pars.shape[0]) * 5 + pf.pst.parameter_data.loc[gr_hk_pars, "parlbnd"] = pyemu.en.rng.random(gr_hk_pars.shape[0]) * 0.2 pe = pf.draw(num_reals=100) pe.enforce() pe.to_csv(os.path.join(pf.new_d,"prior.csv")) @@ -3943,8 +3943,8 @@ def _add_big_obsffile(pf, profile=False, nchar=50000): else: pstfrom_add = True wd = pf.new_d - np.random.seed(314) - df = pd.DataFrame(np.random.random([10, nchar]), + pyemu.en.rng = pyemu.en.rng.default_rng(314) + df = pd.DataFrame(pyemu.en.rng.random([10, nchar]), columns=[hex(c) for c in range(nchar)]) df.index.name = 'time' df.to_csv(os.path.join(wd, 'bigobseg.csv')) @@ -4167,7 +4167,7 @@ def mf6_subdir_test(tmp_path): # pars = pst.parameter_data # # set reach 1 hk to 100 # sfr_pars = pars.loc[pars.parnme.str.startswith('sfr')].index - # 
pars.loc[sfr_pars, 'parval1'] = np.random.random(len(sfr_pars)) * 10 + # pars.loc[sfr_pars, 'parval1'] = pyemu.en.rng.random(len(sfr_pars)) * 10 # # sfr_pars = pars.loc[sfr_pars].copy() # sfr_pars[['inst', 'usecol', '#rno']] = sfr_pars.parnme.apply( @@ -4916,7 +4916,7 @@ def mf6_freyberg_thresh_test(tmp_path): org_par = par.copy() num_reals = 30 - np.random.seed() + pyemu.en.rng = pyemu.en.rng.default_rng() pe = pf.draw(num_reals, use_specsim=False) pe.enforce() # print(pe.shape) @@ -4967,7 +4967,7 @@ def mf6_freyberg_thresh_test(tmp_path): obs.loc[onames,"weight"] = 1.0 obs.loc[snames,"weight"] = 1./(obs.loc[snames,"obsval"] * 0.2).values #obs.loc[onames,"obsval"] = truth.values - #obs.loc[onames,"obsval"] *= np.random.normal(1.0,0.01,onames.shape[0]) + #obs.loc[onames,"obsval"] *= pyemu.en.rng.normal(1.0,0.01,onames.shape[0]) pst.write(os.path.join(pf.new_d, "freyberg.pst"),version=2) pyemu.os_utils.run("{0} freyberg.pst".format(ies_exe_path), cwd=pf.new_d) @@ -5307,8 +5307,8 @@ def test_hyperpars(tmp_path): ymx = m.modelgrid.yvertices.max() numpp = 20 - xvals = np.random.uniform(xmn, xmx, numpp) - yvals = np.random.uniform(ymn, ymx, numpp) + xvals = pyemu.en.rng.uniform(xmn, xmx, numpp) + yvals = pyemu.en.rng.uniform(ymn, ymx, numpp) pp_locs = pd.DataFrame({"x": xvals, "y": yvals}) pp_locs.loc[:, "zone"] = 1 pp_locs.loc[:, "name"] = ["pp_{0}".format(i) for i in range(numpp)] @@ -5520,8 +5520,8 @@ def mf6_freyberg_ppu_hyperpars_invest(tmp_path): ymx = m.modelgrid.yvertices.max() numpp = 20 - xvals = np.random.uniform(xmn,xmx,numpp) - yvals = np.random.uniform(ymn, ymx, numpp) + xvals = pyemu.en.rng.uniform(xmn,xmx,numpp) + yvals = pyemu.en.rng.uniform(ymn, ymx, numpp) pp_locs = pd.DataFrame({"x":xvals,"y":yvals}) pp_locs.loc[:,"zone"] = 1 pp_locs.loc[:,"name"] = ["pp_{0}".format(i) for i in range(numpp)] @@ -5744,8 +5744,8 @@ def mf6_freyberg_ppu_hyperpars_thresh_invest(tmp_path): ymx = m.modelgrid.yvertices.max() numpp = 30 - xvals = 
np.random.uniform(xmn,xmx,numpp) - yvals = np.random.uniform(ymn, ymx, numpp) + xvals = pyemu.en.rng.uniform(xmn,xmx,numpp) + yvals = pyemu.en.rng.uniform(ymn, ymx, numpp) pp_locs = pd.DataFrame({"x":xvals,"y":yvals}) pp_locs.loc[:,"zone"] = 1 pp_locs.loc[:,"name"] = ["pp_{0}".format(i) for i in range(numpp)] @@ -5994,7 +5994,7 @@ def mf6_freyberg_ppu_hyperpars_thresh_invest(tmp_path): obs.loc[snames, "standard_deviation"] = (obs.loc[snames, "obsval"] * 0.2).values #obs.loc[onames,"obsval"] = truth.values - #obs.loc[onames,"obsval"] *= np.random.normal(1.0,0.01,onames.shape[0]) + #obs.loc[onames,"obsval"] *= pyemu.en.rng.normal(1.0,0.01,onames.shape[0]) pst.write(os.path.join(pf.new_d, "freyberg.pst"),version=2) pyemu.os_utils.run("{0} freyberg.pst".format(ies_exe_path), cwd=pf.new_d) @@ -6446,7 +6446,7 @@ def draw_consistency_test(tmp_path): gpar = par.loc[par.parnme.str.contains("fix"),:] assert gpar.shape[0] == gwf.dis.nrow.data * gwf.dis.ncol.data par.loc[gpar.parnme,"partrans"] = "fixed" - np.random.seed(111) + pyemu.en.rng = pyemu.en.rng.default_rng(111) pe = pf.draw(num_reals=10, use_specsim=True) # draw parameters from the prior distribution print("abs max:",np.nanmax(np.abs(pe.values))) # no bs values... 
@@ -6472,7 +6472,7 @@ def _apply_speed_invest_pstfrom(): Path(wd).mkdir(parents=True, exist_ok=True) template_ws = 'template' mshape = (500, 600) - dummyar = np.random.rand(*mshape) + dummyar = pyemu.en.rng.random((*mshape,)) np.savetxt(Path(wd, 'dummy_array.txt'), dummyar) sr = pyemu.SpatialReference(delr=[100] * mshape[1], delc=[100] * mshape[0], xll=0, yll=0) @@ -6537,7 +6537,7 @@ def apply_speed_invest_pp(useppu=True): prep_hyperpars=True)) pf.build_pst() pars = pf.pst.parameter_data - pars['parval1'] = pars.parval1 * np.random.random(len(pars)) + pars['parval1'] = pars.parval1 * pyemu.en.rng.random(len(pars)) # bd = Path.cwd() # check_apply(pf) diff --git a/autotest/pst_tests.py b/autotest/pst_tests.py index 17be2a657..5ed4ce910 100644 --- a/autotest/pst_tests.py +++ b/autotest/pst_tests.py @@ -740,7 +740,7 @@ def csv_to_ins_test(tmp_path): cnames = ["col{0}".format(i) for i in range(10)] rnames = ["row{0}".format(i) for i in range(10)] df = pd.DataFrame(index=rnames,columns=cnames) - df.loc[:,:] = np.random.random(df.shape) + df.loc[:,:] = pyemu.en.rng.random(df.shape) df.to_csv(os.path.join(tmp_path, "temp.csv")) names = pyemu.pst_utils.csv_to_ins_file(df, ins_filename=os.path.join(tmp_path, "temp.csv.ins"), only_cols=cnames[0],prefix="test") @@ -1374,8 +1374,8 @@ def parrep_test(tmp_path): import numpy as np # make some fake parnames and values parnames = ['p_{0:03}'.format(i) for i in range(20)] - np.random.seed(42) - parvals = np.random.random(20) + 5 + pyemu.en.rng = pyemu.en.rng.default_rng(42) + parvals = pyemu.en.rng.random(20) + 5 parvals[0] = 0.001 bd = os.getcwd() os.chdir(tmp_path) @@ -1386,8 +1386,8 @@ def parrep_test(tmp_path): [ofp.write('{0:10s} {1:12.6f} 1.00 0.0\n'.format(i,j)) for i,j in zip(parnames,parvals)] # make a fake ensemble parameter file - np.random.seed(99) - parens = pd.DataFrame(np.tile(parvals,(5,1))+np.random.randn(5,20)*.5, columns=parnames) + pyemu.en.rng = pyemu.en.rng.default_rng(99) + parens = 
pd.DataFrame(np.tile(parvals,(5,1))+pyemu.en.rng.standard_normal((5,20,))*.5, columns=parnames) parens.index = list(range(4)) + ['base'] parens.index.name = 'real_name' parens.loc['base'] = parvals[::-1] diff --git a/autotest/pst_tests_ignore.py b/autotest/pst_tests_ignore.py index 12d144483..b8e1fa857 100644 --- a/autotest/pst_tests_ignore.py +++ b/autotest/pst_tests_ignore.py @@ -326,7 +326,7 @@ def from_flopy(tmp_path): obs = pst.observation_data obs.loc[:, "weight"] = 0.0 obs.loc[obs.obsnme.apply(lambda x: x.startswith("cr")), "weight"] = 1.0 - obs.loc[obs.weight > 0.0, "obsval"] += np.random.normal(0.0, 2.0, pst.nnz_obs) + obs.loc[obs.weight > 0.0, "obsval"] += pyemu.en.rng.normal(0.0, 2.0, pst.nnz_obs) pst.control_data.noptmax = 0 pst.write(os.path.join(new_model_ws, "freyberg_pest.pst")) cov = helper.build_prior(fmt="none") diff --git a/autotest/transformer_tests.py b/autotest/transformer_tests.py index 493be1179..50d888112 100755 --- a/autotest/transformer_tests.py +++ b/autotest/transformer_tests.py @@ -1,462 +1,462 @@ -import os -import sys -import shutil -import pytest -import numpy as np -import pandas as pd -import platform -sys.path.append("..") -import pyemu - -def test_base_transformer(): - """Test the BaseTransformer abstract class functionality""" - bt = pyemu.emulators.BaseTransformer() - - # fit should return self - assert bt.fit(None) is bt - - # fit_transform should call fit and transform - with pytest.raises(NotImplementedError): - bt.fit_transform(None) - - # transform should raise NotImplementedError - with pytest.raises(NotImplementedError): - bt.transform(None) - - # inverse_transform should raise NotImplementedError - with pytest.raises(NotImplementedError): - bt.inverse_transform(None) - -def test_log10_transformer(): - """Test the Log10Transformer functionality""" - # Create test dataframe with positive and negative values - df = pd.DataFrame({ - 'pos': [1, 10, 100, 1000], - 'zero': [0, 0.1, 0.01, 0.001], - 'neg': [-1, -10, -100, 
-1000] - }) - - # Initialize and test transformer - lt = pyemu.emulators.Log10Transformer() - - # Transform data - transformed = lt.transform(df) - - # Check that positive values are properly transformed - np.testing.assert_allclose( - transformed['pos'].values, - np.log10(df['pos'].values) - ) - - # Check that zeros/small values are handled correctly - assert not np.any(np.isinf(transformed['zero'].values)) - - # Check that negative values are handled correctly - assert not np.any(np.isnan(transformed['neg'].values)) - - # Test inverse transform - back_transformed = lt.inverse_transform(transformed) - - # Check that we get back very close to original values - np.testing.assert_allclose( - back_transformed['pos'].values, - df['pos'].values - ) - - # For zero/very small values - np.testing.assert_allclose( - back_transformed['zero'].values, - df['zero'].values , - rtol=1e-6 - ) - - # For negative values - np.testing.assert_allclose( - back_transformed['neg'].values, - df['neg'].values , - rtol=1e-6 - ) - -def test_row_wise_minmax_scaler(): - """Test the RowWiseMinMaxScaler functionality""" - # Test data - df = pd.DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [10, 20, 30, 40], - 'c': [100, 200, 300, 400] - }) - - # Initialize scaler - scaler = pyemu.emulators.RowWiseMinMaxScaler() - - # Fit and transform - transformed = scaler.fit_transform(df) - - # Check each row is scaled to [0, 1] - for i in range(len(df)): - row_min = transformed.iloc[i].min() - row_max = transformed.iloc[i].max() - assert np.isclose(row_min, -1.0) - assert np.isclose(row_max, 1.0) - - # Test inverse transform - back_transformed = scaler.inverse_transform(transformed) - - # Check we get back original values - np.testing.assert_allclose(back_transformed.values, df.values) - -def test_normal_score_transformer(): - """Test the NormalScoreTransformer functionality""" - # Create test data with various distributions - np.random.seed(42) - n = 200 - - # Uniform data - uniform_data = np.random.uniform(0, 10, 
n) - - # Log-normal data - lognormal_data = np.exp(np.random.normal(0, 1, n)) - - # Bimodal data - bimodal_data = np.concatenate([ - np.random.normal(-3, 1, n//2), - np.random.normal(3, 1, n//2) - ]) - - df = pd.DataFrame({ - 'uniform': uniform_data, - 'lognormal': lognormal_data, - 'bimodal': bimodal_data - }) - - # Initialize transformer - nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) - - # Transform data - transformed = nst.fit_transform(df) - - # Check transformed distributions are more normal - # For each column, check skewness and kurtosis are closer to normal - for col in df.columns: - # Calculate statistics of original and transformed data - orig_skew = skewness(df[col].values) - trans_skew = skewness(transformed[col].values) - - orig_kurt = kurtosis(df[col].values) - trans_kurt = kurtosis(transformed[col].values) - - # Transformed data should have skewness closer to 0 - assert abs(trans_skew) < abs(orig_skew) or np.isclose(abs(trans_skew), 0, atol=0.5) - - # Transformed data should have kurtosis closer to 3 (normal distribution) - assert abs(trans_kurt - 3) < abs(orig_kurt - 3) or np.isclose(trans_kurt, 3, atol=1.0) - - # Test inverse transform - back_transformed = nst.inverse_transform(transformed) - - # Check we get back close to original values - # (not exact due to binning and smoothing) - np.testing.assert_allclose( - back_transformed.values, - df.values, - rtol=0.1, - atol=0.1 - ) - - # Test with quadratic extrapolation - nst_quad = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) - transformed_quad = nst_quad.fit_transform(df) - - # Create data outside the original range for extrapolation test - # Transform should not fail for out-of-range values when using quadratic extrapolation - extreme_transformed = transformed_quad.copy() - extreme_transformed.loc[0] = transformed_quad.min() - 1 - extreme_transformed.loc[1] = transformed_quad.max() + 1 - - back_extreme = 
nst_quad.inverse_transform(extreme_transformed) - assert not np.any(np.isnan(back_extreme.values)) - assert not np.any(np.isinf(back_extreme.values)) - -def test_transformer_pipeline(): - """Test the TransformerPipeline functionality""" - # Create test data - df = pd.DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [10, 20, 30, 40], - 'c': [100, 200, 300, 400] - }) - - # Create pipeline with multiple transformers - pipeline = pyemu.emulators.TransformerPipeline() - - # Add log transformer for all columns - log_trans = pyemu.emulators.Log10Transformer() - pipeline.add(log_trans) - - # Add row-wise min-max scaler for specific columns - minmax_trans = pyemu.emulators.RowWiseMinMaxScaler() - pipeline.add(minmax_trans, columns=['a', 'b']) - - # Transform data - transformed = pipeline.transform(df) - - # Check log was applied to all columns - np.testing.assert_allclose( - transformed['c'].values, - np.log10(df['c'].values) - ) - - # Check minmax was applied only to a and b - for i in range(len(df)): - row_subset = transformed.iloc[i][['a', 'b']] - assert np.isclose(row_subset.min(), 0.0) or np.isclose(row_subset.max(), 1.0) - - # Test inverse transform - back_transformed = pipeline.inverse_transform(transformed) - - # Check we get back close to original values - np.testing.assert_allclose(back_transformed.values, df.values, rtol=1e-5) - -def test_autobots_assemble(): - """Test the AutobotsAssemble class functionality""" - # Create test data - df = pd.DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [10, 20, 30, 40], - 'c': [-10, -20, -30, -40] - }) - - # Save original data for comparison - original_df = df.copy() - - # Initialize with data - aa = pyemu.emulators.AutobotsAssemble(df) - - # Apply log transform to positive columns - aa.apply('log10', columns=['a', 'b']) - - # Check the transform was applied correctly - np.testing.assert_allclose( - aa.df[['a', 'b']].values, - np.log10(original_df[['a', 'b']].values) - ) - - # Check that column c is unchanged - 
np.testing.assert_array_equal(aa.df['c'].values, original_df['c'].values) - - # Save intermediate state after log transform - log_transformed = aa.df.copy() - - # Apply normal score transform to all columns - aa.apply('normal_score') - - # Save state after normal score transform - normal_transformed = aa.df.copy() - - # Verify both transforms were applied (data should be different from log transform) - assert not np.allclose(normal_transformed.values, log_transformed.values) - - # Apply the inverse transformation - back_transformed = aa.inverse() - - # Check we get back close to original values - np.testing.assert_allclose(back_transformed.values, original_df.values, rtol=0.1) - - # Test with external already-transformed data - external_transformed = pd.DataFrame({ - 'a': [-0.5, 0.0, 0.5], # Already transformed data in normal score space - 'b': [0.5, 0.0, -0.5], # (approximately in the normal distribution range) - 'c': [1.0, 0.0, -1.0] - }) - - # Test inverse transform on external transformed data - back_external = aa.inverse(external_transformed) - - # Check that shape is preserved - assert back_external.shape == external_transformed.shape - - # Verify output has reasonable values (should be in the range of original data) - for col in ['a', 'b']: - # These columns had log transform applied, so should be positive - assert np.all(back_external[col] > 0) - - # Column c should have values in the range of the original data - assert np.min(back_external['c']) >= -40 - assert np.max(back_external['c']) <= -10 - - # Apply transform again to verify roundtrip accuracy - roundtrip = aa.transform(back_external) - - # Check roundtrip accuracy for values within standard normal range (-2 to 2) - for col in external_transformed.columns: - # Find values within the normal range - mask = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) - if mask.any(): - # Get the values to compare - expected = external_transformed.loc[mask, col].values - actual = 
roundtrip.loc[mask, col].values - - # Handle zeros and near-zeros with absolute tolerance instead of relative - zero_mask = np.isclose(expected, 0, atol=1e-10) - if zero_mask.any(): - # For zeros, use absolute tolerance - np.testing.assert_allclose( - actual[zero_mask], - expected[zero_mask], - atol=0.1 # Absolute tolerance for zeros - ) - - # For non-zeros, use relative tolerance - if (~zero_mask).any(): - np.testing.assert_allclose( - actual[~zero_mask], - expected[~zero_mask], - rtol=0.1 # Relative tolerance for non-zeros - ) - else: - # No zeros, use normal comparison - np.testing.assert_allclose( - actual, - expected, - rtol=0.1 - ) - - # Additional test to verify pipeline order is maintained - # Create a new pipeline with transforms in different order - bb = pyemu.emulators.AutobotsAssemble(original_df.copy()) - - # First normal score, then log10 - bb.apply('normal_score') - bb.apply('log10', columns=['a', 'b']) - - # Apply inverse - should revert log10 first, then normal_score - back_bb = bb.inverse() - - # Check we get back close to original values - np.testing.assert_allclose(back_bb.values, original_df.values, rtol=0.1) - - - -def skewness(x): - """Calculate skewness of a distribution""" - n = len(x) - x_mean = np.mean(x) - return (np.sum((x - x_mean) ** 3) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 1.5) - -def kurtosis(x): - """Calculate kurtosis of a distribution""" - n = len(x) - x_mean = np.mean(x) - return (np.sum((x - x_mean) ** 4) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 2) - - - - -def test_normal_score_with_external_data(): - """Test NormalScoreTransformer with external already-transformed data""" - # Create training data with a specific distribution - np.random.seed(42) - n = 100 - training_data = pd.DataFrame({ - 'normal': np.random.normal(5, 2, n), - 'lognormal': np.exp(np.random.normal(1, 0.5, n)), - 'uniform': np.random.uniform(0, 10, n) - }) - - # Create "external" data that we'll pretend is already transformed - # For this test, we'll 
generate values in the typical normal score range (-3 to 3) - external_transformed = pd.DataFrame({ - 'normal': np.random.normal(0, 1, 1), # Already in normal score space - 'lognormal': np.random.normal(0, 1, 1), - 'uniform': np.random.normal(0, 1, 1) - }) - - # Initialize and fit transformer on training data - nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) - nst.fit(training_data) - - # Transform training data to verify transformation works - transformed_training = nst.transform(training_data) - - # Check that transformed data has properties of normal distribution - for col in training_data.columns: - # Mean should be close to 0 - assert abs(transformed_training[col].mean()) < 0.3 - # Standard deviation should be close to 1 - assert abs(transformed_training[col].std() - 1.0) < 0.3 - - # Store column parameters for inspection - z_scores = {} - originals = {} - for col in training_data.columns: - params = nst.column_parameters.get(col, {}) - z_scores[col] = params.get('z_scores', []) - originals[col] = params.get('originals', []) - - # Verify column parameters were created - assert len(z_scores[col]) > 0 - assert len(originals[col]) > 0 - - # Apply inverse transform to external transformed data directly - back_external = nst.inverse_transform(external_transformed) - - # Verify the shape matches - assert back_external.shape == external_transformed.shape - - # Apply the transform to back_external to check if it recovers external_transformed - re_transformed = nst.transform(back_external) - - # Check that re-transforming recovers values close to the external_transformed - # Note: exact recovery isn't expected due to interpolation/extrapolation - for col in external_transformed.columns: - # Values inside the normal range (-2 to 2) should be very close - inside_range = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) - if inside_range.any(): - np.testing.assert_allclose( - re_transformed.loc[inside_range, col].values, - 
external_transformed.loc[inside_range, col].values, - rtol=0.2 - ) - - # Test external values that are far outside the z-score range - extreme_transformed = pd.DataFrame({ - 'normal': np.array([-5, 0, 5],dtype=float), # Includes extreme values - 'lognormal': np.array([-5, 0, 5],dtype=float), - 'uniform': np.array([-5, 0, 5],dtype=float) - }) - - # Test with extrapolation first - nst_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) - nst_extrap.fit(training_data) - back_extreme_extrap = nst_extrap.inverse_transform(extreme_transformed) - - # Test without extrapolation - nst_no_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) - nst_no_extrap.fit(training_data) - back_extreme_no_extrap = nst_no_extrap.inverse_transform(extreme_transformed) - - # With extrapolation, extreme values should be outside the original data range - for col in training_data.columns: - min_orig = training_data[col].min() - max_orig = training_data[col].max() - - # Check extrapolation is working (values outside original range) - assert back_extreme_extrap[col].min() < min_orig or back_extreme_extrap[col].max() > max_orig - - # Without extrapolation, values should be clamped to original range - assert back_extreme_no_extrap[col].min() >= min_orig - 1e-10 # Allow for floating point error - assert back_extreme_no_extrap[col].max() <= max_orig + 1e-10 - - # Test with AutobotsAssemble to ensure the pipeline works with external transformed data - aa = pyemu.emulators.AutobotsAssemble(training_data.copy()) - aa.apply('normal_score') - - # Test applying inverse transform to external data - back_from_aa = aa.inverse(external_transformed.copy()) - - # Verify results with direct inverse transform - np.testing.assert_allclose( - back_from_aa.values, - nst.inverse_transform(external_transformed).values, - rtol=1e-3 +import os +import sys +import shutil +import pytest +import numpy as np +import pandas as pd +import platform +sys.path.append("..") 
+import pyemu + +def test_base_transformer(): + """Test the BaseTransformer abstract class functionality""" + bt = pyemu.emulators.BaseTransformer() + + # fit should return self + assert bt.fit(None) is bt + + # fit_transform should call fit and transform + with pytest.raises(NotImplementedError): + bt.fit_transform(None) + + # transform should raise NotImplementedError + with pytest.raises(NotImplementedError): + bt.transform(None) + + # inverse_transform should raise NotImplementedError + with pytest.raises(NotImplementedError): + bt.inverse_transform(None) + +def test_log10_transformer(): + """Test the Log10Transformer functionality""" + # Create test dataframe with positive and negative values + df = pd.DataFrame({ + 'pos': [1, 10, 100, 1000], + 'zero': [0, 0.1, 0.01, 0.001], + 'neg': [-1, -10, -100, -1000] + }) + + # Initialize and test transformer + lt = pyemu.emulators.Log10Transformer() + + # Transform data + transformed = lt.transform(df) + + # Check that positive values are properly transformed + np.testing.assert_allclose( + transformed['pos'].values, + np.log10(df['pos'].values) + ) + + # Check that zeros/small values are handled correctly + assert not np.any(np.isinf(transformed['zero'].values)) + + # Check that negative values are handled correctly + assert not np.any(np.isnan(transformed['neg'].values)) + + # Test inverse transform + back_transformed = lt.inverse_transform(transformed) + + # Check that we get back very close to original values + np.testing.assert_allclose( + back_transformed['pos'].values, + df['pos'].values + ) + + # For zero/very small values + np.testing.assert_allclose( + back_transformed['zero'].values, + df['zero'].values , + rtol=1e-6 + ) + + # For negative values + np.testing.assert_allclose( + back_transformed['neg'].values, + df['neg'].values , + rtol=1e-6 + ) + +def test_row_wise_minmax_scaler(): + """Test the RowWiseMinMaxScaler functionality""" + # Test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 
40], + 'c': [100, 200, 300, 400] + }) + + # Initialize scaler + scaler = pyemu.emulators.RowWiseMinMaxScaler() + + # Fit and transform + transformed = scaler.fit_transform(df) + + # Check each row is scaled to [0, 1] + for i in range(len(df)): + row_min = transformed.iloc[i].min() + row_max = transformed.iloc[i].max() + assert np.isclose(row_min, -1.0) + assert np.isclose(row_max, 1.0) + + # Test inverse transform + back_transformed = scaler.inverse_transform(transformed) + + # Check we get back original values + np.testing.assert_allclose(back_transformed.values, df.values) + +def test_normal_score_transformer(): + """Test the NormalScoreTransformer functionality""" + # Create test data with various distributions + pyemu.en.rng = pyemu.en.rng.default_rng(42) + n = 200 + + # Uniform data + uniform_data = pyemu.en.rng.uniform(0, 10, n) + + # Log-normal data + lognormal_data = np.exp(pyemu.en.rng.normal(0, 1, n)) + + # Bimodal data + bimodal_data = np.concatenate([ + pyemu.en.rng.normal(-3, 1, n//2), + pyemu.en.rng.normal(3, 1, n//2) + ]) + + df = pd.DataFrame({ + 'uniform': uniform_data, + 'lognormal': lognormal_data, + 'bimodal': bimodal_data + }) + + # Initialize transformer + nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) + + # Transform data + transformed = nst.fit_transform(df) + + # Check transformed distributions are more normal + # For each column, check skewness and kurtosis are closer to normal + for col in df.columns: + # Calculate statistics of original and transformed data + orig_skew = skewness(df[col].values) + trans_skew = skewness(transformed[col].values) + + orig_kurt = kurtosis(df[col].values) + trans_kurt = kurtosis(transformed[col].values) + + # Transformed data should have skewness closer to 0 + assert abs(trans_skew) < abs(orig_skew) or np.isclose(abs(trans_skew), 0, atol=0.5) + + # Transformed data should have kurtosis closer to 3 (normal distribution) + assert abs(trans_kurt - 3) < abs(orig_kurt - 3) or 
np.isclose(trans_kurt, 3, atol=1.0) + + # Test inverse transform + back_transformed = nst.inverse_transform(transformed) + + # Check we get back close to original values + # (not exact due to binning and smoothing) + np.testing.assert_allclose( + back_transformed.values, + df.values, + rtol=0.1, + atol=0.1 + ) + + # Test with quadratic extrapolation + nst_quad = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + transformed_quad = nst_quad.fit_transform(df) + + # Create data outside the original range for extrapolation test + # Transform should not fail for out-of-range values when using quadratic extrapolation + extreme_transformed = transformed_quad.copy() + extreme_transformed.loc[0] = transformed_quad.min() - 1 + extreme_transformed.loc[1] = transformed_quad.max() + 1 + + back_extreme = nst_quad.inverse_transform(extreme_transformed) + assert not np.any(np.isnan(back_extreme.values)) + assert not np.any(np.isinf(back_extreme.values)) + +def test_transformer_pipeline(): + """Test the TransformerPipeline functionality""" + # Create test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 40], + 'c': [100, 200, 300, 400] + }) + + # Create pipeline with multiple transformers + pipeline = pyemu.emulators.TransformerPipeline() + + # Add log transformer for all columns + log_trans = pyemu.emulators.Log10Transformer() + pipeline.add(log_trans) + + # Add row-wise min-max scaler for specific columns + minmax_trans = pyemu.emulators.RowWiseMinMaxScaler() + pipeline.add(minmax_trans, columns=['a', 'b']) + + # Transform data + transformed = pipeline.transform(df) + + # Check log was applied to all columns + np.testing.assert_allclose( + transformed['c'].values, + np.log10(df['c'].values) + ) + + # Check minmax was applied only to a and b + for i in range(len(df)): + row_subset = transformed.iloc[i][['a', 'b']] + assert np.isclose(row_subset.min(), 0.0) or np.isclose(row_subset.max(), 1.0) + + # Test inverse transform + back_transformed = 
pipeline.inverse_transform(transformed) + + # Check we get back close to original values + np.testing.assert_allclose(back_transformed.values, df.values, rtol=1e-5) + +def test_autobots_assemble(): + """Test the AutobotsAssemble class functionality""" + # Create test data + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [10, 20, 30, 40], + 'c': [-10, -20, -30, -40] + }) + + # Save original data for comparison + original_df = df.copy() + + # Initialize with data + aa = pyemu.emulators.AutobotsAssemble(df) + + # Apply log transform to positive columns + aa.apply('log10', columns=['a', 'b']) + + # Check the transform was applied correctly + np.testing.assert_allclose( + aa.df[['a', 'b']].values, + np.log10(original_df[['a', 'b']].values) + ) + + # Check that column c is unchanged + np.testing.assert_array_equal(aa.df['c'].values, original_df['c'].values) + + # Save intermediate state after log transform + log_transformed = aa.df.copy() + + # Apply normal score transform to all columns + aa.apply('normal_score') + + # Save state after normal score transform + normal_transformed = aa.df.copy() + + # Verify both transforms were applied (data should be different from log transform) + assert not np.allclose(normal_transformed.values, log_transformed.values) + + # Apply the inverse transformation + back_transformed = aa.inverse() + + # Check we get back close to original values + np.testing.assert_allclose(back_transformed.values, original_df.values, rtol=0.1) + + # Test with external already-transformed data + external_transformed = pd.DataFrame({ + 'a': [-0.5, 0.0, 0.5], # Already transformed data in normal score space + 'b': [0.5, 0.0, -0.5], # (approximately in the normal distribution range) + 'c': [1.0, 0.0, -1.0] + }) + + # Test inverse transform on external transformed data + back_external = aa.inverse(external_transformed) + + # Check that shape is preserved + assert back_external.shape == external_transformed.shape + + # Verify output has reasonable values (should 
be in the range of original data) + for col in ['a', 'b']: + # These columns had log transform applied, so should be positive + assert np.all(back_external[col] > 0) + + # Column c should have values in the range of the original data + assert np.min(back_external['c']) >= -40 + assert np.max(back_external['c']) <= -10 + + # Apply transform again to verify roundtrip accuracy + roundtrip = aa.transform(back_external) + + # Check roundtrip accuracy for values within standard normal range (-2 to 2) + for col in external_transformed.columns: + # Find values within the normal range + mask = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) + if mask.any(): + # Get the values to compare + expected = external_transformed.loc[mask, col].values + actual = roundtrip.loc[mask, col].values + + # Handle zeros and near-zeros with absolute tolerance instead of relative + zero_mask = np.isclose(expected, 0, atol=1e-10) + if zero_mask.any(): + # For zeros, use absolute tolerance + np.testing.assert_allclose( + actual[zero_mask], + expected[zero_mask], + atol=0.1 # Absolute tolerance for zeros + ) + + # For non-zeros, use relative tolerance + if (~zero_mask).any(): + np.testing.assert_allclose( + actual[~zero_mask], + expected[~zero_mask], + rtol=0.1 # Relative tolerance for non-zeros + ) + else: + # No zeros, use normal comparison + np.testing.assert_allclose( + actual, + expected, + rtol=0.1 + ) + + # Additional test to verify pipeline order is maintained + # Create a new pipeline with transforms in different order + bb = pyemu.emulators.AutobotsAssemble(original_df.copy()) + + # First normal score, then log10 + bb.apply('normal_score') + bb.apply('log10', columns=['a', 'b']) + + # Apply inverse - should revert log10 first, then normal_score + back_bb = bb.inverse() + + # Check we get back close to original values + np.testing.assert_allclose(back_bb.values, original_df.values, rtol=0.1) + + + +def skewness(x): + """Calculate skewness of a distribution""" + n = 
len(x) + x_mean = np.mean(x) + return (np.sum((x - x_mean) ** 3) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 1.5) + +def kurtosis(x): + """Calculate kurtosis of a distribution""" + n = len(x) + x_mean = np.mean(x) + return (np.sum((x - x_mean) ** 4) / n) / ((np.sum((x - x_mean) ** 2) / n) ** 2) + + + + +def test_normal_score_with_external_data(): + """Test NormalScoreTransformer with external already-transformed data""" + # Create training data with a specific distribution + pyemu.en.rng = pyemu.en.rng.default_rng(42) + n = 100 + training_data = pd.DataFrame({ + 'normal': pyemu.en.rng.normal(5, 2, n), + 'lognormal': np.exp(pyemu.en.rng.normal(1, 0.5, n)), + 'uniform': pyemu.en.rng.uniform(0, 10, n) + }) + + # Create "external" data that we'll pretend is already transformed + # For this test, we'll generate values in the typical normal score range (-3 to 3) + external_transformed = pd.DataFrame({ + 'normal': pyemu.en.rng.normal(0, 1, 1), # Already in normal score space + 'lognormal': pyemu.en.rng.normal(0, 1, 1), + 'uniform': pyemu.en.rng.normal(0, 1, 1) + }) + + # Initialize and fit transformer on training data + nst = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + nst.fit(training_data) + + # Transform training data to verify transformation works + transformed_training = nst.transform(training_data) + + # Check that transformed data has properties of normal distribution + for col in training_data.columns: + # Mean should be close to 0 + assert abs(transformed_training[col].mean()) < 0.3 + # Standard deviation should be close to 1 + assert abs(transformed_training[col].std() - 1.0) < 0.3 + + # Store column parameters for inspection + z_scores = {} + originals = {} + for col in training_data.columns: + params = nst.column_parameters.get(col, {}) + z_scores[col] = params.get('z_scores', []) + originals[col] = params.get('originals', []) + + # Verify column parameters were created + assert len(z_scores[col]) > 0 + assert len(originals[col]) > 0 
+ + # Apply inverse transform to external transformed data directly + back_external = nst.inverse_transform(external_transformed) + + # Verify the shape matches + assert back_external.shape == external_transformed.shape + + # Apply the transform to back_external to check if it recovers external_transformed + re_transformed = nst.transform(back_external) + + # Check that re-transforming recovers values close to the external_transformed + # Note: exact recovery isn't expected due to interpolation/extrapolation + for col in external_transformed.columns: + # Values inside the normal range (-2 to 2) should be very close + inside_range = (external_transformed[col] >= -2) & (external_transformed[col] <= 2) + if inside_range.any(): + np.testing.assert_allclose( + re_transformed.loc[inside_range, col].values, + external_transformed.loc[inside_range, col].values, + rtol=0.2 + ) + + # Test external values that are far outside the z-score range + extreme_transformed = pd.DataFrame({ + 'normal': np.array([-5, 0, 5],dtype=float), # Includes extreme values + 'lognormal': np.array([-5, 0, 5],dtype=float), + 'uniform': np.array([-5, 0, 5],dtype=float) + }) + + # Test with extrapolation first + nst_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=True) + nst_extrap.fit(training_data) + back_extreme_extrap = nst_extrap.inverse_transform(extreme_transformed) + + # Test without extrapolation + nst_no_extrap = pyemu.emulators.NormalScoreTransformer(quadratic_extrapolation=False) + nst_no_extrap.fit(training_data) + back_extreme_no_extrap = nst_no_extrap.inverse_transform(extreme_transformed) + + # With extrapolation, extreme values should be outside the original data range + for col in training_data.columns: + min_orig = training_data[col].min() + max_orig = training_data[col].max() + + # Check extrapolation is working (values outside original range) + assert back_extreme_extrap[col].min() < min_orig or back_extreme_extrap[col].max() > max_orig + + # Without 
extrapolation, values should be clamped to original range + assert back_extreme_no_extrap[col].min() >= min_orig - 1e-10 # Allow for floating point error + assert back_extreme_no_extrap[col].max() <= max_orig + 1e-10 + + # Test with AutobotsAssemble to ensure the pipeline works with external transformed data + aa = pyemu.emulators.AutobotsAssemble(training_data.copy()) + aa.apply('normal_score') + + # Test applying inverse transform to external data + back_from_aa = aa.inverse(external_transformed.copy()) + + # Verify results with direct inverse transform + np.testing.assert_allclose( + back_from_aa.values, + nst.inverse_transform(external_transformed).values, + rtol=1e-3 ) \ No newline at end of file diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index f66478b57..7cbc04066 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -748,8 +748,8 @@ def test_ok_grid(tmp_path): delc = np.ones((nrow)) * 1.0/float(nrow) # num_pts = 0 - # ptx = np.random.random(num_pts) - # pty = np.random.random(num_pts) + # ptx = pyemu.en.rng.random(num_pts) + # pty = pyemu.en.rng.random(num_pts) # ptname = ["p{0}".format(i) for i in range(num_pts)] # pts_data = pd.DataFrame({"x":ptx,"y":pty,"name":ptname}) # pts_data = pts_data.set_index('name', drop=False) @@ -786,8 +786,8 @@ def test_ok_grid_zone(tmp_path): delc = np.ones((nrow)) * 1.0/float(nrow) # num_pts = 0 - # ptx = np.random.random(num_pts) - # pty = np.random.random(num_pts) + # ptx = pyemu.en.rng.random(num_pts) + # pty = pyemu.en.rng.random(num_pts) # ptname = ["p{0}".format(i) for i in range(num_pts)] # pts_data = pd.DataFrame({"x":ptx,"y":pty,"name":ptname}) # pts_data.index = pts_data.name @@ -1069,10 +1069,10 @@ def geostat_draws_test(tmp_path): df = pyemu.pp_utils.pp_tpl_to_dataframe(tpl_file) df.loc[:,"zone"] = np.arange(df.shape[0]) gs = pyemu.geostats.read_struct_file(str_file) - np.random.seed(pyemu.en.SEED) + pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) pe = 
pyemu.helpers.geostatistical_draws(pst_file,{gs:df}, sigma_range=4) - np.random.seed(pyemu.en.SEED) + pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) pe2 = pyemu.helpers.geostatistical_draws(pst_file,{gs:df}, sigma_range=4) pe.to_csv(os.path.join(os.path.join("utils","geostat_pe.csv"))) @@ -1109,8 +1109,8 @@ def geostat_draws_test(tmp_path): # delc = np.ones((nrow)) * 1.0/float(nrow) # # num_pts = 0 -# ptx = np.random.random(num_pts) -# pty = np.random.random(num_pts) +# ptx = pyemu.en.rng.random(num_pts) +# pty = pyemu.en.rng.random(num_pts) # ptname = ["p{0}".format(i) for i in range(num_pts)] # pts_data = pd.DataFrame({"x":ptx,"y":pty,"name":ptname}) # pts_data.index = pts_data.name @@ -2217,8 +2217,8 @@ def ok_grid_invest(tmp_path): delc = np.ones((nrow)) * 1.0/float(nrow) num_pts = 100 - ptx = np.random.random(num_pts) - pty = np.random.random(num_pts) + ptx = pyemu.en.rng.random(num_pts) + pty = pyemu.en.rng.random(num_pts) ptname = ["p{0}".format(i) for i in range(num_pts)] pts_data = pd.DataFrame({"x":ptx,"y":pty,"name":ptname}) pts_data.index = pts_data.name @@ -2284,7 +2284,7 @@ def specsim_test(): variograms = [pyemu.geostats.ExpVario(contribution=contrib, a=a, anisotropy=10, bearing=0)] gs = pyemu.geostats.GeoStruct(variograms=variograms, transform="log", nugget=nugget) - np.random.seed(1) + pyemu.en.rng = pyemu.en.rng.default_rng(1) ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) mean_value = 15.0 @@ -2302,7 +2302,7 @@ def specsim_test(): assert np.abs(var - theo_var) < 0.1 assert np.abs(mean - mean_value) < 0.1 - np.random.seed(1) + pyemu.en.rng = pyemu.en.rng.default_rng(1) variograms = [pyemu.geostats.ExpVario(contribution=contrib, a=a, anisotropy=10, bearing=0)] gs = pyemu.geostats.GeoStruct(variograms=variograms, transform="none", nugget=nugget) @@ -2336,7 +2336,7 @@ def aniso_invest(): variograms = [pyemu.geostats.ExpVario(contribution=2.5,a=2500.0,anisotropy=10,bearing=90)] gs = 
pyemu.geostats.GeoStruct(variograms=variograms,transform="none",nugget=0.0) - np.random.seed(1) + pyemu.en.rng = pyemu.en.rng.default_rng(1) num_reals = 100 start = datetime.now() ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) @@ -2512,8 +2512,8 @@ def geostat_prior_builder2_test(tmp_path): #give some pars narrower bounds to induce a lower variance #par.loc[pst.par_names[10:40], "parubnd"] = par.loc[pst.par_names[10:40], "parval1"] * 1.5 #par.loc[pst.par_names[10:40], "parlbnd"] = par.loc[pst.par_names[10:40], "parval1"] * 0.5 - par.loc[pst.par_names[10:100], "parubnd"] *= np.random.random(90) * 5 - par.loc[pst.par_names[10:100], "parlbnd"] *= np.random.random(90) * 0.5 + par.loc[pst.par_names[10:100], "parubnd"] *= pyemu.en.rng.random(90) * 5 + par.loc[pst.par_names[10:100], "parlbnd"] *= pyemu.en.rng.random(90) * 0.5 # get a diagonal bounds-based cov @@ -2709,9 +2709,9 @@ def ac_draw_test(tmp_path): pst.write(os.path.join(tmp_path, "test.pst")) print(pst.observation_data.distance) - np.random.seed(pyemu.en.SEED) + pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) oe = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=100, enforce_bounds=True) - np.random.seed(pyemu.en.SEED) + pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) oe2 = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=100, enforce_bounds=True) diff = oe - oe2 print(diff.max()) @@ -2827,8 +2827,8 @@ def thresh_pars_test(): arr = np.ones((dim,dim)) gs = pyemu.geostats.GeoStruct(variograms=[pyemu.geostats.ExpVario(1.0,30.0)]) ss = pyemu.geostats.SpecSim2d(np.ones(dim),np.ones(dim),gs) - #seed = np.random.randint(100000) - np.random.seed(9371) + #seed = pyemu.en.rng.integers(100000) + pyemu.en.rng = pyemu.en.rng.default_rng(9371) #print("seed",seed) arr = 10**(ss.draw_arrays()[0]) print(arr) @@ -2921,7 +2921,7 @@ def test_ppu_geostats(tmp_path): # This actually produces 2 pp for each pp because of the prefix_dict above 
#print(par_info_unrot.parnme.value_counts()) # fill with a parval for testing - par_info_unrot.loc[:,"parval1"] = np.random.uniform(10,100,par_info_unrot.shape[0]) + par_info_unrot.loc[:,"parval1"] = pyemu.en.rng.uniform(10,100,par_info_unrot.shape[0]) gs = pyemu.geostats.GeoStruct(variograms=pyemu.geostats.ExpVario(a=1000,contribution=1.0,anisotropy=3.0,bearing=45)) diff --git a/pyemu/eds.py b/pyemu/eds.py index 7c530431c..2e42b6bf2 100644 --- a/pyemu/eds.py +++ b/pyemu/eds.py @@ -1,4 +1,5 @@ from __future__ import print_function, division +import pyemu import os import copy import shutil @@ -341,7 +342,7 @@ def get_posterior_prediction_convergence_summary(self,num_realization_sequence,n rep_results = [] print("-->testing ",nreals) for rep in range(nreps): - rreals = np.random.choice(real_idx,nreals,False) + rreals = pyemu.en.rng.choice(real_idx,nreals,False) sim_ensemble = self.sim_ensemble.iloc[rreals,:].copy() _,dfstd,_ = self.get_posterior_prediction_moments(obslist_dict=obslist_dict, sim_ensemble=sim_ensemble, diff --git a/pyemu/emulators/dsiae.py b/pyemu/emulators/dsiae.py index fd8675549..f3dd74a46 100755 --- a/pyemu/emulators/dsiae.py +++ b/pyemu/emulators/dsiae.py @@ -1,1574 +1,1575 @@ -""" -Data Space Inversion (DSI) Autoencoder (AE) emulator implementation. 
-""" -from __future__ import print_function, division -from typing import Optional, List, Dict, Any, Union -import numpy as np -import pandas as pd -import inspect -from pyemu.utils.helpers import dsi_forward_run,dsi_runstore_forward_run, series_to_insfile -import os -import shutil -from pyemu.pst.pst_handler import Pst -from pyemu.en import ObservationEnsemble,ParameterEnsemble -from .base import Emulator -from .dsi import DSI -import pickle -import tempfile -import zipfile - -try: - import tensorflow as tf - from keras.saving import register_keras_serializable -except ImportError: - tf = None - # Dummy decorator to prevent NameError on class definitions - def register_keras_serializable(package=None, name=None): - def decorator(cls): - return cls - return decorator - -from sklearn.model_selection import train_test_split - - - -class DSIAE(Emulator): - """ - Data Space Inversion Autoencoder (DSIAE) emulator. - """ - - def __init__(self, - pst: Optional['Pst'] = None, - data: Optional[Union[pd.DataFrame, 'ObservationEnsemble']] = None, - transforms: Optional[List[Dict[str, Any]]] = None, - latent_dim: Optional[int] = None, - energy_threshold: float = 1.0, - verbose: bool = False) -> None: - """ - Initialize the DSIAE emulator. - - Args: - pst: PEST control file object. - data: Training data (DataFrame or ObservationEnsemble). - transforms: List of dicts defining preprocessing transformations. - latent_dim: Latent space dimension. If None, determined from energy_threshold. - energy_threshold: Variance threshold for automatic latent dimension (0.0-1.0). - verbose: Enable verbose logging. 
- """ - super().__init__(verbose=verbose) - - self.observation_data = pst.observation_data.copy() if pst is not None else None - - if isinstance(data, ObservationEnsemble): - data = data._df.copy() - - # Ensure float data - self.data = data.astype(float).copy() if data is not None else None - - self.energy_threshold = energy_threshold - - if transforms is not None: - if not isinstance(transforms, list): - raise TypeError("transforms must be a list of dicts") - for t in transforms: - if not isinstance(t, dict) or 'type' not in t: - raise ValueError("Each transform must be a dict with a 'type' key") - if 'columns' in t: - missing = [c for c in t['columns'] if c not in self.data.columns] - if missing: - raise ValueError(f"Transform columns not found in data: {missing}") - - self.transforms = transforms - self.fitted = False - self.data_transformed = self._prepare_training_data() - self.decision_variable_names = None - self.latent_dim = latent_dim - - if self.latent_dim is None and self.data is not None: - self.logger.statement("calculating latent dimension from energy threshold") - self.latent_dim = self._calc_explained_variance() - - - def _prepare_training_data(self) -> pd.DataFrame: - """ - Prepare and transform training data for model fitting. - - This method applies the configured transformation pipeline to the raw training - data, preparing it for use in autoencoder training. If no transformations are - specified, the data is passed through unchanged but a dummy transformer is - still created for consistency in the prediction pipeline. - - Returns - ------- - pd.DataFrame - Transformed training data ready for model fitting. All values will be - numeric (float64) and any specified transformations will have been applied. - - Raises - ------ - ValueError - If no data is stored in the emulator instance. - - Notes - ----- - This method is automatically called during emulator initialization and stores - the transformed data in `self.data_transformed`. 
The transformation pipeline - is preserved in `self.transformer_pipeline` for use during prediction to - ensure consistent data preprocessing. - - The method always creates a transformer pipeline object, even when no - transformations are specified, to maintain consistency in the prediction - workflow where inverse transformations may be needed. - """ - data = self.data - if data is None: - raise ValueError("No data stored in the emulator") - - self.logger.statement("applying feature transforms") - # Always use the base class transformation method for consistency - if self.transforms is not None: - self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) - else: - # Still need to set up a dummy transformer for inverse operations - from .transformers import AutobotsAssemble - self.transformer_pipeline = AutobotsAssemble(data.copy()) - self.data_transformed = data.copy() - - return self.data_transformed - - def encode(self, X: Union[np.ndarray, pd.DataFrame]) -> pd.DataFrame: - """ - Encode input data into latent space representation. - - This method transforms input observation data into the lower-dimensional - latent space learned by the autoencoder. The encoding process applies any - configured data transformations before passing the data through the encoder - network. - - Parameters - ---------- - X : np.ndarray or pd.DataFrame - Input observation data to encode. Should have the same feature structure - as the training data. If DataFrame, the index will be preserved in the - output. Shape should be (n_samples, n_features) where n_features matches - the original observation space dimension. - - Returns - ------- - pd.DataFrame - Encoded latent space representation with shape (n_samples, latent_dim). - If input was a DataFrame, the original index is preserved. Column names - will be generated automatically for the latent dimensions. - - Raises - ------ - ValueError - If the encoder has not been fitted (emulator not trained). 
- If input data shape is incompatible with the trained model. - - Notes - ----- - This method automatically applies the same data transformations that were - used during training, ensuring consistent preprocessing. The transformations - are applied via the stored `transformer_pipeline`. - - The latent space representation can be used for: - - Dimensionality reduction and visualization - - Parameter space exploration - - Input to optimization routines - - Analysis of model behavior in reduced space - - Examples - -------- - >>> # Encode training data - >>> latent_repr = emulator.encode(training_data) - >>> - >>> # Encode new observations - >>> new_latent = emulator.encode(new_observations) - >>> print(f"Latent dimensions: {new_latent.shape[1]}") - """ - # check encoder exists - if not hasattr(self, 'encoder'): - raise ValueError("Encoder not found. Fit the emulator before encoding.") - - if isinstance(X, pd.DataFrame): - index = X.index - - if self.transforms is not None: - X = self.transformer_pipeline.transform(X) - Z = self.encoder.encode(X) - Z = pd.DataFrame(Z, index=index if 'index' in locals() else None) - return Z - - - def _calc_explained_variance(self) -> int: - """ - Calculate optimal latent dimension using PCA explained variance threshold. - - Returns - ------- - int - Minimum latent dimensions to capture `energy_threshold` variance. - Falls back to full dimensionality if 99% variance threshold not reached. - - Notes - ----- - Uses scikit-learn PCA on `self.data_transformed`. The energy_threshold - represents cumulative explained variance ratio (e.g., 0.95 = 95% variance). 
- """ - from sklearn.decomposition import PCA # light dependency; optional - # PCA explained variance (optional) - pca = PCA() - pca.fit(self.data_transformed.values.astype(float)) - cum_explained = np.cumsum(pca.explained_variance_ratio_) - latent_dim = int(np.searchsorted(cum_explained, self.energy_threshold) + 1) if cum_explained[-1] >= 0.99 else len(cum_explained) - return latent_dim - - def fit(self, validation_split: float = 0.1, hidden_dims: tuple = (128, 64), - lr: float = 1e-3, epochs: int = 300, batch_size: int = 128, - early_stopping: bool = True, dropout_rate: float = 0.0, - random_state: int = 42, loss_type: str = 'energy', - loss_kwargs: Optional[Dict[str, Any]] = None, - sample_weight: Optional[np.ndarray] = None) -> 'DSIAE': - """ - Fit the autoencoder emulator to training data. - - Parameters - ---------- - validation_split : float, default 0.1 - Fraction of data to use for validation. - hidden_dims : tuple, default (128, 64) - Hidden layer dimensions for encoder/decoder. - lr : float, default 1e-3 - Learning rate for Adam optimizer. - epochs : int, default 300 - Maximum training epochs. - batch_size : int, default 128 - Training batch size. - early_stopping : bool, default True - Whether to use early stopping on validation loss. - dropout_rate : float, default 0.0 - Dropout rate for regularization during training. - random_state : int, default 42 - Random seed for reproducibility. - loss_type : str, default 'energy' - Type of loss function to use. Options: 'energy', 'mmd', 'wasserstein', - 'statistical', 'adaptive', 'mse', 'huber'. - loss_kwargs : dict, optional - Additional parameters for the loss function. - sample_weight : np.ndarray, optional - Sample weights for training. Shape should be (n_samples,). - - Returns - ------- - DSIAE - Self (fitted emulator instance). 
- """ - - if self.data_transformed is None: - self.logger.statement("transforming training data") - self.data_transformed = self._prepare_training_data() - - X = self.data_transformed.values.astype(float) - if self.latent_dim is None: - self.logger.statement("calculating latent dimension from energy threshold") - self.latent_dim = self._calc_explained_variance() - - # Configure loss function - if loss_kwargs is None: - loss_kwargs = {} - - # Set default loss parameters if not specified - if loss_type == 'energy' and 'lambda_energy' not in loss_kwargs: - loss_kwargs['lambda_energy'] = 1e-3 - elif loss_type == 'mmd' and 'lambda_mmd' not in loss_kwargs: - loss_kwargs['lambda_mmd'] = 1e-3 - elif loss_type == 'wasserstein' and 'lambda_w' not in loss_kwargs: - loss_kwargs['lambda_w'] = 1e-3 - elif loss_type == 'statistical': - if 'lambda_moments' not in loss_kwargs: - loss_kwargs['lambda_moments'] = 1e-3 - if 'lambda_corr' not in loss_kwargs: - loss_kwargs['lambda_corr'] = 5e-4 - if 'lambda_dist' not in loss_kwargs: - loss_kwargs['lambda_dist'] = 1e-3 - - loss_fn = create_distribution_loss(loss_type, **loss_kwargs) - - self.logger.statement(f"using {loss_type} loss function with parameters: {loss_kwargs}") - # train autoencoder on transformed data - ae = AutoEncoder(input_dim=X.shape[1], - latent_dim=self.latent_dim, - hidden_dims=hidden_dims, - loss=loss_fn, - lr=lr, - dropout_rate=dropout_rate, - random_state=random_state, - ) - ae.fit(X, - validation_split=validation_split, - epochs=epochs, batch_size=batch_size, - early_stopping=early_stopping, - patience=10, - sample_weight=sample_weight, - ) - self.encoder = ae - self.fitted = True - return self - - # Reuse implementation from DSI - _write_forward_run_script = DSI._write_forward_run_script - - def predict(self, pvals: Union[np.ndarray, pd.Series, pd.DataFrame]) -> pd.Series: - """ - Generate predictions from the emulator. 
- - Parameters - ---------- - pvals : np.ndarray, pd.Series, or pd.DataFrame - Parameter values for prediction in latent space. - Shape should match latent_dim. - - Returns - ------- - pd.Series - Predicted observation values in original scale. - - Raises - ------ - ValueError - If emulator not fitted or input dimensions incorrect. - """ - if not self.fitted: - raise ValueError("Emulator must be fitted before prediction") - - if self.transforms is not None and (not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None): - raise ValueError("Emulator must be fitted and have valid transformations before prediction") - - if isinstance(pvals, pd.Series): - pvals = pvals.values.flatten().reshape(1,-1).astype(np.float32) - elif isinstance(pvals, np.ndarray) and len(pvals.shape) == 2 and pvals.shape[0] == 1: - pvals = pvals.flatten().reshape(1,-1) - pvals = pvals.astype(np.float32) - elif isinstance(pvals, pd.DataFrame): - index = pvals.index - pvals = pvals.values.astype(np.float32) - - #assert pvals.shape[0] == self.latent_dim , f"Input parameter dimension {pvals.shape[0]} does not match latent dimension {self.latent_dim}" - sim_vals = self.encoder.decode(pvals) - sim_vals = pd.DataFrame(sim_vals, - columns=self.data_transformed.columns, - index=index if 'index' in locals() else None) - sim_vals = sim_vals.squeeze() - #if isinstance(sim_vals, np.ndarray): - # sim_vals = pd.Series(sim_vals.flatten(), index=self.data_transformed.columns) - if self.transforms is not None: - pipeline = self.transformer_pipeline - sim_vals = pipeline.inverse(sim_vals) - sim_vals.index.name = 'obsnme' - sim_vals.name = "obsval" - self.sim_vals = sim_vals - return sim_vals - - def check_for_pdc(self): - """Check for Prior data conflict.""" - #TODO - return - - - - def prepare_dsivc(self, decvar_names: Union[List[str], str], t_d: Optional[str] = None, - pst: Optional['Pst'] = None, oe: Optional['ObservationEnsemble'] = None, - track_stack: bool = False, dsi_args: 
Optional[Dict[str, Any]] = None, - percentiles: List[float] = [0.25, 0.75, 0.5], - mou_population_size: Optional[int] = None, - ies_exe_path: str = "pestpp-ies") -> 'Pst': - """ - Prepare Data Space Inversion Variable Control (DSIVC) control files. - - Parameters - ---------- - decvar_names : list or str - Names of decision variables for optimization. - t_d : str, optional - Template directory path. Uses existing if None. - pst : Pst, optional - PST control file object. Uses existing if None. - oe : ObservationEnsemble, optional - Observation ensemble. Uses existing if None. - track_stack : bool, default False - Whether to include individual ensemble realizations as observations. - dsi_args : dict, optional - DSI configuration arguments. - percentiles : list, default [0.25, 0.75, 0.5] - Percentiles to calculate from ensemble statistics. - mou_population_size : int, optional - Population size for multi-objective optimization. - ies_exe_path : str, default "pestpp-ies" - Path to PEST++ IES executable. - - Returns - ------- - Pst - PEST++ control file object for DSIVC optimization. - - Notes - ----- - Sets up multi-objective optimization with decision variables constrained - to training data bounds. Creates stack statistics observations for ensemble - matching and configures PEST++-MOU options. - """ - # check that percentiles is a list or array of floats between 0 and 1. 
- assert isinstance(percentiles, (list, np.ndarray)), "percentiles must be a list or array of floats" - assert all([isinstance(i, (float, int)) for i in percentiles]), "percentiles must be a list or array of floats" - assert all([0 <= i <= 1 for i in percentiles]), "percentiles must be between 0 and 1" - # ensure that pecentiles are unique - percentiles = np.unique(percentiles) - - - #track dsivc args for forward run - self.dsivc_args = {"percentiles":percentiles, - "decvar_names":decvar_names, - "track_stack":track_stack, - } - - if t_d is None: - self.logger.statement("using existing DSI template dir...") - t_d = self.template_dir - self.logger.statement(f"using {t_d} as template directory...") - assert os.path.exists(t_d), f"template directory {t_d} does not exist" - - if pst is None: - self.logger.statement("no pst provided...") - self.logger.statement("using dsi.pst in DSI template dir...") - assert os.path.exists(os.path.join(t_d,"dsi.pst")), f"dsi.pst not found in {t_d}" - pst = Pst(os.path.join(t_d,"dsi.pst")) - if oe is None: - self.logger.statement(f"no posterior DSI observation ensemble provided, using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...") - assert os.path.exists(os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")), f"dsi.{dsi_args['noptmax']}.obs.jcb not found in {t_d}" - oe = ObservationEnsemble.from_binary(pst,os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")) - else: - assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble" - - #check if decvar_names str - if isinstance(decvar_names, str): - decvar_names = [decvar_names] - # chekc htat decvars are in the oe columns - missing = [col for col in decvar_names if col not in oe.columns] - assert len(missing) == 0, f"The following decvars are missing from the DSI obs ensemble: {missing}" - # chekc htat decvars are in the pst observation data - missing = [col for col in decvar_names if col not in pst.obs_names] - assert len(missing) == 0, f"The following 
decvars are missing from the DSI pst control file: {missing}" - - - # handle DSI args - default_dsi_args = {"noptmax":pst.control_data.noptmax, - "decvar_weight":1.0, - #"decvar_phi_factor":0.5, - "num_pyworkers":1, - } - # ensure it's a dict - if dsi_args is None: - dsi_args = default_dsi_args - elif not isinstance(dsi_args, dict): - raise TypeError("Expected a dictionary for 'options'") - # merge with defaults (user values override defaults) - #dsi_args = {**default_dsi_args, **dsi_args} - else: - for key, value in default_dsi_args.items(): - if key not in dsi_args: - dsi_args[key] = value - - # check that dsi_args has the required keys - required_keys = ["noptmax", "decvar_weight", "num_pyworkers"] - for key in required_keys: - if key not in dsi_args: - raise KeyError(f"Missing required key '{key}' in 'dsi_args'") - self.dsi_args = dsi_args - out_files = [] - - self.logger.statement(f"preparing stack stats observations...") - assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble" - if oe.index.name is None: - id_vars="index" - else: - id_vars=oe.index.name - stack_stats = oe._df.describe(percentiles=percentiles).reset_index().melt(id_vars=id_vars) - stack_stats.rename(columns={"value":"obsval","index":"stat"},inplace=True) - stack_stats['obsnme'] = stack_stats.apply(lambda x: x.variable+"_stat:"+x.stat,axis=1) - stack_stats.set_index("obsnme",inplace=True) - stack_stats = stack_stats.obsval - self.logger.statement(f"stack osb recorded to dsi.stack_stats.csv...") - out_file = os.path.join(t_d,"dsi.stack_stats.csv") - out_files.append(out_file) - stack_stats.to_csv(out_file,float_format="%.6e") - series_to_insfile(out_file,ins_file=None) - - - if track_stack: - self.logger.statement(f"including {oe.values.flatten().shape[0]} stack observations...") - - stack = oe._df.reset_index().melt(id_vars=id_vars) - stack.rename(columns={"value":"obsval"},inplace=True) - stack['obsnme'] = stack.apply(lambda x: x.variable+"_real:"+x.index,axis=1) - 
stack.set_index("obsnme",inplace=True) - stack = stack.obsval - out_file = os.path.join(t_d,"dsi.stack.csv") - out_files.append(out_file) - stack.to_csv(out_file,float_format="%.6e") - series_to_insfile(out_file,ins_file=None) - - - - self.logger.statement(f"prepare DSIVC template files...") - dsi_in_file = os.path.join(t_d, "dsivc_pars.csv") - dsi_tpl_file = dsi_in_file + ".tpl" - ftpl = open(dsi_tpl_file, 'w') - fin = open(dsi_in_file, 'w') - ftpl.write("ptf ~\n") - fin.write("parnme,parval1\n") - ftpl.write("parnme,parval1\n") - for pname in decvar_names: - val = oe._df.loc[:,pname].mean() - fin.write(f"{pname},{val:.6e}\n") - ftpl.write(f"{pname},~ {pname} ~\n") - fin.close() - ftpl.close() - - - self.logger.statement(f"building DSIVC control file...") - pst_dsivc = Pst.from_io_files([dsi_tpl_file],[dsi_in_file],[i+".ins" for i in out_files],out_files,pst_path=".") - - self.logger.statement(f"setting dec var bounds...") - par = pst_dsivc.parameter_data - # set all parameters fixed - par.loc[:,"partrans"] = "fixed" - # constrain decvar pars to training data bounds - par.loc[decvar_names,"pargp"] = "decvars" - par.loc[decvar_names,"partrans"] = "none" - par.loc[decvar_names,"parubnd"] = self.data.loc[:,decvar_names].max() - par.loc[decvar_names,"parlbnd"] = self.data.loc[:,decvar_names].min() - par.loc[decvar_names,"parval1"] = self.data.loc[:,decvar_names].quantile(.5) - - self.logger.statement(f"zero-weighting observation data...") - # prepemtpively set obs weights 0.0 - obs = pst_dsivc.observation_data - obs.loc[:,"weight"] = 0.0 - - self.logger.statement(f"getting obs metadata from DSI observation_data...") - obsorg = pst.observation_data.copy() - columns = [i for i in obsorg.columns if i !='obsnme'] - for o in obsorg.obsnme.values: - obs.loc[obs.obsnme.str.startswith(o), columns] = obsorg.loc[obsorg.obsnme==o, columns].values - - obs.loc[stack_stats.index,"obgnme"] = "stack_stats" - obs.loc[stack_stats.index,"org_obsnme"] = [i.split("_stat:")[0] for i in 
stack_stats.index.values] - pst_dsivc.try_parse_name_metadata() - - #obs.loc[stack.index,"obgnme"] = "stack" - - self.logger.statement(f"building dsivc_forward_run.py...") - pst_dsivc.model_command = "python dsivc_forward_run.py" - from pyemu.utils.helpers import dsivc_forward_run - function_source = inspect.getsource(dsivc_forward_run) - with open(os.path.join(t_d,"dsivc_forward_run.py"),'w') as file: - file.write(function_source) - file.write("\n\n") - file.write("if __name__ == \"__main__\":\n") - file.write(f" {function_source.split('(')[0].split('def ')[1]}(ies_exe_path='{ies_exe_path}')\n") - - self.logger.statement(f"preparing nominal initial population...") - if mou_population_size is None: - # set the population size to 2 * number of decision variables - # this is a good rule of thumb for MOU - mou_population_size = 2 * len(decvar_names) - # these should generally be twice the number of decision variables - if mou_population_size < 2 * len(decvar_names): - self.logger.statement(f"mou population is less than 2x number of decision variables, this may be too small...") - # sample 160 sets of decision variables from a unform distribution - dvpop = ParameterEnsemble.from_uniform_draw(pst_dsivc,num_reals=mou_population_size) - # record to external file for PESTPP-MOU - dvpop.to_binary(os.path.join(t_d,"initial_dvpop.jcb")) - # tell PESTPP-MOU about the new file - pst_dsivc.pestpp_options["mou_dv_population_file"] = 'initial_dvpop.jcb' - - - # some additional PESTPP-MOU options: - pst_dsivc.pestpp_options["mou_population_size"] = mou_population_size #twice the number of decision variables - pst_dsivc.pestpp_options["mou_save_population_every"] = 1 # save lots of files! 
- - pst_dsivc.control_data.noptmax = 0 #just for a test run - pst_dsivc.write(os.path.join(t_d,"dsivc.pst"),version=2) - - # updating the DSI pst control file - self.logger.statement(f"updating DSI pst control file...") - self.logger.statement("overwriting dsi.pst file...") - pst.observation_data.loc[decvar_names, "weight"] = dsi_args["decvar_weight"] - pst.control_data.noptmax = dsi_args["noptmax"] - - #TODO: ensure no noise for dvars obs - - pst.write(os.path.join(t_d,"dsi.pst"), version=2) - - - self.logger.statement("overwriting dsi.pickle file...") - self.decision_variable_names = decvar_names - # re-pickle dsi to track dsivc args - self.save(os.path.join(t_d,"dsi.pickle")) - - self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") - return pst_dsivc - - def hyperparam_search(self, latent_dims: Optional[List[int]] = None, - latent_dim_mults: List[float] = [0.5, 1.0, 2.0], - hidden_dims_list: List[tuple] = [(64, 32), (128, 64)], - lrs: List[float] = [1e-2, 1e-3], - epochs: int = 50, batch_size: int = 32, - random_state: int = 0) -> Dict[tuple, float]: - """ - Grid search over autoencoder hyperparameters. - - Parameters - ---------- - latent_dims : list of int, optional - Latent dimensions to test. If None, uses latent_dim_mults. - latent_dim_mults : list of float, default [0.5, 1.0, 2.0] - Multipliers for current latent_dim if latent_dims not provided. - hidden_dims_list : list of tuple, default [(64, 32), (128, 64)] - Hidden layer architectures to test. - lrs : list of float, default [1e-2, 1e-3] - Learning rates to test. - epochs : int, default 50 - Training epochs for each configuration. - batch_size : int, default 32 - Training batch size. - random_state : int, default 0 - Random seed for reproducibility. - - Returns - ------- - dict - Mapping from (latent_dim, hidden_dims, lr) to validation loss. 
- """ - if latent_dims is None: - assert self.latent_dim is not None, "Either latent_dims or self.latent_dim must be set" - latent_dims = [int(self.latent_dim * m) for m in latent_dim_mults] - - X = self.data_transformed.values.astype(float) - results = AutoEncoder.hyperparam_search( - X, - latent_dims=latent_dims, - hidden_dims_list=hidden_dims_list, - lrs=lrs, - epochs=epochs, - batch_size=batch_size, - random_state=random_state - ) - return results - - def save(self, filename: str) -> None: - """ - Save the emulator to a file. - - Bundles the pickled object and the TensorFlow model into a zip archive. - """ - # Create a temporary directory to save components - with tempfile.TemporaryDirectory() as tmp_dir: - # 1. Save TF model - model_dir = os.path.join(tmp_dir, "tf_model") - if hasattr(self, 'encoder') and self.encoder is not None: - self.encoder.save(model_dir) - - # 2. Remove TF model from self to allow pickling - encoder_ref = self.encoder - self.encoder = None - - # 3. Pickle the rest of the object - pkl_path = os.path.join(tmp_dir, "dsiae.pkl") - with open(pkl_path, "wb") as f: - pickle.dump(self, f) - - # Restore encoder - self.encoder = encoder_ref - - # 4. Zip everything into the target filename - with zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED) as zipf: - # Add pickle - zipf.write(pkl_path, arcname="dsiae.pkl") - # Add TF model directory contents - if os.path.exists(model_dir): - for root, dirs, files in os.walk(model_dir): - for file in files: - file_path = os.path.join(root, file) - arcname = os.path.relpath(file_path, tmp_dir) - zipf.write(file_path, arcname=arcname) - - print(f"Saved emulator to {filename}") - - @classmethod - def load(cls, filename: str) -> 'DSIAE': - """ - Load the emulator from a file. - """ - with tempfile.TemporaryDirectory() as tmp_dir: - with zipfile.ZipFile(filename, 'r') as zipf: - zipf.extractall(tmp_dir) - - # 1. Unpickle - with open(os.path.join(tmp_dir, "dsiae.pkl"), "rb") as f: - obj = pickle.load(f) - - # 2. 
Reload TF model if it exists - model_dir = os.path.join(tmp_dir, "tf_model") - if os.path.exists(model_dir): - # We need to reconstruct the AutoEncoder wrapper - # Since we don't have the init params easily available, we rely on the fact - # that AutoEncoder.load loads the Keras models directly. - # But we need an AutoEncoder instance first. - - # We can create a dummy AutoEncoder instance and then load the weights/models - # However, AutoEncoder.__init__ builds the model. - # We can bypass __init__ or use default params if we are just going to overwrite the models. - - # Better approach: The AutoEncoder class should have a classmethod to load from disk - # or we instantiate it with dummy params and then load. - - # Let's assume we can instantiate it with minimal params. - # We need input_dim and latent_dim. - # obj.data_transformed should be available. - input_dim = obj.data_transformed.shape[1] if obj.data_transformed is not None else 0 - latent_dim = obj.latent_dim if obj.latent_dim is not None else 2 - - # Create a blank AutoEncoder instance - # We use __new__ to bypass __init__ since we are loading the full model structure - ae = AutoEncoder.__new__(AutoEncoder) - ae.load(model_dir) - obj.encoder = ae - - return obj - - def _get_emulator_parameters(self, pst=None): - """ - Get params for DSIAE (latent variables). - """ - Z = self.encode(self.data) - if 'base' in Z.index: - pvals = Z.loc['base',:] - else: - pvals = Z.mean(axis=0) - - npar = self.latent_dim - par_names = [f"dsi_par{i:04d}" for i in range(npar)] - - df = pd.DataFrame(index=par_names) - df["parnme"] = par_names - df["parval1"] = pvals.values.flatten() - df["parlbnd"] = Z.min(axis=0).values - df["parubnd"] = Z.max(axis=0).values - df["pargp"] = "dsi_pars" - df["partrans"] = "none" - - return df - - def _get_emulator_observations(self, pst=None): - """ - Get observations for DSIAE. 
- """ - # Use columns from data (assuming they represent observations) - if self.data is not None: - cols = self.data.columns - df = pd.DataFrame(index=cols) - df["obsnme"] = cols - df["obsval"] = self.data.mean(axis=0) # Use mean as dummy value - df["weight"] = 0.0 - df["obgnme"] = "obgnme" - return df - - def _configure_pst_object(self, pst_obj, pst_original, t_d=None): - """ - Configure DSIAE specific PEST++ options and save dependent files. - """ - if t_d is None: - t_d = "." - - Z = self.encode(self.data) - npar = self.latent_dim - par_names = pst_obj.parameter_data.index.tolist() - assert npar == len(par_names), f"latent dim {npar} does not match number of parameters {len(par_names)}" - Z.columns = par_names - - pe = ParameterEnsemble(pst_obj, Z) - jcb_path = os.path.join(t_d, 'latent_prior.jcb') - pe.to_binary(jcb_path) - pst_obj.pestpp_options['ies_parameter_ensemble'] = 'latent_prior.jcb' - - pst_obj.pestpp_options["save_binary"] = True - pst_obj.pestpp_options["overdue_giveup_fac"] = 1e30 - pst_obj.pestpp_options["overdue_giveup_minutes"] = 1e30 - pst_obj.pestpp_options["panther_agent_freeze_on_fail"] = True - pst_obj.pestpp_options["ies_no_noise"] = False - pst_obj.pestpp_options["ies_subset_size"] = -10 - - # Save dsi.pickle for legacy forward run scripts (like runstor) - self.save(os.path.join(t_d, "dsi.pickle")) - - self.logger.statement(f"Saved latent_prior.jcb to {jcb_path}") - return pst_obj - - def prepare_pestpp(self, t_d, pst=None, verbose=False, use_runstor=False): - """ - Prepare PEST++ interface for DSIAE. - Wraps base implementation. 
- """ - self._use_runstor = use_runstor - pst_obj = super().prepare_pestpp(t_d=t_d, pst=pst, verbose=verbose, - tpl_filename="dsi_pars.csv.tpl", - input_filename="dsi_pars.csv", - ins_filename="dsi_sim_vals.csv.ins", - output_filename="dsi_sim_vals.csv") - - return pst_obj - -class AutoEncoder: - def __init__(self, input_dim: int, latent_dim: int = 2, - hidden_dims: tuple = (128, 64), lr: float = 1e-3, - activation: str = 'relu', loss: str = 'Huber', - dropout_rate: float = 0.0, random_state: int = 0) -> None: - """ - Initialize AutoEncoder. - - Args: - input_dim: Input feature dimension. - latent_dim: Latent space dimension. - hidden_dims: Tuple of hidden layer sizes for encoder (reversed for decoder). - lr: Learning rate. - activation: Activation function name. - loss: Loss function name. - dropout_rate: Dropout rate (0.0-1.0). - random_state: Random seed. - """ - if tf is None: - raise ImportError("TensorFlow is required for AutoEncoder but not installed.") - - self.input_dim = input_dim - self.latent_dim = latent_dim - self.hidden_dims = hidden_dims - self.lr = lr - self.activation = activation - self.loss = loss - self.dropout_rate = dropout_rate - self.random_state = random_state - - tf.random.set_seed(random_state) - np.random.seed(random_state) - self._build_model() - - # Build encoder/decoder - def _build_model(self): - tf.keras.backend.set_floatx('float32') - # Encoder - encoder_inputs = tf.keras.Input(shape=(self.input_dim,)) - x = encoder_inputs - for h in self.hidden_dims: - x = tf.keras.layers.Dense(h, activation=self.activation)(x) - if hasattr(self, 'dropout_rate') and self.dropout_rate > 0: - x = tf.keras.layers.Dropout(self.dropout_rate)(x) - latent = tf.keras.layers.Dense(self.latent_dim, name='latent')(x) - self.encoder = tf.keras.Model(encoder_inputs, latent, name='encoder') - - # Decoder - decoder_inputs = tf.keras.Input(shape=(self.latent_dim,)) - x = decoder_inputs - for h in reversed(self.hidden_dims): - x = tf.keras.layers.Dense(h, 
activation=self.activation)(x) - if hasattr(self, 'dropout_rate') and self.dropout_rate > 0: - x = tf.keras.layers.Dropout(self.dropout_rate)(x) - outputs = tf.keras.layers.Dense(self.input_dim, activation=None)(x) - self.decoder = tf.keras.Model(decoder_inputs, outputs, name='decoder') - - # Autoencoder model - ae_inputs = encoder_inputs - ae_outputs = self.decoder(self.encoder(ae_inputs)) - self.model = tf.keras.Model(ae_inputs, ae_outputs, name='autoencoder') - self.model.compile(optimizer=tf.keras.optimizers.Adam(self.lr), loss=self.loss) - - - def fit(self, X: np.ndarray, X_val: Optional[np.ndarray] = None, - epochs: int = 100, batch_size: int = 32, - validation_split: float = 0.1, early_stopping: bool = True, - patience: int = 10, lr_schedule: Optional[Any] = None, - verbose: int = 2, sample_weight: Optional[np.ndarray] = None, - validation_sample_weight: Optional[np.ndarray] = None) -> Any: - """ - Train the autoencoder. - - Args: - X: Training data. - X_val: Validation data (optional). - epochs: Max epochs. - batch_size: Batch size. - validation_split: Validation split fraction (if X_val is None). - early_stopping: Enable early stopping. - patience: Early stopping patience. - lr_schedule: Learning rate scheduler callback. - verbose: Verbosity level. - sample_weight: Training sample weights. - validation_sample_weight: Validation sample weights. - - Returns: - Training history. 
- """ - # Callbacks - callbacks = [] - if early_stopping: - callbacks.append(tf.keras.callbacks.EarlyStopping( - monitor='val_loss', - patience=patience, - restore_best_weights=True - )) - if lr_schedule is not None: - callbacks.append(lr_schedule) - - - # Train - history = self.model.fit( - X, X, - sample_weight=sample_weight, - validation_split=validation_split, - epochs=epochs, - batch_size=batch_size, - callbacks=callbacks, - verbose=verbose - ) - return history - - def encode(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]) -> np.ndarray: - """ - Encode input data to latent representation. - - Parameters - ---------- - X : np.ndarray, pd.DataFrame, or pd.Series - Input data to encode to latent space. - - Returns - ------- - np.ndarray - Latent representation with shape (n_samples, latent_dim). - """ - if isinstance(X, pd.DataFrame): - X = X.values.astype(np.float32) - elif isinstance(X, pd.Series): - X = X.values.reshape(1,-1).astype(np.float32) - return self.encoder(X, training=False) - - def decode(self, Z: np.ndarray) -> np.ndarray: - """ - Decode latent representation back to input space. - - Parameters - ---------- - Z : np.ndarray - Latent representation with shape (n_samples, latent_dim). - - Returns - ------- - np.ndarray - Reconstructed data with shape (n_samples, input_dim). - """ - #X_hat = self.decoder.predict(Z, verbose=0,) - X_hat = self.decoder(Z,training=False) - return X_hat - - - def save(self, folder: str) -> None: - """ - Save trained models to disk. - """ - os.makedirs(folder, exist_ok=True) - self.encoder.save(os.path.join(folder, 'encoder.keras')) - self.decoder.save(os.path.join(folder, 'decoder.keras')) - self.model.save(os.path.join(folder, 'autoencoder.keras')) - - def load(self, folder: str) -> None: - """ - Load trained models from disk. 
- """ - self.encoder = tf.keras.models.load_model(os.path.join(folder, 'encoder.keras')) - self.decoder = tf.keras.models.load_model(os.path.join(folder, 'decoder.keras')) - self.model = tf.keras.models.load_model(os.path.join(folder, 'autoencoder.keras')) - - - @staticmethod - def hyperparam_search(X: np.ndarray, latent_dims: List[int] = [2, 3, 5], - hidden_dims_list: List[tuple] = [(64, 32), (128, 64)], - lrs: List[float] = [1e-2, 1e-3], epochs: int = 50, - batch_size: int = 32, random_state: int = 42) -> Dict[tuple, float]: - """ - Perform grid search over autoencoder hyperparameters. - - Systematically evaluates different combinations of latent dimensions, - network architectures, and learning rates to find optimal configurations - based on validation loss performance. - - Parameters - ---------- - X : np.ndarray - Training data for hyperparameter optimization. - - latent_dims : list of int, default [2, 3, 5] - Latent space dimensions to evaluate. - - hidden_dims_list : list of tuple, default [(64, 32), (128, 64)] - Network architectures to test. Each tuple specifies hidden layer sizes. - - lrs : list of float, default [1e-2, 1e-3] - Learning rates to evaluate. - - epochs : int, default 50 - Training epochs for each configuration. - - batch_size : int, default 32 - Batch size for training. - - random_state : int, default 42 - Random seed for reproducible train/validation splits. - - Returns - ------- - dict - Mapping from (latent_dim, hidden_dims, lr) tuples to validation loss values. - Lower values indicate better performance. - - Notes - ----- - Uses 10% of data for validation via train_test_split. Each configuration - is trained independently with early stopping disabled to ensure fair - comparison across hyperparameter combinations. 
- - Examples - -------- - >>> results = AutoEncoder.hyperparam_search(X_train, epochs=100) - >>> best_params = min(results.keys(), key=results.get) - >>> print(f"Best configuration: {best_params}") - """ - results = {} - X_train, X_val = train_test_split(X, test_size=0.1, random_state=random_state) - for ld in latent_dims: - for hd in hidden_dims_list: - for lr in lrs: - print(f"Training AE: latent_dim={ld}, hidden_dims={hd}, lr={lr}") - ae = AutoEncoder(input_dim=X.shape[1], latent_dim=ld, hidden_dims=hd, lr=lr) - history = ae.fit(X_train, X_val=X_val, epochs=epochs, batch_size=batch_size,verbose=0) - val_loss = history.history['val_loss'][-1] - results[(ld, hd, lr)] = val_loss - print(f"Validation loss: {val_loss:.4f}") - return results - - - - -# Efficient pairwise L2 distances -def pairwise_distances(x, y, eps=1e-12): - x_norm = tf.reduce_sum(tf.square(x), axis=1, keepdims=True) - y_norm = tf.reduce_sum(tf.square(y), axis=1, keepdims=True) - dist_sq = x_norm + tf.transpose(y_norm) - 2.0 * tf.matmul(x, y, transpose_b=True) - dist_sq = tf.maximum(dist_sq, eps) - return tf.sqrt(dist_sq) - - - -# Energy distance core function -def energy_distance_optimized(y_true, y_pred): - d_xy = pairwise_distances(y_true, y_pred) - cross = 2.0 * tf.reduce_mean(d_xy) - - d_xx = pairwise_distances(y_true, y_true) - d_yy = pairwise_distances(y_pred, y_pred) - - return cross - tf.reduce_mean(d_xx) - tf.reduce_mean(d_yy) - - -# UTILITY FUNCTIONS FOR DISTRIBUTION-AWARE LOSSES -def maximum_mean_discrepancy(x, y, kernel='rbf', sigma=1.0): - """Compute Maximum Mean Discrepancy between two distributions.""" - if kernel == 'rbf': - # RBF kernel k(x,y) = exp(-||x-y||^2 / (2*sigma^2)) - x_norm = tf.reduce_sum(tf.square(x), axis=1, keepdims=True) - y_norm = tf.reduce_sum(tf.square(y), axis=1, keepdims=True) - - # Pairwise distances - xx = x_norm + tf.transpose(x_norm) - 2.0 * tf.matmul(x, x, transpose_b=True) - yy = y_norm + tf.transpose(y_norm) - 2.0 * tf.matmul(y, y, transpose_b=True) - xy 
= x_norm + tf.transpose(y_norm) - 2.0 * tf.matmul(x, y, transpose_b=True) - - # Apply RBF kernel - k_xx = tf.exp(-xx / (2 * sigma**2)) - k_yy = tf.exp(-yy / (2 * sigma**2)) - k_xy = tf.exp(-xy / (2 * sigma**2)) - - elif kernel == 'linear': - k_xx = tf.matmul(x, x, transpose_b=True) - k_yy = tf.matmul(y, y, transpose_b=True) - k_xy = tf.matmul(x, y, transpose_b=True) - else: - raise ValueError(f"Unsupported kernel: {kernel}") - - # MMD calculation - mmd = tf.reduce_mean(k_xx) + tf.reduce_mean(k_yy) - 2.0 * tf.reduce_mean(k_xy) - return tf.maximum(mmd, 0.0) # Ensure non-negative - - -def wasserstein_distance_sliced(x, y, num_projections=50): - """Approximate Wasserstein-1 distance using sliced Wasserstein distance.""" - # Generate random projections - d = tf.shape(x)[1] - theta = tf.random.normal([d, num_projections]) - theta = theta / tf.norm(theta, axis=0, keepdims=True) - - # Project data onto random directions - x_proj = tf.matmul(x, theta) # [batch_size, num_projections] - y_proj = tf.matmul(y, theta) # [batch_size, num_projections] - - # Sort projections - x_sorted = tf.sort(x_proj, axis=0) - y_sorted = tf.sort(y_proj, axis=0) - - # Compute L1 distance between sorted projections - distances = tf.reduce_mean(tf.abs(x_sorted - y_sorted), axis=0) - return tf.reduce_mean(distances) - - -def correlation_loss(x, y): - """Penalize differences in correlation structure between datasets.""" - # Center the data - x_centered = x - tf.reduce_mean(x, axis=0, keepdims=True) - y_centered = y - tf.reduce_mean(y, axis=0, keepdims=True) - - # Compute correlation matrices - x_cov = tf.matmul(x_centered, x_centered, transpose_a=True) / tf.cast(tf.shape(x)[0] - 1, tf.float32) - y_cov = tf.matmul(y_centered, y_centered, transpose_a=True) / tf.cast(tf.shape(y)[0] - 1, tf.float32) - - # Normalize to get correlation - x_std = tf.sqrt(tf.diag_part(x_cov)) - y_std = tf.sqrt(tf.diag_part(y_cov)) - - x_corr = x_cov / (tf.expand_dims(x_std, 0) * tf.expand_dims(x_std, 1)) - y_corr = y_cov / 
(tf.expand_dims(y_std, 0) * tf.expand_dims(y_std, 1)) - - # Frobenius norm of difference - return tf.reduce_mean(tf.square(x_corr - y_corr)) - - - -if tf is not None: - LossBase = tf.keras.losses.Loss -else: - class LossBase: - def __init__(self, name=None, **kwargs): - pass - def __call__(self, *args, **kwargs): - pass - -@register_keras_serializable(package="pyemu_emulators", name="EnergyLoss") -class EnergyLoss(LossBase): - """ - Energy distance loss combining MSE reconstruction with energy distance. - - The energy distance measures dissimilarity between probability distributions - and helps ensure the reconstructed samples preserve the overall data distribution. - """ - - def __init__(self, lambda_energy=1e-2, name="energy_loss"): - super().__init__(name=name) - self.lambda_energy = lambda_energy - - def call(self, y_true, y_pred): - mse = tf.reduce_mean(tf.square(y_true - y_pred)) - ed = energy_distance_optimized(y_true, y_pred) - return mse + self.lambda_energy * ed - - def get_config(self): - return { - "lambda_energy": self.lambda_energy, - "name": self.name, - } - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@register_keras_serializable(package="pyemu_emulators", name="MMDLoss") -class MMDLoss(LossBase): - """ - Maximum Mean Discrepancy loss for distribution matching. - - MMD measures the distance between distributions in a reproducing kernel - Hilbert space. More computationally efficient than energy distance. 
- """ - - def __init__(self, lambda_mmd=1e-2, kernel='rbf', sigma=1.0, name="mmd_loss"): - super().__init__(name=name) - self.lambda_mmd = lambda_mmd - self.kernel = kernel - self.sigma = sigma - - def call(self, y_true, y_pred): - mse = tf.reduce_mean(tf.square(y_true - y_pred)) - mmd = maximum_mean_discrepancy(y_true, y_pred, kernel=self.kernel, sigma=self.sigma) - return mse + self.lambda_mmd * mmd - - def get_config(self): - return { - "lambda_mmd": self.lambda_mmd, - "kernel": self.kernel, - "sigma": self.sigma, - "name": self.name, - } - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@register_keras_serializable(package="pyemu_emulators", name="WassersteinLoss") -class WassersteinLoss(LossBase): - """ - Sliced Wasserstein distance loss for distribution matching. - - Uses random projections to approximate the Wasserstein-1 distance, - which is particularly effective for high-dimensional distributions. - """ - - def __init__(self, lambda_w=1e-2, num_projections=50, name="wasserstein_loss"): - super().__init__(name=name) - self.lambda_w = lambda_w - self.num_projections = num_projections - - def call(self, y_true, y_pred): - mse = tf.reduce_mean(tf.square(y_true - y_pred)) - w_dist = wasserstein_distance_sliced(y_true, y_pred, self.num_projections) - return mse + self.lambda_w * w_dist - - def get_config(self): - return { - "lambda_w": self.lambda_w, - "num_projections": self.num_projections, - "name": self.name, - } - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@register_keras_serializable(package="pyemu_emulators", name="StatisticalLoss") -class StatisticalLoss(LossBase): - """ - Multi-component statistical loss for comprehensive distribution matching. 
- - Combines reconstruction error with multiple statistical measures: - - Moment matching (mean, variance, skewness, kurtosis) - - Correlation structure preservation - - Optional distribution distance (MMD or Energy) - """ - - def __init__(self, lambda_moments=1e-2, lambda_corr=1e-3, lambda_dist=1e-3, - dist_type='mmd', mmd_sigma=1.0, name="statistical_loss"): - super().__init__(name=name) - self.lambda_moments = lambda_moments - self.lambda_corr = lambda_corr - self.lambda_dist = lambda_dist - self.dist_type = dist_type - self.mmd_sigma = mmd_sigma - - def call(self, y_true, y_pred): - # Reconstruction loss - mse = tf.reduce_mean(tf.square(y_true - y_pred)) - - # Moment matching - moments_loss = 0.0 - for moment in range(1, 5): # mean, variance, skewness, kurtosis - true_moment = tf.reduce_mean(tf.pow(y_true - tf.reduce_mean(y_true, axis=0), moment), axis=0) - pred_moment = tf.reduce_mean(tf.pow(y_pred - tf.reduce_mean(y_pred, axis=0), moment), axis=0) - moments_loss += tf.reduce_mean(tf.square(true_moment - pred_moment)) - - # Correlation structure loss - corr_loss = correlation_loss(y_true, y_pred) - - # Distribution distance - if self.dist_type == 'mmd': - dist_loss = maximum_mean_discrepancy(y_true, y_pred, sigma=self.mmd_sigma) - elif self.dist_type == 'energy': - dist_loss = energy_distance_optimized(y_true, y_pred) - else: - dist_loss = 0.0 - - total_loss = (mse + - self.lambda_moments * moments_loss + - self.lambda_corr * corr_loss + - self.lambda_dist * dist_loss) - - return total_loss - - def get_config(self): - return { - "lambda_moments": self.lambda_moments, - "lambda_corr": self.lambda_corr, - "lambda_dist": self.lambda_dist, - "dist_type": self.dist_type, - "mmd_sigma": self.mmd_sigma, - "name": self.name, - } - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@register_keras_serializable(package="pyemu_emulators", name="AdaptiveLoss") -class AdaptiveLoss(LossBase): - """ - Adaptive loss that balances reconstruction and 
distribution terms dynamically. - - Automatically adjusts the weighting between reconstruction and distribution - preservation based on their relative magnitudes during training. - """ - - def __init__(self, base_lambda=1e-2, adaptation_rate=0.01, min_lambda=1e-5, - max_lambda=1e-1, name="adaptive_loss"): - super().__init__(name=name) - self.base_lambda = base_lambda - self.adaptation_rate = adaptation_rate - self.min_lambda = min_lambda - self.max_lambda = max_lambda - self.current_lambda = tf.Variable(base_lambda, trainable=False, name="adaptive_lambda") - - def call(self, y_true, y_pred): - mse = tf.reduce_mean(tf.square(y_true - y_pred)) - ed = energy_distance_optimized(y_true, y_pred) - - # Adaptive weighting based on relative magnitudes - mse_magnitude = tf.stop_gradient(mse) - ed_magnitude = tf.stop_gradient(ed) - - # Update lambda to balance the terms - ratio = ed_magnitude / (mse_magnitude + 1e-8) - target_lambda = self.base_lambda * tf.clip_by_value(ratio, 0.1, 10.0) - - # Smooth update of lambda - self.current_lambda.assign( - self.current_lambda * (1 - self.adaptation_rate) + - target_lambda * self.adaptation_rate - ) - - # Clip lambda to reasonable bounds - clipped_lambda = tf.clip_by_value(self.current_lambda, self.min_lambda, self.max_lambda) - - return mse + clipped_lambda * ed - - def get_config(self): - return { - "base_lambda": self.base_lambda, - "adaptation_rate": self.adaptation_rate, - "min_lambda": self.min_lambda, - "max_lambda": self.max_lambda, - "name": self.name, - } - - @classmethod - def from_config(cls, config): - return cls(**config) - - -@register_keras_serializable(package="custom_losses") -class PerSampleMSE(LossBase): - def __init__(self, name="per_sample_mse"): - super().__init__(reduction="none", name=name) - - def call(self, y_true, y_pred): - # shape (batch,) - return tf.reduce_mean(tf.square(y_true - y_pred), axis=1) - - - def get_config(self): - return {"name": self.name} - - @classmethod - def from_config(cls, config): - 
return cls(**config) - - - -def create_observation_weights(data: Union[pd.DataFrame, np.ndarray], - observed_values: List[float], - critical_features: List[int], - weight_type: str = 'inverse_distance', - temperature: float = 1.0, - normalize: bool = True, - clip_range: tuple = (0.1, 10.0)) -> np.ndarray: - """ - Create sample weights based on proximity to observed values. - - Parameters - ---------- - data : pd.DataFrame or np.ndarray - Training data with shape (n_samples, n_features) - observed_values : list of float - Target observed values at critical features - critical_features : list of int - Column indices of critical observation features - weight_type : str, default 'inverse_distance' - Type of weighting: 'inverse_distance', 'gaussian', 'exponential' - temperature : float, default 1.0 - Temperature parameter for weight decay (lower = sharper weighting) - normalize : bool, default True - Whether to normalize weights to mean = 1.0 - clip_range : tuple, default (0.1, 10.0) - Range to clip extreme weights (min, max) - - Returns - ------- - np.ndarray - Sample weights with shape (n_samples,) - """ - if isinstance(data, pd.DataFrame): - data = data.values - - observed_values = np.array(observed_values) - sample_weights = np.ones(len(data)) - - for i in range(len(data)): - sample_values = data[i][critical_features] - - if weight_type == 'inverse_distance': - distance = np.sqrt(np.sum((sample_values - observed_values)**2)) - weight = 1.0 / (1.0 + distance / temperature) - - elif weight_type == 'gaussian': - distance_sq = np.sum((sample_values - observed_values)**2) - weight = np.exp(-distance_sq / (2.0 * temperature**2)) - - elif weight_type == 'exponential': - distance = np.sqrt(np.sum((sample_values - observed_values)**2)) - weight = np.exp(-distance / temperature) - - else: - raise ValueError(f"Unknown weight_type: {weight_type}") - - sample_weights[i] = weight - - if normalize: - sample_weights = sample_weights / np.mean(sample_weights) - - if clip_range is 
not None: - sample_weights = np.clip(sample_weights, clip_range[0], clip_range[1]) - - return sample_weights - - -def create_pest_observation_weights(pst: 'Pst', - emulator_data: pd.DataFrame, - weight_scaling: float = 1.0, - **kwargs) -> np.ndarray: - """ - Create sample weights using PEST observation data. - - Parameters - ---------- - pst : Pst - PEST control file object with observation data - emulator_data : pd.DataFrame - Training data for the emulator - weight_scaling : float, default 1.0 - Overall scaling factor for weights - **kwargs - Additional arguments passed to create_observation_weights - - Returns - ------- - np.ndarray - Sample weights based on PEST observations - """ - obs_data = pst.observation_data - - # Map observation names to column indices - critical_features = [] - observed_values = [] - - for obs_name in obs_data.index: - if obs_name in emulator_data.columns: - col_idx = emulator_data.columns.get_loc(obs_name) - critical_features.append(col_idx) - observed_values.append(obs_data.loc[obs_name, 'obsval']) - - if len(critical_features) == 0: - raise ValueError("No matching observations found between PST and emulator data") - - weights = create_observation_weights( - emulator_data, observed_values, critical_features, **kwargs - ) - - return weights * weight_scaling - - - -def create_distribution_loss(loss_type='energy', **kwargs): - """ - Factory function to create distribution-aware loss functions. 
- - Parameters - ---------- - loss_type : str - Type of loss function to create: - - 'energy': EnergyLoss (default, robust but computationally expensive) - - 'mmd': MMDLoss (efficient, good for high-dim data) - - 'wasserstein': WassersteinLoss (good for smooth distributions) - - 'statistical': StatisticalLoss (comprehensive statistical matching) - - 'adaptive': AdaptiveLoss (automatically balances terms) - - 'mse': Standard MSE (no distribution matching) - - 'huber': Huber loss (robust to outliers, no distribution matching) - **kwargs : dict - Additional parameters specific to each loss type - - Returns - ------- - tf.keras.losses.Loss - Configured loss function - - Examples - -------- - >>> # Energy loss with custom weighting - >>> loss = create_distribution_loss('energy', lambda_energy=1e-3) - >>> - >>> # MMD loss with RBF kernel - >>> loss = create_distribution_loss('mmd', lambda_mmd=1e-2, sigma=2.0) - >>> - >>> # Statistical loss with all components - >>> loss = create_distribution_loss('statistical', - ... lambda_moments=1e-2, - ... lambda_corr=1e-3, - ... lambda_dist=5e-3) - """ - if loss_type == 'energy': - return EnergyLoss(**kwargs) - elif loss_type == 'mmd': - return MMDLoss(**kwargs) - elif loss_type == 'wasserstein': - return WassersteinLoss(**kwargs) - elif loss_type == 'statistical': - return StatisticalLoss(**kwargs) - elif loss_type == 'adaptive': - return AdaptiveLoss(**kwargs) - elif loss_type == 'per_sample_mse': - return PerSampleMSE(**kwargs) - elif loss_type == 'mse': - return 'mse' - elif loss_type == 'huber': - return tf.keras.losses.Huber(**kwargs) - - else: - raise ValueError(f"Unknown loss type: {loss_type}. " - f"Supported types: energy, mmd, wasserstein, statistical, " +""" +Data Space Inversion (DSI) Autoencoder (AE) emulator implementation. 
+""" +from __future__ import print_function, division +import pyemu +from typing import Optional, List, Dict, Any, Union +import numpy as np +import pandas as pd +import inspect +from pyemu.utils.helpers import dsi_forward_run,dsi_runstore_forward_run, series_to_insfile +import os +import shutil +from pyemu.pst.pst_handler import Pst +from pyemu.en import ObservationEnsemble,ParameterEnsemble +from .base import Emulator +from .dsi import DSI +import pickle +import tempfile +import zipfile + +try: + import tensorflow as tf + from keras.saving import register_keras_serializable +except ImportError: + tf = None + # Dummy decorator to prevent NameError on class definitions + def register_keras_serializable(package=None, name=None): + def decorator(cls): + return cls + return decorator + +from sklearn.model_selection import train_test_split + + + +class DSIAE(Emulator): + """ + Data Space Inversion Autoencoder (DSIAE) emulator. + """ + + def __init__(self, + pst: Optional['Pst'] = None, + data: Optional[Union[pd.DataFrame, 'ObservationEnsemble']] = None, + transforms: Optional[List[Dict[str, Any]]] = None, + latent_dim: Optional[int] = None, + energy_threshold: float = 1.0, + verbose: bool = False) -> None: + """ + Initialize the DSIAE emulator. + + Args: + pst: PEST control file object. + data: Training data (DataFrame or ObservationEnsemble). + transforms: List of dicts defining preprocessing transformations. + latent_dim: Latent space dimension. If None, determined from energy_threshold. + energy_threshold: Variance threshold for automatic latent dimension (0.0-1.0). + verbose: Enable verbose logging. 
+ """ + super().__init__(verbose=verbose) + + self.observation_data = pst.observation_data.copy() if pst is not None else None + + if isinstance(data, ObservationEnsemble): + data = data._df.copy() + + # Ensure float data + self.data = data.astype(float).copy() if data is not None else None + + self.energy_threshold = energy_threshold + + if transforms is not None: + if not isinstance(transforms, list): + raise TypeError("transforms must be a list of dicts") + for t in transforms: + if not isinstance(t, dict) or 'type' not in t: + raise ValueError("Each transform must be a dict with a 'type' key") + if 'columns' in t: + missing = [c for c in t['columns'] if c not in self.data.columns] + if missing: + raise ValueError(f"Transform columns not found in data: {missing}") + + self.transforms = transforms + self.fitted = False + self.data_transformed = self._prepare_training_data() + self.decision_variable_names = None + self.latent_dim = latent_dim + + if self.latent_dim is None and self.data is not None: + self.logger.statement("calculating latent dimension from energy threshold") + self.latent_dim = self._calc_explained_variance() + + + def _prepare_training_data(self) -> pd.DataFrame: + """ + Prepare and transform training data for model fitting. + + This method applies the configured transformation pipeline to the raw training + data, preparing it for use in autoencoder training. If no transformations are + specified, the data is passed through unchanged but a dummy transformer is + still created for consistency in the prediction pipeline. + + Returns + ------- + pd.DataFrame + Transformed training data ready for model fitting. All values will be + numeric (float64) and any specified transformations will have been applied. + + Raises + ------ + ValueError + If no data is stored in the emulator instance. + + Notes + ----- + This method is automatically called during emulator initialization and stores + the transformed data in `self.data_transformed`. 
The transformation pipeline + is preserved in `self.transformer_pipeline` for use during prediction to + ensure consistent data preprocessing. + + The method always creates a transformer pipeline object, even when no + transformations are specified, to maintain consistency in the prediction + workflow where inverse transformations may be needed. + """ + data = self.data + if data is None: + raise ValueError("No data stored in the emulator") + + self.logger.statement("applying feature transforms") + # Always use the base class transformation method for consistency + if self.transforms is not None: + self.data_transformed = self._fit_transformer_pipeline(data, self.transforms) + else: + # Still need to set up a dummy transformer for inverse operations + from .transformers import AutobotsAssemble + self.transformer_pipeline = AutobotsAssemble(data.copy()) + self.data_transformed = data.copy() + + return self.data_transformed + + def encode(self, X: Union[np.ndarray, pd.DataFrame]) -> pd.DataFrame: + """ + Encode input data into latent space representation. + + This method transforms input observation data into the lower-dimensional + latent space learned by the autoencoder. The encoding process applies any + configured data transformations before passing the data through the encoder + network. + + Parameters + ---------- + X : np.ndarray or pd.DataFrame + Input observation data to encode. Should have the same feature structure + as the training data. If DataFrame, the index will be preserved in the + output. Shape should be (n_samples, n_features) where n_features matches + the original observation space dimension. + + Returns + ------- + pd.DataFrame + Encoded latent space representation with shape (n_samples, latent_dim). + If input was a DataFrame, the original index is preserved. Column names + will be generated automatically for the latent dimensions. + + Raises + ------ + ValueError + If the encoder has not been fitted (emulator not trained). 
+ If input data shape is incompatible with the trained model. + + Notes + ----- + This method automatically applies the same data transformations that were + used during training, ensuring consistent preprocessing. The transformations + are applied via the stored `transformer_pipeline`. + + The latent space representation can be used for: + - Dimensionality reduction and visualization + - Parameter space exploration + - Input to optimization routines + - Analysis of model behavior in reduced space + + Examples + -------- + >>> # Encode training data + >>> latent_repr = emulator.encode(training_data) + >>> + >>> # Encode new observations + >>> new_latent = emulator.encode(new_observations) + >>> print(f"Latent dimensions: {new_latent.shape[1]}") + """ + # check encoder exists + if not hasattr(self, 'encoder'): + raise ValueError("Encoder not found. Fit the emulator before encoding.") + + if isinstance(X, pd.DataFrame): + index = X.index + + if self.transforms is not None: + X = self.transformer_pipeline.transform(X) + Z = self.encoder.encode(X) + Z = pd.DataFrame(Z, index=index if 'index' in locals() else None) + return Z + + + def _calc_explained_variance(self) -> int: + """ + Calculate optimal latent dimension using PCA explained variance threshold. + + Returns + ------- + int + Minimum latent dimensions to capture `energy_threshold` variance. + Falls back to full dimensionality if 99% variance threshold not reached. + + Notes + ----- + Uses scikit-learn PCA on `self.data_transformed`. The energy_threshold + represents cumulative explained variance ratio (e.g., 0.95 = 95% variance). 
+ """ + from sklearn.decomposition import PCA # light dependency; optional + # PCA explained variance (optional) + pca = PCA() + pca.fit(self.data_transformed.values.astype(float)) + cum_explained = np.cumsum(pca.explained_variance_ratio_) + latent_dim = int(np.searchsorted(cum_explained, self.energy_threshold) + 1) if cum_explained[-1] >= 0.99 else len(cum_explained) + return latent_dim + + def fit(self, validation_split: float = 0.1, hidden_dims: tuple = (128, 64), + lr: float = 1e-3, epochs: int = 300, batch_size: int = 128, + early_stopping: bool = True, dropout_rate: float = 0.0, + random_state: int = 42, loss_type: str = 'energy', + loss_kwargs: Optional[Dict[str, Any]] = None, + sample_weight: Optional[np.ndarray] = None) -> 'DSIAE': + """ + Fit the autoencoder emulator to training data. + + Parameters + ---------- + validation_split : float, default 0.1 + Fraction of data to use for validation. + hidden_dims : tuple, default (128, 64) + Hidden layer dimensions for encoder/decoder. + lr : float, default 1e-3 + Learning rate for Adam optimizer. + epochs : int, default 300 + Maximum training epochs. + batch_size : int, default 128 + Training batch size. + early_stopping : bool, default True + Whether to use early stopping on validation loss. + dropout_rate : float, default 0.0 + Dropout rate for regularization during training. + random_state : int, default 42 + Random seed for reproducibility. + loss_type : str, default 'energy' + Type of loss function to use. Options: 'energy', 'mmd', 'wasserstein', + 'statistical', 'adaptive', 'mse', 'huber'. + loss_kwargs : dict, optional + Additional parameters for the loss function. + sample_weight : np.ndarray, optional + Sample weights for training. Shape should be (n_samples,). + + Returns + ------- + DSIAE + Self (fitted emulator instance). 
+ """ + + if self.data_transformed is None: + self.logger.statement("transforming training data") + self.data_transformed = self._prepare_training_data() + + X = self.data_transformed.values.astype(float) + if self.latent_dim is None: + self.logger.statement("calculating latent dimension from energy threshold") + self.latent_dim = self._calc_explained_variance() + + # Configure loss function + if loss_kwargs is None: + loss_kwargs = {} + + # Set default loss parameters if not specified + if loss_type == 'energy' and 'lambda_energy' not in loss_kwargs: + loss_kwargs['lambda_energy'] = 1e-3 + elif loss_type == 'mmd' and 'lambda_mmd' not in loss_kwargs: + loss_kwargs['lambda_mmd'] = 1e-3 + elif loss_type == 'wasserstein' and 'lambda_w' not in loss_kwargs: + loss_kwargs['lambda_w'] = 1e-3 + elif loss_type == 'statistical': + if 'lambda_moments' not in loss_kwargs: + loss_kwargs['lambda_moments'] = 1e-3 + if 'lambda_corr' not in loss_kwargs: + loss_kwargs['lambda_corr'] = 5e-4 + if 'lambda_dist' not in loss_kwargs: + loss_kwargs['lambda_dist'] = 1e-3 + + loss_fn = create_distribution_loss(loss_type, **loss_kwargs) + + self.logger.statement(f"using {loss_type} loss function with parameters: {loss_kwargs}") + # train autoencoder on transformed data + ae = AutoEncoder(input_dim=X.shape[1], + latent_dim=self.latent_dim, + hidden_dims=hidden_dims, + loss=loss_fn, + lr=lr, + dropout_rate=dropout_rate, + random_state=random_state, + ) + ae.fit(X, + validation_split=validation_split, + epochs=epochs, batch_size=batch_size, + early_stopping=early_stopping, + patience=10, + sample_weight=sample_weight, + ) + self.encoder = ae + self.fitted = True + return self + + # Reuse implementation from DSI + _write_forward_run_script = DSI._write_forward_run_script + + def predict(self, pvals: Union[np.ndarray, pd.Series, pd.DataFrame]) -> pd.Series: + """ + Generate predictions from the emulator. 
+ + Parameters + ---------- + pvals : np.ndarray, pd.Series, or pd.DataFrame + Parameter values for prediction in latent space. + Shape should match latent_dim. + + Returns + ------- + pd.Series + Predicted observation values in original scale. + + Raises + ------ + ValueError + If emulator not fitted or input dimensions incorrect. + """ + if not self.fitted: + raise ValueError("Emulator must be fitted before prediction") + + if self.transforms is not None and (not hasattr(self, 'transformer_pipeline') or self.transformer_pipeline is None): + raise ValueError("Emulator must be fitted and have valid transformations before prediction") + + if isinstance(pvals, pd.Series): + pvals = pvals.values.flatten().reshape(1,-1).astype(np.float32) + elif isinstance(pvals, np.ndarray) and len(pvals.shape) == 2 and pvals.shape[0] == 1: + pvals = pvals.flatten().reshape(1,-1) + pvals = pvals.astype(np.float32) + elif isinstance(pvals, pd.DataFrame): + index = pvals.index + pvals = pvals.values.astype(np.float32) + + #assert pvals.shape[0] == self.latent_dim , f"Input parameter dimension {pvals.shape[0]} does not match latent dimension {self.latent_dim}" + sim_vals = self.encoder.decode(pvals) + sim_vals = pd.DataFrame(sim_vals, + columns=self.data_transformed.columns, + index=index if 'index' in locals() else None) + sim_vals = sim_vals.squeeze() + #if isinstance(sim_vals, np.ndarray): + # sim_vals = pd.Series(sim_vals.flatten(), index=self.data_transformed.columns) + if self.transforms is not None: + pipeline = self.transformer_pipeline + sim_vals = pipeline.inverse(sim_vals) + sim_vals.index.name = 'obsnme' + sim_vals.name = "obsval" + self.sim_vals = sim_vals + return sim_vals + + def check_for_pdc(self): + """Check for Prior data conflict.""" + #TODO + return + + + + def prepare_dsivc(self, decvar_names: Union[List[str], str], t_d: Optional[str] = None, + pst: Optional['Pst'] = None, oe: Optional['ObservationEnsemble'] = None, + track_stack: bool = False, dsi_args: 
Optional[Dict[str, Any]] = None, + percentiles: List[float] = [0.25, 0.75, 0.5], + mou_population_size: Optional[int] = None, + ies_exe_path: str = "pestpp-ies") -> 'Pst': + """ + Prepare Data Space Inversion Variable Control (DSIVC) control files. + + Parameters + ---------- + decvar_names : list or str + Names of decision variables for optimization. + t_d : str, optional + Template directory path. Uses existing if None. + pst : Pst, optional + PST control file object. Uses existing if None. + oe : ObservationEnsemble, optional + Observation ensemble. Uses existing if None. + track_stack : bool, default False + Whether to include individual ensemble realizations as observations. + dsi_args : dict, optional + DSI configuration arguments. + percentiles : list, default [0.25, 0.75, 0.5] + Percentiles to calculate from ensemble statistics. + mou_population_size : int, optional + Population size for multi-objective optimization. + ies_exe_path : str, default "pestpp-ies" + Path to PEST++ IES executable. + + Returns + ------- + Pst + PEST++ control file object for DSIVC optimization. + + Notes + ----- + Sets up multi-objective optimization with decision variables constrained + to training data bounds. Creates stack statistics observations for ensemble + matching and configures PEST++-MOU options. + """ + # check that percentiles is a list or array of floats between 0 and 1. 
+ assert isinstance(percentiles, (list, np.ndarray)), "percentiles must be a list or array of floats" + assert all([isinstance(i, (float, int)) for i in percentiles]), "percentiles must be a list or array of floats" + assert all([0 <= i <= 1 for i in percentiles]), "percentiles must be between 0 and 1" + # ensure that pecentiles are unique + percentiles = np.unique(percentiles) + + + #track dsivc args for forward run + self.dsivc_args = {"percentiles":percentiles, + "decvar_names":decvar_names, + "track_stack":track_stack, + } + + if t_d is None: + self.logger.statement("using existing DSI template dir...") + t_d = self.template_dir + self.logger.statement(f"using {t_d} as template directory...") + assert os.path.exists(t_d), f"template directory {t_d} does not exist" + + if pst is None: + self.logger.statement("no pst provided...") + self.logger.statement("using dsi.pst in DSI template dir...") + assert os.path.exists(os.path.join(t_d,"dsi.pst")), f"dsi.pst not found in {t_d}" + pst = Pst(os.path.join(t_d,"dsi.pst")) + if oe is None: + self.logger.statement(f"no posterior DSI observation ensemble provided, using dsi.{dsi_args['noptmax']}.obs.jcb in DSI template dir...") + assert os.path.exists(os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")), f"dsi.{dsi_args['noptmax']}.obs.jcb not found in {t_d}" + oe = ObservationEnsemble.from_binary(pst,os.path.join(t_d,f"dsi.{dsi_args['noptmax']}.obs.jcb")) + else: + assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble" + + #check if decvar_names str + if isinstance(decvar_names, str): + decvar_names = [decvar_names] + # chekc htat decvars are in the oe columns + missing = [col for col in decvar_names if col not in oe.columns] + assert len(missing) == 0, f"The following decvars are missing from the DSI obs ensemble: {missing}" + # chekc htat decvars are in the pst observation data + missing = [col for col in decvar_names if col not in pst.obs_names] + assert len(missing) == 0, f"The following 
decvars are missing from the DSI pst control file: {missing}" + + + # handle DSI args + default_dsi_args = {"noptmax":pst.control_data.noptmax, + "decvar_weight":1.0, + #"decvar_phi_factor":0.5, + "num_pyworkers":1, + } + # ensure it's a dict + if dsi_args is None: + dsi_args = default_dsi_args + elif not isinstance(dsi_args, dict): + raise TypeError("Expected a dictionary for 'options'") + # merge with defaults (user values override defaults) + #dsi_args = {**default_dsi_args, **dsi_args} + else: + for key, value in default_dsi_args.items(): + if key not in dsi_args: + dsi_args[key] = value + + # check that dsi_args has the required keys + required_keys = ["noptmax", "decvar_weight", "num_pyworkers"] + for key in required_keys: + if key not in dsi_args: + raise KeyError(f"Missing required key '{key}' in 'dsi_args'") + self.dsi_args = dsi_args + out_files = [] + + self.logger.statement(f"preparing stack stats observations...") + assert isinstance(oe, ObservationEnsemble), "oe must be an ObservationEnsemble" + if oe.index.name is None: + id_vars="index" + else: + id_vars=oe.index.name + stack_stats = oe._df.describe(percentiles=percentiles).reset_index().melt(id_vars=id_vars) + stack_stats.rename(columns={"value":"obsval","index":"stat"},inplace=True) + stack_stats['obsnme'] = stack_stats.apply(lambda x: x.variable+"_stat:"+x.stat,axis=1) + stack_stats.set_index("obsnme",inplace=True) + stack_stats = stack_stats.obsval + self.logger.statement(f"stack osb recorded to dsi.stack_stats.csv...") + out_file = os.path.join(t_d,"dsi.stack_stats.csv") + out_files.append(out_file) + stack_stats.to_csv(out_file,float_format="%.6e") + series_to_insfile(out_file,ins_file=None) + + + if track_stack: + self.logger.statement(f"including {oe.values.flatten().shape[0]} stack observations...") + + stack = oe._df.reset_index().melt(id_vars=id_vars) + stack.rename(columns={"value":"obsval"},inplace=True) + stack['obsnme'] = stack.apply(lambda x: x.variable+"_real:"+x.index,axis=1) + 
stack.set_index("obsnme",inplace=True) + stack = stack.obsval + out_file = os.path.join(t_d,"dsi.stack.csv") + out_files.append(out_file) + stack.to_csv(out_file,float_format="%.6e") + series_to_insfile(out_file,ins_file=None) + + + + self.logger.statement(f"prepare DSIVC template files...") + dsi_in_file = os.path.join(t_d, "dsivc_pars.csv") + dsi_tpl_file = dsi_in_file + ".tpl" + ftpl = open(dsi_tpl_file, 'w') + fin = open(dsi_in_file, 'w') + ftpl.write("ptf ~\n") + fin.write("parnme,parval1\n") + ftpl.write("parnme,parval1\n") + for pname in decvar_names: + val = oe._df.loc[:,pname].mean() + fin.write(f"{pname},{val:.6e}\n") + ftpl.write(f"{pname},~ {pname} ~\n") + fin.close() + ftpl.close() + + + self.logger.statement(f"building DSIVC control file...") + pst_dsivc = Pst.from_io_files([dsi_tpl_file],[dsi_in_file],[i+".ins" for i in out_files],out_files,pst_path=".") + + self.logger.statement(f"setting dec var bounds...") + par = pst_dsivc.parameter_data + # set all parameters fixed + par.loc[:,"partrans"] = "fixed" + # constrain decvar pars to training data bounds + par.loc[decvar_names,"pargp"] = "decvars" + par.loc[decvar_names,"partrans"] = "none" + par.loc[decvar_names,"parubnd"] = self.data.loc[:,decvar_names].max() + par.loc[decvar_names,"parlbnd"] = self.data.loc[:,decvar_names].min() + par.loc[decvar_names,"parval1"] = self.data.loc[:,decvar_names].quantile(.5) + + self.logger.statement(f"zero-weighting observation data...") + # prepemtpively set obs weights 0.0 + obs = pst_dsivc.observation_data + obs.loc[:,"weight"] = 0.0 + + self.logger.statement(f"getting obs metadata from DSI observation_data...") + obsorg = pst.observation_data.copy() + columns = [i for i in obsorg.columns if i !='obsnme'] + for o in obsorg.obsnme.values: + obs.loc[obs.obsnme.str.startswith(o), columns] = obsorg.loc[obsorg.obsnme==o, columns].values + + obs.loc[stack_stats.index,"obgnme"] = "stack_stats" + obs.loc[stack_stats.index,"org_obsnme"] = [i.split("_stat:")[0] for i in 
stack_stats.index.values] + pst_dsivc.try_parse_name_metadata() + + #obs.loc[stack.index,"obgnme"] = "stack" + + self.logger.statement(f"building dsivc_forward_run.py...") + pst_dsivc.model_command = "python dsivc_forward_run.py" + from pyemu.utils.helpers import dsivc_forward_run + function_source = inspect.getsource(dsivc_forward_run) + with open(os.path.join(t_d,"dsivc_forward_run.py"),'w') as file: + file.write(function_source) + file.write("\n\n") + file.write("if __name__ == \"__main__\":\n") + file.write(f" {function_source.split('(')[0].split('def ')[1]}(ies_exe_path='{ies_exe_path}')\n") + + self.logger.statement(f"preparing nominal initial population...") + if mou_population_size is None: + # set the population size to 2 * number of decision variables + # this is a good rule of thumb for MOU + mou_population_size = 2 * len(decvar_names) + # these should generally be twice the number of decision variables + if mou_population_size < 2 * len(decvar_names): + self.logger.statement(f"mou population is less than 2x number of decision variables, this may be too small...") + # sample 160 sets of decision variables from a unform distribution + dvpop = ParameterEnsemble.from_uniform_draw(pst_dsivc,num_reals=mou_population_size) + # record to external file for PESTPP-MOU + dvpop.to_binary(os.path.join(t_d,"initial_dvpop.jcb")) + # tell PESTPP-MOU about the new file + pst_dsivc.pestpp_options["mou_dv_population_file"] = 'initial_dvpop.jcb' + + + # some additional PESTPP-MOU options: + pst_dsivc.pestpp_options["mou_population_size"] = mou_population_size #twice the number of decision variables + pst_dsivc.pestpp_options["mou_save_population_every"] = 1 # save lots of files! 
+ + pst_dsivc.control_data.noptmax = 0 #just for a test run + pst_dsivc.write(os.path.join(t_d,"dsivc.pst"),version=2) + + # updating the DSI pst control file + self.logger.statement(f"updating DSI pst control file...") + self.logger.statement("overwriting dsi.pst file...") + pst.observation_data.loc[decvar_names, "weight"] = dsi_args["decvar_weight"] + pst.control_data.noptmax = dsi_args["noptmax"] + + #TODO: ensure no noise for dvars obs + + pst.write(os.path.join(t_d,"dsi.pst"), version=2) + + + self.logger.statement("overwriting dsi.pickle file...") + self.decision_variable_names = decvar_names + # re-pickle dsi to track dsivc args + self.save(os.path.join(t_d,"dsi.pickle")) + + self.logger.statement("DSIVC control files created...the user still needs to specify objectives and constraints...") + return pst_dsivc + + def hyperparam_search(self, latent_dims: Optional[List[int]] = None, + latent_dim_mults: List[float] = [0.5, 1.0, 2.0], + hidden_dims_list: List[tuple] = [(64, 32), (128, 64)], + lrs: List[float] = [1e-2, 1e-3], + epochs: int = 50, batch_size: int = 32, + random_state: int = 0) -> Dict[tuple, float]: + """ + Grid search over autoencoder hyperparameters. + + Parameters + ---------- + latent_dims : list of int, optional + Latent dimensions to test. If None, uses latent_dim_mults. + latent_dim_mults : list of float, default [0.5, 1.0, 2.0] + Multipliers for current latent_dim if latent_dims not provided. + hidden_dims_list : list of tuple, default [(64, 32), (128, 64)] + Hidden layer architectures to test. + lrs : list of float, default [1e-2, 1e-3] + Learning rates to test. + epochs : int, default 50 + Training epochs for each configuration. + batch_size : int, default 32 + Training batch size. + random_state : int, default 0 + Random seed for reproducibility. + + Returns + ------- + dict + Mapping from (latent_dim, hidden_dims, lr) to validation loss. 
+ """ + if latent_dims is None: + assert self.latent_dim is not None, "Either latent_dims or self.latent_dim must be set" + latent_dims = [int(self.latent_dim * m) for m in latent_dim_mults] + + X = self.data_transformed.values.astype(float) + results = AutoEncoder.hyperparam_search( + X, + latent_dims=latent_dims, + hidden_dims_list=hidden_dims_list, + lrs=lrs, + epochs=epochs, + batch_size=batch_size, + random_state=random_state + ) + return results + + def save(self, filename: str) -> None: + """ + Save the emulator to a file. + + Bundles the pickled object and the TensorFlow model into a zip archive. + """ + # Create a temporary directory to save components + with tempfile.TemporaryDirectory() as tmp_dir: + # 1. Save TF model + model_dir = os.path.join(tmp_dir, "tf_model") + if hasattr(self, 'encoder') and self.encoder is not None: + self.encoder.save(model_dir) + + # 2. Remove TF model from self to allow pickling + encoder_ref = self.encoder + self.encoder = None + + # 3. Pickle the rest of the object + pkl_path = os.path.join(tmp_dir, "dsiae.pkl") + with open(pkl_path, "wb") as f: + pickle.dump(self, f) + + # Restore encoder + self.encoder = encoder_ref + + # 4. Zip everything into the target filename + with zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + # Add pickle + zipf.write(pkl_path, arcname="dsiae.pkl") + # Add TF model directory contents + if os.path.exists(model_dir): + for root, dirs, files in os.walk(model_dir): + for file in files: + file_path = os.path.join(root, file) + arcname = os.path.relpath(file_path, tmp_dir) + zipf.write(file_path, arcname=arcname) + + print(f"Saved emulator to {filename}") + + @classmethod + def load(cls, filename: str) -> 'DSIAE': + """ + Load the emulator from a file. + """ + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile(filename, 'r') as zipf: + zipf.extractall(tmp_dir) + + # 1. Unpickle + with open(os.path.join(tmp_dir, "dsiae.pkl"), "rb") as f: + obj = pickle.load(f) + + # 2. 
Reload TF model if it exists + model_dir = os.path.join(tmp_dir, "tf_model") + if os.path.exists(model_dir): + # We need to reconstruct the AutoEncoder wrapper + # Since we don't have the init params easily available, we rely on the fact + # that AutoEncoder.load loads the Keras models directly. + # But we need an AutoEncoder instance first. + + # We can create a dummy AutoEncoder instance and then load the weights/models + # However, AutoEncoder.__init__ builds the model. + # We can bypass __init__ or use default params if we are just going to overwrite the models. + + # Better approach: The AutoEncoder class should have a classmethod to load from disk + # or we instantiate it with dummy params and then load. + + # Let's assume we can instantiate it with minimal params. + # We need input_dim and latent_dim. + # obj.data_transformed should be available. + input_dim = obj.data_transformed.shape[1] if obj.data_transformed is not None else 0 + latent_dim = obj.latent_dim if obj.latent_dim is not None else 2 + + # Create a blank AutoEncoder instance + # We use __new__ to bypass __init__ since we are loading the full model structure + ae = AutoEncoder.__new__(AutoEncoder) + ae.load(model_dir) + obj.encoder = ae + + return obj + + def _get_emulator_parameters(self, pst=None): + """ + Get params for DSIAE (latent variables). + """ + Z = self.encode(self.data) + if 'base' in Z.index: + pvals = Z.loc['base',:] + else: + pvals = Z.mean(axis=0) + + npar = self.latent_dim + par_names = [f"dsi_par{i:04d}" for i in range(npar)] + + df = pd.DataFrame(index=par_names) + df["parnme"] = par_names + df["parval1"] = pvals.values.flatten() + df["parlbnd"] = Z.min(axis=0).values + df["parubnd"] = Z.max(axis=0).values + df["pargp"] = "dsi_pars" + df["partrans"] = "none" + + return df + + def _get_emulator_observations(self, pst=None): + """ + Get observations for DSIAE. 
+ """ + # Use columns from data (assuming they represent observations) + if self.data is not None: + cols = self.data.columns + df = pd.DataFrame(index=cols) + df["obsnme"] = cols + df["obsval"] = self.data.mean(axis=0) # Use mean as dummy value + df["weight"] = 0.0 + df["obgnme"] = "obgnme" + return df + + def _configure_pst_object(self, pst_obj, pst_original, t_d=None): + """ + Configure DSIAE specific PEST++ options and save dependent files. + """ + if t_d is None: + t_d = "." + + Z = self.encode(self.data) + npar = self.latent_dim + par_names = pst_obj.parameter_data.index.tolist() + assert npar == len(par_names), f"latent dim {npar} does not match number of parameters {len(par_names)}" + Z.columns = par_names + + pe = ParameterEnsemble(pst_obj, Z) + jcb_path = os.path.join(t_d, 'latent_prior.jcb') + pe.to_binary(jcb_path) + pst_obj.pestpp_options['ies_parameter_ensemble'] = 'latent_prior.jcb' + + pst_obj.pestpp_options["save_binary"] = True + pst_obj.pestpp_options["overdue_giveup_fac"] = 1e30 + pst_obj.pestpp_options["overdue_giveup_minutes"] = 1e30 + pst_obj.pestpp_options["panther_agent_freeze_on_fail"] = True + pst_obj.pestpp_options["ies_no_noise"] = False + pst_obj.pestpp_options["ies_subset_size"] = -10 + + # Save dsi.pickle for legacy forward run scripts (like runstor) + self.save(os.path.join(t_d, "dsi.pickle")) + + self.logger.statement(f"Saved latent_prior.jcb to {jcb_path}") + return pst_obj + + def prepare_pestpp(self, t_d, pst=None, verbose=False, use_runstor=False): + """ + Prepare PEST++ interface for DSIAE. + Wraps base implementation. 
+ """ + self._use_runstor = use_runstor + pst_obj = super().prepare_pestpp(t_d=t_d, pst=pst, verbose=verbose, + tpl_filename="dsi_pars.csv.tpl", + input_filename="dsi_pars.csv", + ins_filename="dsi_sim_vals.csv.ins", + output_filename="dsi_sim_vals.csv") + + return pst_obj + +class AutoEncoder: + def __init__(self, input_dim: int, latent_dim: int = 2, + hidden_dims: tuple = (128, 64), lr: float = 1e-3, + activation: str = 'relu', loss: str = 'Huber', + dropout_rate: float = 0.0, random_state: int = 0) -> None: + """ + Initialize AutoEncoder. + + Args: + input_dim: Input feature dimension. + latent_dim: Latent space dimension. + hidden_dims: Tuple of hidden layer sizes for encoder (reversed for decoder). + lr: Learning rate. + activation: Activation function name. + loss: Loss function name. + dropout_rate: Dropout rate (0.0-1.0). + random_state: Random seed. + """ + if tf is None: + raise ImportError("TensorFlow is required for AutoEncoder but not installed.") + + self.input_dim = input_dim + self.latent_dim = latent_dim + self.hidden_dims = hidden_dims + self.lr = lr + self.activation = activation + self.loss = loss + self.dropout_rate = dropout_rate + self.random_state = random_state + + tf.random.set_seed(random_state) + pyemu.en.rng = pyemu.en.rng.default_rng(random_state) + self._build_model() + + # Build encoder/decoder + def _build_model(self): + tf.keras.backend.set_floatx('float32') + # Encoder + encoder_inputs = tf.keras.Input(shape=(self.input_dim,)) + x = encoder_inputs + for h in self.hidden_dims: + x = tf.keras.layers.Dense(h, activation=self.activation)(x) + if hasattr(self, 'dropout_rate') and self.dropout_rate > 0: + x = tf.keras.layers.Dropout(self.dropout_rate)(x) + latent = tf.keras.layers.Dense(self.latent_dim, name='latent')(x) + self.encoder = tf.keras.Model(encoder_inputs, latent, name='encoder') + + # Decoder + decoder_inputs = tf.keras.Input(shape=(self.latent_dim,)) + x = decoder_inputs + for h in reversed(self.hidden_dims): + x = 
tf.keras.layers.Dense(h, activation=self.activation)(x) + if hasattr(self, 'dropout_rate') and self.dropout_rate > 0: + x = tf.keras.layers.Dropout(self.dropout_rate)(x) + outputs = tf.keras.layers.Dense(self.input_dim, activation=None)(x) + self.decoder = tf.keras.Model(decoder_inputs, outputs, name='decoder') + + # Autoencoder model + ae_inputs = encoder_inputs + ae_outputs = self.decoder(self.encoder(ae_inputs)) + self.model = tf.keras.Model(ae_inputs, ae_outputs, name='autoencoder') + self.model.compile(optimizer=tf.keras.optimizers.Adam(self.lr), loss=self.loss) + + + def fit(self, X: np.ndarray, X_val: Optional[np.ndarray] = None, + epochs: int = 100, batch_size: int = 32, + validation_split: float = 0.1, early_stopping: bool = True, + patience: int = 10, lr_schedule: Optional[Any] = None, + verbose: int = 2, sample_weight: Optional[np.ndarray] = None, + validation_sample_weight: Optional[np.ndarray] = None) -> Any: + """ + Train the autoencoder. + + Args: + X: Training data. + X_val: Validation data (optional). + epochs: Max epochs. + batch_size: Batch size. + validation_split: Validation split fraction (if X_val is None). + early_stopping: Enable early stopping. + patience: Early stopping patience. + lr_schedule: Learning rate scheduler callback. + verbose: Verbosity level. + sample_weight: Training sample weights. + validation_sample_weight: Validation sample weights. + + Returns: + Training history. 
+ """ + # Callbacks + callbacks = [] + if early_stopping: + callbacks.append(tf.keras.callbacks.EarlyStopping( + monitor='val_loss', + patience=patience, + restore_best_weights=True + )) + if lr_schedule is not None: + callbacks.append(lr_schedule) + + + # Train + history = self.model.fit( + X, X, + sample_weight=sample_weight, + validation_split=validation_split, + epochs=epochs, + batch_size=batch_size, + callbacks=callbacks, + verbose=verbose + ) + return history + + def encode(self, X: Union[np.ndarray, pd.DataFrame, pd.Series]) -> np.ndarray: + """ + Encode input data to latent representation. + + Parameters + ---------- + X : np.ndarray, pd.DataFrame, or pd.Series + Input data to encode to latent space. + + Returns + ------- + np.ndarray + Latent representation with shape (n_samples, latent_dim). + """ + if isinstance(X, pd.DataFrame): + X = X.values.astype(np.float32) + elif isinstance(X, pd.Series): + X = X.values.reshape(1,-1).astype(np.float32) + return self.encoder(X, training=False) + + def decode(self, Z: np.ndarray) -> np.ndarray: + """ + Decode latent representation back to input space. + + Parameters + ---------- + Z : np.ndarray + Latent representation with shape (n_samples, latent_dim). + + Returns + ------- + np.ndarray + Reconstructed data with shape (n_samples, input_dim). + """ + #X_hat = self.decoder.predict(Z, verbose=0,) + X_hat = self.decoder(Z,training=False) + return X_hat + + + def save(self, folder: str) -> None: + """ + Save trained models to disk. + """ + os.makedirs(folder, exist_ok=True) + self.encoder.save(os.path.join(folder, 'encoder.keras')) + self.decoder.save(os.path.join(folder, 'decoder.keras')) + self.model.save(os.path.join(folder, 'autoencoder.keras')) + + def load(self, folder: str) -> None: + """ + Load trained models from disk. 
+ """ + self.encoder = tf.keras.models.load_model(os.path.join(folder, 'encoder.keras')) + self.decoder = tf.keras.models.load_model(os.path.join(folder, 'decoder.keras')) + self.model = tf.keras.models.load_model(os.path.join(folder, 'autoencoder.keras')) + + + @staticmethod + def hyperparam_search(X: np.ndarray, latent_dims: List[int] = [2, 3, 5], + hidden_dims_list: List[tuple] = [(64, 32), (128, 64)], + lrs: List[float] = [1e-2, 1e-3], epochs: int = 50, + batch_size: int = 32, random_state: int = 42) -> Dict[tuple, float]: + """ + Perform grid search over autoencoder hyperparameters. + + Systematically evaluates different combinations of latent dimensions, + network architectures, and learning rates to find optimal configurations + based on validation loss performance. + + Parameters + ---------- + X : np.ndarray + Training data for hyperparameter optimization. + + latent_dims : list of int, default [2, 3, 5] + Latent space dimensions to evaluate. + + hidden_dims_list : list of tuple, default [(64, 32), (128, 64)] + Network architectures to test. Each tuple specifies hidden layer sizes. + + lrs : list of float, default [1e-2, 1e-3] + Learning rates to evaluate. + + epochs : int, default 50 + Training epochs for each configuration. + + batch_size : int, default 32 + Batch size for training. + + random_state : int, default 42 + Random seed for reproducible train/validation splits. + + Returns + ------- + dict + Mapping from (latent_dim, hidden_dims, lr) tuples to validation loss values. + Lower values indicate better performance. + + Notes + ----- + Uses 10% of data for validation via train_test_split. Each configuration + is trained independently with early stopping disabled to ensure fair + comparison across hyperparameter combinations. 
+ + Examples + -------- + >>> results = AutoEncoder.hyperparam_search(X_train, epochs=100) + >>> best_params = min(results.keys(), key=results.get) + >>> print(f"Best configuration: {best_params}") + """ + results = {} + X_train, X_val = train_test_split(X, test_size=0.1, random_state=random_state) + for ld in latent_dims: + for hd in hidden_dims_list: + for lr in lrs: + print(f"Training AE: latent_dim={ld}, hidden_dims={hd}, lr={lr}") + ae = AutoEncoder(input_dim=X.shape[1], latent_dim=ld, hidden_dims=hd, lr=lr) + history = ae.fit(X_train, X_val=X_val, epochs=epochs, batch_size=batch_size,verbose=0) + val_loss = history.history['val_loss'][-1] + results[(ld, hd, lr)] = val_loss + print(f"Validation loss: {val_loss:.4f}") + return results + + + + +# Efficient pairwise L2 distances +def pairwise_distances(x, y, eps=1e-12): + x_norm = tf.reduce_sum(tf.square(x), axis=1, keepdims=True) + y_norm = tf.reduce_sum(tf.square(y), axis=1, keepdims=True) + dist_sq = x_norm + tf.transpose(y_norm) - 2.0 * tf.matmul(x, y, transpose_b=True) + dist_sq = tf.maximum(dist_sq, eps) + return tf.sqrt(dist_sq) + + + +# Energy distance core function +def energy_distance_optimized(y_true, y_pred): + d_xy = pairwise_distances(y_true, y_pred) + cross = 2.0 * tf.reduce_mean(d_xy) + + d_xx = pairwise_distances(y_true, y_true) + d_yy = pairwise_distances(y_pred, y_pred) + + return cross - tf.reduce_mean(d_xx) - tf.reduce_mean(d_yy) + + +# UTILITY FUNCTIONS FOR DISTRIBUTION-AWARE LOSSES +def maximum_mean_discrepancy(x, y, kernel='rbf', sigma=1.0): + """Compute Maximum Mean Discrepancy between two distributions.""" + if kernel == 'rbf': + # RBF kernel k(x,y) = exp(-||x-y||^2 / (2*sigma^2)) + x_norm = tf.reduce_sum(tf.square(x), axis=1, keepdims=True) + y_norm = tf.reduce_sum(tf.square(y), axis=1, keepdims=True) + + # Pairwise distances + xx = x_norm + tf.transpose(x_norm) - 2.0 * tf.matmul(x, x, transpose_b=True) + yy = y_norm + tf.transpose(y_norm) - 2.0 * tf.matmul(y, y, transpose_b=True) + xy 
= x_norm + tf.transpose(y_norm) - 2.0 * tf.matmul(x, y, transpose_b=True) + + # Apply RBF kernel + k_xx = tf.exp(-xx / (2 * sigma**2)) + k_yy = tf.exp(-yy / (2 * sigma**2)) + k_xy = tf.exp(-xy / (2 * sigma**2)) + + elif kernel == 'linear': + k_xx = tf.matmul(x, x, transpose_b=True) + k_yy = tf.matmul(y, y, transpose_b=True) + k_xy = tf.matmul(x, y, transpose_b=True) + else: + raise ValueError(f"Unsupported kernel: {kernel}") + + # MMD calculation + mmd = tf.reduce_mean(k_xx) + tf.reduce_mean(k_yy) - 2.0 * tf.reduce_mean(k_xy) + return tf.maximum(mmd, 0.0) # Ensure non-negative + + +def wasserstein_distance_sliced(x, y, num_projections=50): + """Approximate Wasserstein-1 distance using sliced Wasserstein distance.""" + # Generate random projections + d = tf.shape(x)[1] + theta = tf.random.normal([d, num_projections]) + theta = theta / tf.norm(theta, axis=0, keepdims=True) + + # Project data onto random directions + x_proj = tf.matmul(x, theta) # [batch_size, num_projections] + y_proj = tf.matmul(y, theta) # [batch_size, num_projections] + + # Sort projections + x_sorted = tf.sort(x_proj, axis=0) + y_sorted = tf.sort(y_proj, axis=0) + + # Compute L1 distance between sorted projections + distances = tf.reduce_mean(tf.abs(x_sorted - y_sorted), axis=0) + return tf.reduce_mean(distances) + + +def correlation_loss(x, y): + """Penalize differences in correlation structure between datasets.""" + # Center the data + x_centered = x - tf.reduce_mean(x, axis=0, keepdims=True) + y_centered = y - tf.reduce_mean(y, axis=0, keepdims=True) + + # Compute correlation matrices + x_cov = tf.matmul(x_centered, x_centered, transpose_a=True) / tf.cast(tf.shape(x)[0] - 1, tf.float32) + y_cov = tf.matmul(y_centered, y_centered, transpose_a=True) / tf.cast(tf.shape(y)[0] - 1, tf.float32) + + # Normalize to get correlation + x_std = tf.sqrt(tf.diag_part(x_cov)) + y_std = tf.sqrt(tf.diag_part(y_cov)) + + x_corr = x_cov / (tf.expand_dims(x_std, 0) * tf.expand_dims(x_std, 1)) + y_corr = y_cov / 
(tf.expand_dims(y_std, 0) * tf.expand_dims(y_std, 1)) + + # Frobenius norm of difference + return tf.reduce_mean(tf.square(x_corr - y_corr)) + + + +if tf is not None: + LossBase = tf.keras.losses.Loss +else: + class LossBase: + def __init__(self, name=None, **kwargs): + pass + def __call__(self, *args, **kwargs): + pass + +@register_keras_serializable(package="pyemu_emulators", name="EnergyLoss") +class EnergyLoss(LossBase): + """ + Energy distance loss combining MSE reconstruction with energy distance. + + The energy distance measures dissimilarity between probability distributions + and helps ensure the reconstructed samples preserve the overall data distribution. + """ + + def __init__(self, lambda_energy=1e-2, name="energy_loss"): + super().__init__(name=name) + self.lambda_energy = lambda_energy + + def call(self, y_true, y_pred): + mse = tf.reduce_mean(tf.square(y_true - y_pred)) + ed = energy_distance_optimized(y_true, y_pred) + return mse + self.lambda_energy * ed + + def get_config(self): + return { + "lambda_energy": self.lambda_energy, + "name": self.name, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +@register_keras_serializable(package="pyemu_emulators", name="MMDLoss") +class MMDLoss(LossBase): + """ + Maximum Mean Discrepancy loss for distribution matching. + + MMD measures the distance between distributions in a reproducing kernel + Hilbert space. More computationally efficient than energy distance. 
+ """ + + def __init__(self, lambda_mmd=1e-2, kernel='rbf', sigma=1.0, name="mmd_loss"): + super().__init__(name=name) + self.lambda_mmd = lambda_mmd + self.kernel = kernel + self.sigma = sigma + + def call(self, y_true, y_pred): + mse = tf.reduce_mean(tf.square(y_true - y_pred)) + mmd = maximum_mean_discrepancy(y_true, y_pred, kernel=self.kernel, sigma=self.sigma) + return mse + self.lambda_mmd * mmd + + def get_config(self): + return { + "lambda_mmd": self.lambda_mmd, + "kernel": self.kernel, + "sigma": self.sigma, + "name": self.name, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +@register_keras_serializable(package="pyemu_emulators", name="WassersteinLoss") +class WassersteinLoss(LossBase): + """ + Sliced Wasserstein distance loss for distribution matching. + + Uses random projections to approximate the Wasserstein-1 distance, + which is particularly effective for high-dimensional distributions. + """ + + def __init__(self, lambda_w=1e-2, num_projections=50, name="wasserstein_loss"): + super().__init__(name=name) + self.lambda_w = lambda_w + self.num_projections = num_projections + + def call(self, y_true, y_pred): + mse = tf.reduce_mean(tf.square(y_true - y_pred)) + w_dist = wasserstein_distance_sliced(y_true, y_pred, self.num_projections) + return mse + self.lambda_w * w_dist + + def get_config(self): + return { + "lambda_w": self.lambda_w, + "num_projections": self.num_projections, + "name": self.name, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +@register_keras_serializable(package="pyemu_emulators", name="StatisticalLoss") +class StatisticalLoss(LossBase): + """ + Multi-component statistical loss for comprehensive distribution matching. 
+ + Combines reconstruction error with multiple statistical measures: + - Moment matching (mean, variance, skewness, kurtosis) + - Correlation structure preservation + - Optional distribution distance (MMD or Energy) + """ + + def __init__(self, lambda_moments=1e-2, lambda_corr=1e-3, lambda_dist=1e-3, + dist_type='mmd', mmd_sigma=1.0, name="statistical_loss"): + super().__init__(name=name) + self.lambda_moments = lambda_moments + self.lambda_corr = lambda_corr + self.lambda_dist = lambda_dist + self.dist_type = dist_type + self.mmd_sigma = mmd_sigma + + def call(self, y_true, y_pred): + # Reconstruction loss + mse = tf.reduce_mean(tf.square(y_true - y_pred)) + + # Moment matching + moments_loss = 0.0 + for moment in range(1, 5): # mean, variance, skewness, kurtosis + true_moment = tf.reduce_mean(tf.pow(y_true - tf.reduce_mean(y_true, axis=0), moment), axis=0) + pred_moment = tf.reduce_mean(tf.pow(y_pred - tf.reduce_mean(y_pred, axis=0), moment), axis=0) + moments_loss += tf.reduce_mean(tf.square(true_moment - pred_moment)) + + # Correlation structure loss + corr_loss = correlation_loss(y_true, y_pred) + + # Distribution distance + if self.dist_type == 'mmd': + dist_loss = maximum_mean_discrepancy(y_true, y_pred, sigma=self.mmd_sigma) + elif self.dist_type == 'energy': + dist_loss = energy_distance_optimized(y_true, y_pred) + else: + dist_loss = 0.0 + + total_loss = (mse + + self.lambda_moments * moments_loss + + self.lambda_corr * corr_loss + + self.lambda_dist * dist_loss) + + return total_loss + + def get_config(self): + return { + "lambda_moments": self.lambda_moments, + "lambda_corr": self.lambda_corr, + "lambda_dist": self.lambda_dist, + "dist_type": self.dist_type, + "mmd_sigma": self.mmd_sigma, + "name": self.name, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +@register_keras_serializable(package="pyemu_emulators", name="AdaptiveLoss") +class AdaptiveLoss(LossBase): + """ + Adaptive loss that balances reconstruction and 
distribution terms dynamically. + + Automatically adjusts the weighting between reconstruction and distribution + preservation based on their relative magnitudes during training. + """ + + def __init__(self, base_lambda=1e-2, adaptation_rate=0.01, min_lambda=1e-5, + max_lambda=1e-1, name="adaptive_loss"): + super().__init__(name=name) + self.base_lambda = base_lambda + self.adaptation_rate = adaptation_rate + self.min_lambda = min_lambda + self.max_lambda = max_lambda + self.current_lambda = tf.Variable(base_lambda, trainable=False, name="adaptive_lambda") + + def call(self, y_true, y_pred): + mse = tf.reduce_mean(tf.square(y_true - y_pred)) + ed = energy_distance_optimized(y_true, y_pred) + + # Adaptive weighting based on relative magnitudes + mse_magnitude = tf.stop_gradient(mse) + ed_magnitude = tf.stop_gradient(ed) + + # Update lambda to balance the terms + ratio = ed_magnitude / (mse_magnitude + 1e-8) + target_lambda = self.base_lambda * tf.clip_by_value(ratio, 0.1, 10.0) + + # Smooth update of lambda + self.current_lambda.assign( + self.current_lambda * (1 - self.adaptation_rate) + + target_lambda * self.adaptation_rate + ) + + # Clip lambda to reasonable bounds + clipped_lambda = tf.clip_by_value(self.current_lambda, self.min_lambda, self.max_lambda) + + return mse + clipped_lambda * ed + + def get_config(self): + return { + "base_lambda": self.base_lambda, + "adaptation_rate": self.adaptation_rate, + "min_lambda": self.min_lambda, + "max_lambda": self.max_lambda, + "name": self.name, + } + + @classmethod + def from_config(cls, config): + return cls(**config) + + +@register_keras_serializable(package="custom_losses") +class PerSampleMSE(LossBase): + def __init__(self, name="per_sample_mse"): + super().__init__(reduction="none", name=name) + + def call(self, y_true, y_pred): + # shape (batch,) + return tf.reduce_mean(tf.square(y_true - y_pred), axis=1) + + + def get_config(self): + return {"name": self.name} + + @classmethod + def from_config(cls, config): + 
return cls(**config) + + + +def create_observation_weights(data: Union[pd.DataFrame, np.ndarray], + observed_values: List[float], + critical_features: List[int], + weight_type: str = 'inverse_distance', + temperature: float = 1.0, + normalize: bool = True, + clip_range: tuple = (0.1, 10.0)) -> np.ndarray: + """ + Create sample weights based on proximity to observed values. + + Parameters + ---------- + data : pd.DataFrame or np.ndarray + Training data with shape (n_samples, n_features) + observed_values : list of float + Target observed values at critical features + critical_features : list of int + Column indices of critical observation features + weight_type : str, default 'inverse_distance' + Type of weighting: 'inverse_distance', 'gaussian', 'exponential' + temperature : float, default 1.0 + Temperature parameter for weight decay (lower = sharper weighting) + normalize : bool, default True + Whether to normalize weights to mean = 1.0 + clip_range : tuple, default (0.1, 10.0) + Range to clip extreme weights (min, max) + + Returns + ------- + np.ndarray + Sample weights with shape (n_samples,) + """ + if isinstance(data, pd.DataFrame): + data = data.values + + observed_values = np.array(observed_values) + sample_weights = np.ones(len(data)) + + for i in range(len(data)): + sample_values = data[i][critical_features] + + if weight_type == 'inverse_distance': + distance = np.sqrt(np.sum((sample_values - observed_values)**2)) + weight = 1.0 / (1.0 + distance / temperature) + + elif weight_type == 'gaussian': + distance_sq = np.sum((sample_values - observed_values)**2) + weight = np.exp(-distance_sq / (2.0 * temperature**2)) + + elif weight_type == 'exponential': + distance = np.sqrt(np.sum((sample_values - observed_values)**2)) + weight = np.exp(-distance / temperature) + + else: + raise ValueError(f"Unknown weight_type: {weight_type}") + + sample_weights[i] = weight + + if normalize: + sample_weights = sample_weights / np.mean(sample_weights) + + if clip_range is 
not None: + sample_weights = np.clip(sample_weights, clip_range[0], clip_range[1]) + + return sample_weights + + +def create_pest_observation_weights(pst: 'Pst', + emulator_data: pd.DataFrame, + weight_scaling: float = 1.0, + **kwargs) -> np.ndarray: + """ + Create sample weights using PEST observation data. + + Parameters + ---------- + pst : Pst + PEST control file object with observation data + emulator_data : pd.DataFrame + Training data for the emulator + weight_scaling : float, default 1.0 + Overall scaling factor for weights + **kwargs + Additional arguments passed to create_observation_weights + + Returns + ------- + np.ndarray + Sample weights based on PEST observations + """ + obs_data = pst.observation_data + + # Map observation names to column indices + critical_features = [] + observed_values = [] + + for obs_name in obs_data.index: + if obs_name in emulator_data.columns: + col_idx = emulator_data.columns.get_loc(obs_name) + critical_features.append(col_idx) + observed_values.append(obs_data.loc[obs_name, 'obsval']) + + if len(critical_features) == 0: + raise ValueError("No matching observations found between PST and emulator data") + + weights = create_observation_weights( + emulator_data, observed_values, critical_features, **kwargs + ) + + return weights * weight_scaling + + + +def create_distribution_loss(loss_type='energy', **kwargs): + """ + Factory function to create distribution-aware loss functions. 
+ + Parameters + ---------- + loss_type : str + Type of loss function to create: + - 'energy': EnergyLoss (default, robust but computationally expensive) + - 'mmd': MMDLoss (efficient, good for high-dim data) + - 'wasserstein': WassersteinLoss (good for smooth distributions) + - 'statistical': StatisticalLoss (comprehensive statistical matching) + - 'adaptive': AdaptiveLoss (automatically balances terms) + - 'mse': Standard MSE (no distribution matching) + - 'huber': Huber loss (robust to outliers, no distribution matching) + **kwargs : dict + Additional parameters specific to each loss type + + Returns + ------- + tf.keras.losses.Loss + Configured loss function + + Examples + -------- + >>> # Energy loss with custom weighting + >>> loss = create_distribution_loss('energy', lambda_energy=1e-3) + >>> + >>> # MMD loss with RBF kernel + >>> loss = create_distribution_loss('mmd', lambda_mmd=1e-2, sigma=2.0) + >>> + >>> # Statistical loss with all components + >>> loss = create_distribution_loss('statistical', + ... lambda_moments=1e-2, + ... lambda_corr=1e-3, + ... lambda_dist=5e-3) + """ + if loss_type == 'energy': + return EnergyLoss(**kwargs) + elif loss_type == 'mmd': + return MMDLoss(**kwargs) + elif loss_type == 'wasserstein': + return WassersteinLoss(**kwargs) + elif loss_type == 'statistical': + return StatisticalLoss(**kwargs) + elif loss_type == 'adaptive': + return AdaptiveLoss(**kwargs) + elif loss_type == 'per_sample_mse': + return PerSampleMSE(**kwargs) + elif loss_type == 'mse': + return 'mse' + elif loss_type == 'huber': + return tf.keras.losses.Huber(**kwargs) + + else: + raise ValueError(f"Unknown loss type: {loss_type}. 
" + f"Supported types: energy, mmd, wasserstein, statistical, " f"adaptive, per_sample_mse, mse, huber") \ No newline at end of file diff --git a/pyemu/emulators/transformers.py b/pyemu/emulators/transformers.py index ff3d3412d..4f438cf34 100755 --- a/pyemu/emulators/transformers.py +++ b/pyemu/emulators/transformers.py @@ -1,853 +1,854 @@ -""" -Transformer classes for data transformations in emulators. -""" -from __future__ import print_function, division -import numpy as np -import pandas as pd -import importlib.util -import inspect - -# Check sklearn availability at module level -HAS_SKLEARN = importlib.util.find_spec("sklearn") is not None - -if HAS_SKLEARN: - from sklearn.preprocessing import StandardScaler -else: - # Create dummy classes or set to None - StandardScaler = None - - -class BaseTransformer: - """Base class for all transformers providing a consistent interface.""" - - def fit(self, X): - """Learn parameters from data if needed.""" - return self - - def transform(self, X): - """Apply transformation to X.""" - raise NotImplementedError - - def fit_transform(self, X): - """Fit and transform in one step.""" - return self.fit(X).transform(X) - - def inverse_transform(self, X): - """Inverse transform X back to original space.""" - raise NotImplementedError - -class Log10Transformer(BaseTransformer): - """Apply log10 transformation. - - Parameters - ---------- - columns : list, optional - List of column names to be transformed. If None, all columns will be transformed. 
- """ - - def __init__(self, columns=None): - self.columns = columns - self.shifts = {} - - def transform(self, X): - result = X.copy() - columns = self.columns if self.columns is not None else X.columns - columns = [col for col in columns if col in X.columns] - - for col in columns: - min_val = X[col].min() - shift = -min_val + 1e-6 if min_val <= 0 else 0 - self.shifts[col] = shift - result[col] = np.log10(X[col] + shift) - return result - - def inverse_transform(self, X): - result = X.copy() - for col in self.shifts.keys(): - if col in X.columns: - shift = self.shifts.get(col, 0) - result[col] = (10 ** X[col]) - shift - return result - -class RowWiseMinMaxScaler(BaseTransformer): - """Scale each row of a DataFrame to a specified range. - - Parameters - ---------- - feature_range : tuple (min, max), default=(-1, 1) - The range to scale features into. - groups : dict or None, default=None - Dict mapping group names to lists of column names to be scaled together (entire timeseries for that group). - If None, all columns will be treated as a single group. - Example: {'group1': ['col1', 'col2'], 'group2': ['col3', 'col4']} - fit_groups : dict or None, default=None - Dict mapping group names to lists of column names (subset of groups) used to compute row-wise min and max. - If None, defaults to using the same columns as in groups. - """ - - def __init__(self, feature_range=(-1, 1), groups=None, fit_groups=None): - self.feature_range = feature_range - self.groups = groups - self.fit_groups = fit_groups if fit_groups is not None else groups - self.row_params = {} # Will store per-row (min, max) for each group - - def fit(self, X): - """Compute row-wise min and max for each group. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to fit the scaler on. - - Returns - ------- - self : object - Returns self. 
- """ - # If groups not specified, treat all columns as one group - if self.groups is None: - self.groups = {"all": X.columns.tolist()} - - if self.fit_groups is None: - self.fit_groups = self.groups.copy() - - # Calculate and store row-wise min and max for each group - self.row_params = {} - for group_name, group_cols in self.groups.items(): - # Determine which columns to use for computing min/max for each row - fit_cols = self.fit_groups.get(group_name, group_cols) - # Keep only columns that exist in the DataFrame - fit_cols = [col for col in fit_cols if col in X.columns] - if not fit_cols: - continue - - # Compute row-wise min and max using the fit columns - row_min = X[fit_cols].min(axis=1) - row_max = X[fit_cols].max(axis=1) - self.row_params[group_name] = (row_min, row_max) - - return self - - def transform(self, X): - """Scale each row of data to the specified range. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to transform. - - Returns - ------- - pandas.DataFrame - The transformed DataFrame. 
- """ - result = X.copy() - f_min, f_max = self.feature_range - - # Auto-fit if not already fitted or if groups weren't specified - if not self.row_params or self.groups is None: - self.fit(X) - - # Transform each group - for group_name, group_cols in self.groups.items(): - # Keep only columns that exist in the DataFrame - valid_cols = [col for col in group_cols if col in X.columns] - if not valid_cols: - continue - - # Get the min and max for each row in this group - row_min, row_max = self.row_params[group_name] - - # Calculate the row range, avoiding division by zero - row_range = row_max - row_min - row_range[row_range == 0] = 1.0 # Set to 1 where range is 0 - - # For all columns in the group, scale using the row-wise parameters - group_data = X[valid_cols] - # First scale to [0, 1] - group_std = group_data.sub(row_min, axis=0).div(row_range, axis=0) - # Then scale to the desired feature range - result[valid_cols] = group_std * (f_max - f_min) + f_min - - return result - - def inverse_transform(self, X): - """Inverse transform data back to the original scale. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to inverse transform. - - Returns - ------- - pandas.DataFrame - The inverse-transformed DataFrame. - """ - if not self.row_params: - raise ValueError("This RowWiseMinMaxScaler instance is not fitted yet. 
" - "Call 'fit' before using this method.") - - result = X.copy() - f_min, f_max = self.feature_range - - # Inverse transform each group - for group_name, group_cols in self.groups.items(): - # Keep only columns that exist in the DataFrame - valid_cols = [col for col in group_cols if col in X.columns] - if not valid_cols: - continue - - # Get the min and max for each row in this group - row_min, row_max = self.row_params[group_name] - row_range = row_max - row_min - row_range[row_range == 0] = 1.0 # Avoid division by zero - - # Get the scaled data for this group - group_data = X[valid_cols] - - # First convert from feature_range to [0, 1] - group_std = (group_data - f_min) / (f_max - f_min) - - # Then recover original values - result[valid_cols] = group_std.mul(row_range, axis=0).add(row_min, axis=0) - - return result - -class MinMaxScaler(BaseTransformer): - """Scale each column of a DataFrame to a specified range. - - Parameters - ---------- - feature_range : tuple (min, max), default=(-1, 1) - The range to scale features into. - columns : list, optional - List of column names to be scaled. If None, all columns will be scaled. - skip_constant : bool, optional - If True, columns with constant values will be skipped. Default is True. - """ - - def __init__(self, feature_range=(-1, 1), columns=None, skip_constant=True): - self.feature_range = feature_range - self.columns = columns - self.skip_constant = skip_constant - self.min_ = {} - self.scale_ = {} - - def fit(self, X): - """Learn min and max values for scaling. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to fit the scaler on. - - Returns - ------- - self : object - Returns self. 
- """ - columns = self.columns if self.columns is not None else X.columns - - # Ensure we only work with columns that exist in the DataFrame - columns = [col for col in columns if col in X.columns] - - for col in columns: - col_min = X[col].min() - col_max = X[col].max() - - # If the column has constant values and skip_constant is True, store the values but don't transform - if self.skip_constant and col_min == col_max: - self.min_[col] = col_min - self.scale_[col] = 0 # Flag for constant column - else: - # Store min and calculate scale factor for non-constant columns - self.min_[col] = col_min - # Avoid division by zero for nearly constant columns - if col_max - col_min > 1e-10: - self.scale_[col] = (self.feature_range[1] - self.feature_range[0]) / (col_max - col_min) - else: - # For nearly constant columns, set scale to 0 to keep original value - self.scale_[col] = 0 - - return self - - def transform(self, X): - """Scale features according to feature_range. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to transform. - - Returns - ------- - pandas.DataFrame - The transformed DataFrame. - """ - if not self.min_: - self.fit(X) - - result = X.copy() - - f_min, f_max = self.feature_range - - for col in self.min_.keys(): - if col not in X.columns: - continue - - # Skip columns marked as constant - if self.scale_[col] == 0: - continue - - # Apply scaling: X_std = (X - X.min) / (X.max - X.min) -> X_scaled = X_std * (max - min) + min - result[col] = (X[col] - self.min_[col]) * self.scale_[col] + f_min - - return result - - def inverse_transform(self, X): - """Undo the scaling of X according to feature_range. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to inverse transform. - - Returns - ------- - pandas.DataFrame - The inverse-transformed DataFrame. - """ - if not self.min_: - raise ValueError("This MinMaxScaler instance is not fitted yet. 
Call 'fit' before using this method.") - - result = X.copy() - - f_min, f_max = self.feature_range - - for col in self.min_.keys(): - if col not in X.columns: - continue - - # Skip columns marked as constant - if self.scale_[col] == 0: - continue - - # Apply inverse scaling: X_original = (X_scaled - min) / (max - min) * (X.max - X.min) + X.min - result[col] = (X[col] - f_min) / self.scale_[col] + self.min_[col] - - return result - -class StandardScalerTransformer(BaseTransformer): - """Wrapper around sklearn's StandardScaler for DataFrame compatibility. - - Parameters - ---------- - with_mean : bool, default=True - If True, center the data before scaling. - with_std : bool, default=True - If True, scale the data to unit variance. - copy : bool, default=True - If True, a copy of X will be created. If False, centering and scaling happen in-place. - columns : list, optional - List of column names to be transformed. If None, all columns will be transformed. - """ - - def __init__(self, with_mean=True, with_std=True, copy=True, columns=None): - self.with_mean = with_mean - self.with_std = with_std - self.copy = copy - self.columns = columns - self._sklearn_scaler = None - self._fitted_columns = None - - def fit(self, X): - # Determine which columns to fit - columns = self.columns if self.columns is not None else X.columns - columns = [col for col in columns if col in X.columns] - self._fitted_columns = columns - - # Create sklearn StandardScaler - self._sklearn_scaler = StandardScaler( - with_mean=self.with_mean, - with_std=self.with_std, - copy=self.copy - ) - - # Fit on numpy array (sklearn expects this) - if columns: - self._sklearn_scaler.fit(X[columns].values) - return self - - def transform(self, X): - if self._sklearn_scaler is None: - raise ValueError("Transformer must be fitted before transform") - - result = X.copy() - - if self._fitted_columns: - # Transform using sklearn - transformed_values = self._sklearn_scaler.transform(X[self._fitted_columns].values) - 
- # Update only the fitted columns in the result - result[self._fitted_columns] = transformed_values - - return result - - def inverse_transform(self, X): - if self._sklearn_scaler is None: - raise ValueError("Transformer must be fitted before inverse_transform") - - result = X.copy() - - if self._fitted_columns: - # Inverse transform using sklearn - inverse_values = self._sklearn_scaler.inverse_transform(X[self._fitted_columns].values) - - # Update only the fitted columns in the result - result[self._fitted_columns] = inverse_values - - return result - -class GenericTransformer(BaseTransformer): - """Wrapper for generic sklearn-compatible transformers. - - Parameters - ---------- - transformer_class : class - The class of the transformer to be used (e.g. sklearn.preprocessing.QuantileTransformer). - kwargs : dict - Arguments to be passed to the transformer constructor. - """ - def __init__(self, transformer_class, **kwargs): - self.transformer = transformer_class(**kwargs) - - # Validation: check for fit, transform, inverse_transform methods on the instance - if not hasattr(self.transformer, "fit"): - raise ValueError(f"Transformer {transformer_class.__name__} must have a 'fit' method.") - if not hasattr(self.transformer, "transform"): - raise ValueError(f"Transformer {transformer_class.__name__} must have a 'transform' method.") - if not hasattr(self.transformer, "inverse_transform"): - raise ValueError(f"Transformer {transformer_class.__name__} must have an 'inverse_transform' method for use in pyemu emulators.") - - def fit(self, X): - self.transformer.fit(X.values) - return self - - def transform(self, X): - res = self.transformer.transform(X.values) - return pd.DataFrame(res, index=X.index, columns=X.columns) - - def inverse_transform(self, X): - res = self.transformer.inverse_transform(X.values) - return pd.DataFrame(res, index=X.index, columns=X.columns) - - -class NormalScoreTransformer(BaseTransformer): - """A transformer for normal score transformation. 
- - Parameters - ---------- - tol : float, default=1e-7 - Tolerance for convergence in random generation. - max_samples : int, default=1000000 - Maximum number of samples for random generation. - quadratic_extrapolation : bool, default=False - Whether to use quadratic extrapolation for values outside the fitted range. - columns : list, optional - List of column names to be transformed. If None, all columns will be transformed. - """ - - def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False, columns=None): - self.tol = tol - self.max_samples = max_samples - self.quadratic_extrapolation = quadratic_extrapolation - self.columns = columns - self.column_parameters = {} - self.shared_z_scores = {} - - def fit(self, X): - """Fit the transformer to the data.""" - columns = self.columns if self.columns is not None else X.columns - columns = [col for col in columns if col in X.columns] - - for col in columns: - values = X[col].values - sorted_vals = np.sort(values) - smoothed_vals = self._moving_average_with_endpoints(sorted_vals) - - n_points = len(smoothed_vals) - if n_points not in self.shared_z_scores: - self.shared_z_scores[n_points] = self._randrealgen_optimized(n_points) - - z_scores = self.shared_z_scores[n_points] - - self.column_parameters[col] = { - 'z_scores': z_scores, - 'originals': smoothed_vals, - } - return self - - def transform(self, X): - """Transform the data using normal score transformation. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to transform. - - Returns - ------- - pandas.DataFrame - The transformed DataFrame with normal scores. 
- """ - result = X.copy() - for col in self.column_parameters.keys(): - if col not in X.columns: - continue - - params = self.column_parameters.get(col, {}) - z_scores = params.get('z_scores', []) - originals = params.get('originals', []) - - if len(z_scores) == 0 or len(originals) == 0: - continue - - values = X[col].values - - # Handle values outside the original range - min_orig, max_orig = np.min(originals), np.max(originals) - min_z, max_z = np.min(z_scores), np.max(z_scores) - - # For values within range, use interpolation - within_range = (values >= min_orig) & (values <= max_orig) - if within_range.any(): - result.loc[within_range, col] = np.interp( - values[within_range], originals, z_scores - ) - - # For values outside range, use extrapolation if enabled or clamp to bounds - below_min = values < min_orig - above_max = values > max_orig - - if below_min.any(): - if self.quadratic_extrapolation: - # Use linear extrapolation below minimum - slope = (z_scores[1] - z_scores[0]) / (originals[1] - originals[0]) - result.loc[below_min, col] = min_z + slope * (values[below_min] - min_orig) - else: - # Otherwise clamp to minimum z-score - result.loc[below_min, col] = min_z - - if above_max.any(): - if self.quadratic_extrapolation: - # Use linear extrapolation above maximum - slope = (z_scores[-1] - z_scores[-2]) / (originals[-1] - originals[-2]) - result.loc[above_max, col] = max_z + slope * (values[above_max] - max_orig) - else: - # Otherwise clamp to maximum z-score - result.loc[above_max, col] = max_z - - return result - - def inverse_transform(self, X): - """Inverse transform data back to original space. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame with transformed data to inverse transform. - - Returns - ------- - pandas.DataFrame - The inverse-transformed DataFrame. 
- """ - result = X.astype(float).copy() - for col in self.column_parameters.keys(): - if col not in X.columns: - continue - - params = self.column_parameters.get(col, {}) - z_scores = params.get('z_scores', []) - originals = params.get('originals', []) - if len(z_scores) == 0 or len(originals) == 0: - continue - - # Get values to inverse transform - values = X[col].values - min_z, max_z = np.min(z_scores), np.max(z_scores) - min_orig, max_orig = np.min(originals), np.max(originals) - - # For values within the z-score range, use interpolation - within_range = (values >= min_z) & (values <= max_z) - if within_range.any(): - result.loc[within_range, col] = np.interp(values[within_range], z_scores, originals) - - # For values outside the z-score range, use extrapolation if enabled - below_min = values < min_z - above_max = values > max_z - - if below_min.any(): - if self.quadratic_extrapolation: - # Use linear extrapolation below minimum z-score - slope = (originals[1] - originals[0]) / (z_scores[1] - z_scores[0]) - intercept = originals[0] - slope * z_scores[0] - result.loc[below_min, col] = slope * values[below_min] + intercept - else: - # Otherwise clamp to minimum original value - result.loc[below_min, col] = min_orig - - if above_max.any(): - if self.quadratic_extrapolation: - # Use linear extrapolation above maximum z-score - slope = (originals[-1] - originals[-2]) / (z_scores[-1] - z_scores[-2]) - intercept = originals[-1] - slope * z_scores[-1] - result.loc[above_max, col] = slope * values[above_max] + intercept - else: - # Otherwise clamp to maximum original value - result.loc[above_max, col] = max_orig - - return result - - def _randrealgen_optimized(self, nreal): - rval = np.zeros(nreal) - nsamp = 0 - numsort = (nreal + 1) // 2 if nreal % 2 == 0 else nreal // 2 - - while nsamp < self.max_samples: - nsamp += 1 - work1 = np.random.normal(size=nreal) - work1.sort() - - if nsamp > 1: - previous_mean = rval[:numsort] / (nsamp - 1) - rval[:numsort] += 
work1[:numsort] - current_mean = rval[:numsort] / nsamp - max_diff = np.max(np.abs(current_mean - previous_mean)) - - if max_diff <= self.tol: - break - else: - rval[:numsort] = work1[:numsort] - - rval[:numsort] /= nsamp - rval[numsort:] = -rval[:numsort][::-1] if nreal % 2 == 0 else np.concatenate(([-rval[numsort]], -rval[:numsort][::-1])) - return rval - - def _moving_average_with_endpoints(self, y_values): - """Apply a moving average smoothing to an array while preserving endpoints.""" - window_size = 3 - if y_values.shape[0] > 40: - window_size = 5 - if y_values.shape[0] > 90: - window_size = 7 - if y_values.shape[0] > 200: - window_size = 9 - - if window_size % 2 == 0: - raise ValueError("window_size must be odd") - half_window = window_size // 2 - smoothed_y = np.zeros_like(y_values) - - # Handle start points correctly - for i in range(0, half_window): - smoothed_y[i] = np.mean(y_values[:i + half_window + 1]) - - # Handle end points correctly - for i in range(1, half_window + 1): - smoothed_y[-i] = np.mean(y_values[-(i + half_window):]) - - # Middle points - for i in range(half_window, len(y_values) - half_window): - smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window + 1]) - - # Preserve original endpoints exactly - smoothed_y[0] = y_values[0] - smoothed_y[-1] = y_values[-1] - - # Ensure monotonicity - for i in range(1, len(smoothed_y)): - if smoothed_y[i] <= smoothed_y[i - 1]: - smoothed_y[i] = smoothed_y[i - 1] + 1e-16 - - return smoothed_y - -class TransformerPipeline: - """Apply a sequence of transformers in order.""" - - def __init__(self): - self.transformers = [] - self.fitted = False - - def add(self, transformer, columns=None): - """Add a transformer to the pipeline, optionally for specific columns.""" - self.transformers.append((transformer, columns)) - return self - - def fit(self, X): - """Fit all transformers in the pipeline.""" - for transformer, columns in self.transformers: - cols_to_transform = columns if columns is not None 
else X.columns - sub_X = X[cols_to_transform] - transformer.fit(sub_X) - self.fitted = True - return self - - def transform(self, X): - """Transform data using all transformers in the pipeline. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to transform. - - Returns - ------- - pandas.DataFrame - The transformed DataFrame. - """ - result = X.copy() - for transformer, columns in self.transformers: - cols_to_transform = columns if columns is not None else X.columns - # Only use columns that exist in the input data - valid_cols = [col for col in cols_to_transform if col in X.columns] - if not valid_cols: - continue - sub_X = result[valid_cols] - result[valid_cols] = transformer.transform(sub_X) - return result - - def fit_transform(self, X): - """Fit all transformers and transform data in one operation.""" - self.fit(X) - return self.transform(X) - - def inverse_transform(self, X): - """Apply inverse transformations in reverse order. - - Parameters - ---------- - X : pandas.DataFrame - The DataFrame to inverse transform. - - Returns - ------- - pandas.DataFrame - The inverse-transformed DataFrame. 
- """ - - if isinstance(X, pd.Series): - result = X.copy().to_frame().T - else: - result = X.copy().astype(np.float32) - # Need to reverse the order of transformers for inverse - for transformer, columns in reversed(self.transformers): - cols_to_transform = columns if columns is not None else result.columns - # Only use columns that exist in the input data - valid_cols = [col for col in cols_to_transform if col in result.columns] - if not valid_cols: - continue - sub_X = result[valid_cols].copy() # Create a copy to avoid reference issues - inverted = transformer.inverse_transform(sub_X) - result.loc[:, valid_cols] = np.array(inverted, dtype=np.float32).flatten().reshape(result.loc[:, valid_cols].shape) # Use loc for proper assignment - if isinstance(X, pd.Series): - result = result.iloc[0] - return result - -class AutobotsAssemble: - """Class for transforming features in a DataFrame using a pipeline approach.""" - - def __init__(self, df=None): - self.df = df.copy() if df is not None else None - self.pipeline = TransformerPipeline() - - def apply(self, transform_type, columns=None, **kwargs): - """Apply a transformation to specified columns.""" - transformer = self._create_transformer(transform_type, **kwargs) - if columns is None: - columns = list(self.df.columns) # Convert to list to avoid pandas index issues - - # Fit transformer to data if needed - if hasattr(transformer, 'fit') and callable(transformer.fit): - if self.df is not None: - df_subset = self.df[columns] - transformer.fit(df_subset) - - # Add to pipeline - self.pipeline.add(transformer, columns) - - # Apply transformation to current df if available - if self.df is not None: - # Use transform directly to ensure correct application - df_subset = self.df[columns].copy() - transformed = transformer.transform(df_subset) - self.df[columns] = transformed - - return self - - def transform(self, df): - """Transform an external DataFrame using the pipeline. 
- - Parameters - ---------- - df : pandas.DataFrame - The DataFrame to transform. - - Returns - ------- - pandas.DataFrame - The transformed DataFrame. - """ - if self.pipeline.transformers: - return self.pipeline.transform(df) - return df.copy() - - def inverse(self, df=None): - """Apply inverse transformations in reverse order.""" - to_transform = df if df is not None else self.df - result = self.pipeline.inverse_transform(to_transform) - if df is None: - self.df = result - return result - - def inverse_on_external_df(self, df, columns=None): - """Apply inverse transformations to an external DataFrame. - - Parameters - ---------- - df : pandas.DataFrame - The DataFrame to inverse transform. - columns : list, optional - Specific columns to inverse transform. If None, all columns are processed. - - Returns - ------- - pandas.DataFrame - The inverse-transformed DataFrame. - """ - to_transform = df.copy() - if columns is not None: - # Ensure we only process specified columns - missing_cols = [col for col in columns if col not in df.columns] - if missing_cols: - raise ValueError(f"Columns not found in DataFrame: {missing_cols}") - - return self.pipeline.inverse_transform(to_transform) - - def _create_transformer(self, transform_type, **kwargs): - """Factory method to create appropriate transformer.""" - if inspect.isclass(transform_type): - return GenericTransformer(transform_type, **kwargs) - elif transform_type == "log10": - return Log10Transformer(**kwargs) - elif transform_type == "normal_score": - return NormalScoreTransformer(**kwargs) - elif transform_type == "row_wise_minmax": - return RowWiseMinMaxScaler(**kwargs) - elif transform_type == "standard_scaler": - return StandardScalerTransformer(**kwargs) - elif transform_type == "minmax_scaler": - return MinMaxScaler(**kwargs) - else: +""" +Transformer classes for data transformations in emulators. 
+""" +from __future__ import print_function, division +import pyemu +import numpy as np +import pandas as pd +import importlib.util +import inspect + +# Check sklearn availability at module level +HAS_SKLEARN = importlib.util.find_spec("sklearn") is not None + +if HAS_SKLEARN: + from sklearn.preprocessing import StandardScaler +else: + # Create dummy classes or set to None + StandardScaler = None + + +class BaseTransformer: + """Base class for all transformers providing a consistent interface.""" + + def fit(self, X): + """Learn parameters from data if needed.""" + return self + + def transform(self, X): + """Apply transformation to X.""" + raise NotImplementedError + + def fit_transform(self, X): + """Fit and transform in one step.""" + return self.fit(X).transform(X) + + def inverse_transform(self, X): + """Inverse transform X back to original space.""" + raise NotImplementedError + +class Log10Transformer(BaseTransformer): + """Apply log10 transformation. + + Parameters + ---------- + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. + """ + + def __init__(self, columns=None): + self.columns = columns + self.shifts = {} + + def transform(self, X): + result = X.copy() + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + + for col in columns: + min_val = X[col].min() + shift = -min_val + 1e-6 if min_val <= 0 else 0 + self.shifts[col] = shift + result[col] = np.log10(X[col] + shift) + return result + + def inverse_transform(self, X): + result = X.copy() + for col in self.shifts.keys(): + if col in X.columns: + shift = self.shifts.get(col, 0) + result[col] = (10 ** X[col]) - shift + return result + +class RowWiseMinMaxScaler(BaseTransformer): + """Scale each row of a DataFrame to a specified range. + + Parameters + ---------- + feature_range : tuple (min, max), default=(-1, 1) + The range to scale features into. 
+ groups : dict or None, default=None + Dict mapping group names to lists of column names to be scaled together (entire timeseries for that group). + If None, all columns will be treated as a single group. + Example: {'group1': ['col1', 'col2'], 'group2': ['col3', 'col4']} + fit_groups : dict or None, default=None + Dict mapping group names to lists of column names (subset of groups) used to compute row-wise min and max. + If None, defaults to using the same columns as in groups. + """ + + def __init__(self, feature_range=(-1, 1), groups=None, fit_groups=None): + self.feature_range = feature_range + self.groups = groups + self.fit_groups = fit_groups if fit_groups is not None else groups + self.row_params = {} # Will store per-row (min, max) for each group + + def fit(self, X): + """Compute row-wise min and max for each group. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to fit the scaler on. + + Returns + ------- + self : object + Returns self. + """ + # If groups not specified, treat all columns as one group + if self.groups is None: + self.groups = {"all": X.columns.tolist()} + + if self.fit_groups is None: + self.fit_groups = self.groups.copy() + + # Calculate and store row-wise min and max for each group + self.row_params = {} + for group_name, group_cols in self.groups.items(): + # Determine which columns to use for computing min/max for each row + fit_cols = self.fit_groups.get(group_name, group_cols) + # Keep only columns that exist in the DataFrame + fit_cols = [col for col in fit_cols if col in X.columns] + if not fit_cols: + continue + + # Compute row-wise min and max using the fit columns + row_min = X[fit_cols].min(axis=1) + row_max = X[fit_cols].max(axis=1) + self.row_params[group_name] = (row_min, row_max) + + return self + + def transform(self, X): + """Scale each row of data to the specified range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. 
+ + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + result = X.copy() + f_min, f_max = self.feature_range + + # Auto-fit if not already fitted or if groups weren't specified + if not self.row_params or self.groups is None: + self.fit(X) + + # Transform each group + for group_name, group_cols in self.groups.items(): + # Keep only columns that exist in the DataFrame + valid_cols = [col for col in group_cols if col in X.columns] + if not valid_cols: + continue + + # Get the min and max for each row in this group + row_min, row_max = self.row_params[group_name] + + # Calculate the row range, avoiding division by zero + row_range = row_max - row_min + row_range[row_range == 0] = 1.0 # Set to 1 where range is 0 + + # For all columns in the group, scale using the row-wise parameters + group_data = X[valid_cols] + # First scale to [0, 1] + group_std = group_data.sub(row_min, axis=0).div(row_range, axis=0) + # Then scale to the desired feature range + result[valid_cols] = group_std * (f_max - f_min) + f_min + + return result + + def inverse_transform(self, X): + """Inverse transform data back to the original scale. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + if not self.row_params: + raise ValueError("This RowWiseMinMaxScaler instance is not fitted yet. 
" + "Call 'fit' before using this method.") + + result = X.copy() + f_min, f_max = self.feature_range + + # Inverse transform each group + for group_name, group_cols in self.groups.items(): + # Keep only columns that exist in the DataFrame + valid_cols = [col for col in group_cols if col in X.columns] + if not valid_cols: + continue + + # Get the min and max for each row in this group + row_min, row_max = self.row_params[group_name] + row_range = row_max - row_min + row_range[row_range == 0] = 1.0 # Avoid division by zero + + # Get the scaled data for this group + group_data = X[valid_cols] + + # First convert from feature_range to [0, 1] + group_std = (group_data - f_min) / (f_max - f_min) + + # Then recover original values + result[valid_cols] = group_std.mul(row_range, axis=0).add(row_min, axis=0) + + return result + +class MinMaxScaler(BaseTransformer): + """Scale each column of a DataFrame to a specified range. + + Parameters + ---------- + feature_range : tuple (min, max), default=(-1, 1) + The range to scale features into. + columns : list, optional + List of column names to be scaled. If None, all columns will be scaled. + skip_constant : bool, optional + If True, columns with constant values will be skipped. Default is True. + """ + + def __init__(self, feature_range=(-1, 1), columns=None, skip_constant=True): + self.feature_range = feature_range + self.columns = columns + self.skip_constant = skip_constant + self.min_ = {} + self.scale_ = {} + + def fit(self, X): + """Learn min and max values for scaling. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to fit the scaler on. + + Returns + ------- + self : object + Returns self. 
+ """ + columns = self.columns if self.columns is not None else X.columns + + # Ensure we only work with columns that exist in the DataFrame + columns = [col for col in columns if col in X.columns] + + for col in columns: + col_min = X[col].min() + col_max = X[col].max() + + # If the column has constant values and skip_constant is True, store the values but don't transform + if self.skip_constant and col_min == col_max: + self.min_[col] = col_min + self.scale_[col] = 0 # Flag for constant column + else: + # Store min and calculate scale factor for non-constant columns + self.min_[col] = col_min + # Avoid division by zero for nearly constant columns + if col_max - col_min > 1e-10: + self.scale_[col] = (self.feature_range[1] - self.feature_range[0]) / (col_max - col_min) + else: + # For nearly constant columns, set scale to 0 to keep original value + self.scale_[col] = 0 + + return self + + def transform(self, X): + """Scale features according to feature_range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + if not self.min_: + self.fit(X) + + result = X.copy() + + f_min, f_max = self.feature_range + + for col in self.min_.keys(): + if col not in X.columns: + continue + + # Skip columns marked as constant + if self.scale_[col] == 0: + continue + + # Apply scaling: X_std = (X - X.min) / (X.max - X.min) -> X_scaled = X_std * (max - min) + min + result[col] = (X[col] - self.min_[col]) * self.scale_[col] + f_min + + return result + + def inverse_transform(self, X): + """Undo the scaling of X according to feature_range. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + if not self.min_: + raise ValueError("This MinMaxScaler instance is not fitted yet. 
Call 'fit' before using this method.") + + result = X.copy() + + f_min, f_max = self.feature_range + + for col in self.min_.keys(): + if col not in X.columns: + continue + + # Skip columns marked as constant + if self.scale_[col] == 0: + continue + + # Apply inverse scaling: X_original = (X_scaled - min) / (max - min) * (X.max - X.min) + X.min + result[col] = (X[col] - f_min) / self.scale_[col] + self.min_[col] + + return result + +class StandardScalerTransformer(BaseTransformer): + """Wrapper around sklearn's StandardScaler for DataFrame compatibility. + + Parameters + ---------- + with_mean : bool, default=True + If True, center the data before scaling. + with_std : bool, default=True + If True, scale the data to unit variance. + copy : bool, default=True + If True, a copy of X will be created. If False, centering and scaling happen in-place. + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. + """ + + def __init__(self, with_mean=True, with_std=True, copy=True, columns=None): + self.with_mean = with_mean + self.with_std = with_std + self.copy = copy + self.columns = columns + self._sklearn_scaler = None + self._fitted_columns = None + + def fit(self, X): + # Determine which columns to fit + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + self._fitted_columns = columns + + # Create sklearn StandardScaler + self._sklearn_scaler = StandardScaler( + with_mean=self.with_mean, + with_std=self.with_std, + copy=self.copy + ) + + # Fit on numpy array (sklearn expects this) + if columns: + self._sklearn_scaler.fit(X[columns].values) + return self + + def transform(self, X): + if self._sklearn_scaler is None: + raise ValueError("Transformer must be fitted before transform") + + result = X.copy() + + if self._fitted_columns: + # Transform using sklearn + transformed_values = self._sklearn_scaler.transform(X[self._fitted_columns].values) + 
+ # Update only the fitted columns in the result + result[self._fitted_columns] = transformed_values + + return result + + def inverse_transform(self, X): + if self._sklearn_scaler is None: + raise ValueError("Transformer must be fitted before inverse_transform") + + result = X.copy() + + if self._fitted_columns: + # Inverse transform using sklearn + inverse_values = self._sklearn_scaler.inverse_transform(X[self._fitted_columns].values) + + # Update only the fitted columns in the result + result[self._fitted_columns] = inverse_values + + return result + +class GenericTransformer(BaseTransformer): + """Wrapper for generic sklearn-compatible transformers. + + Parameters + ---------- + transformer_class : class + The class of the transformer to be used (e.g. sklearn.preprocessing.QuantileTransformer). + kwargs : dict + Arguments to be passed to the transformer constructor. + """ + def __init__(self, transformer_class, **kwargs): + self.transformer = transformer_class(**kwargs) + + # Validation: check for fit, transform, inverse_transform methods on the instance + if not hasattr(self.transformer, "fit"): + raise ValueError(f"Transformer {transformer_class.__name__} must have a 'fit' method.") + if not hasattr(self.transformer, "transform"): + raise ValueError(f"Transformer {transformer_class.__name__} must have a 'transform' method.") + if not hasattr(self.transformer, "inverse_transform"): + raise ValueError(f"Transformer {transformer_class.__name__} must have an 'inverse_transform' method for use in pyemu emulators.") + + def fit(self, X): + self.transformer.fit(X.values) + return self + + def transform(self, X): + res = self.transformer.transform(X.values) + return pd.DataFrame(res, index=X.index, columns=X.columns) + + def inverse_transform(self, X): + res = self.transformer.inverse_transform(X.values) + return pd.DataFrame(res, index=X.index, columns=X.columns) + + +class NormalScoreTransformer(BaseTransformer): + """A transformer for normal score transformation. 
+ + Parameters + ---------- + tol : float, default=1e-7 + Tolerance for convergence in random generation. + max_samples : int, default=1000000 + Maximum number of samples for random generation. + quadratic_extrapolation : bool, default=False + Whether to use quadratic extrapolation for values outside the fitted range. + columns : list, optional + List of column names to be transformed. If None, all columns will be transformed. + """ + + def __init__(self, tol=1e-7, max_samples=1000000, quadratic_extrapolation=False, columns=None): + self.tol = tol + self.max_samples = max_samples + self.quadratic_extrapolation = quadratic_extrapolation + self.columns = columns + self.column_parameters = {} + self.shared_z_scores = {} + + def fit(self, X): + """Fit the transformer to the data.""" + columns = self.columns if self.columns is not None else X.columns + columns = [col for col in columns if col in X.columns] + + for col in columns: + values = X[col].values + sorted_vals = np.sort(values) + smoothed_vals = self._moving_average_with_endpoints(sorted_vals) + + n_points = len(smoothed_vals) + if n_points not in self.shared_z_scores: + self.shared_z_scores[n_points] = self._randrealgen_optimized(n_points) + + z_scores = self.shared_z_scores[n_points] + + self.column_parameters[col] = { + 'z_scores': z_scores, + 'originals': smoothed_vals, + } + return self + + def transform(self, X): + """Transform the data using normal score transformation. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame with normal scores. 
+ """ + result = X.copy() + for col in self.column_parameters.keys(): + if col not in X.columns: + continue + + params = self.column_parameters.get(col, {}) + z_scores = params.get('z_scores', []) + originals = params.get('originals', []) + + if len(z_scores) == 0 or len(originals) == 0: + continue + + values = X[col].values + + # Handle values outside the original range + min_orig, max_orig = np.min(originals), np.max(originals) + min_z, max_z = np.min(z_scores), np.max(z_scores) + + # For values within range, use interpolation + within_range = (values >= min_orig) & (values <= max_orig) + if within_range.any(): + result.loc[within_range, col] = np.interp( + values[within_range], originals, z_scores + ) + + # For values outside range, use extrapolation if enabled or clamp to bounds + below_min = values < min_orig + above_max = values > max_orig + + if below_min.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation below minimum + slope = (z_scores[1] - z_scores[0]) / (originals[1] - originals[0]) + result.loc[below_min, col] = min_z + slope * (values[below_min] - min_orig) + else: + # Otherwise clamp to minimum z-score + result.loc[below_min, col] = min_z + + if above_max.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation above maximum + slope = (z_scores[-1] - z_scores[-2]) / (originals[-1] - originals[-2]) + result.loc[above_max, col] = max_z + slope * (values[above_max] - max_orig) + else: + # Otherwise clamp to maximum z-score + result.loc[above_max, col] = max_z + + return result + + def inverse_transform(self, X): + """Inverse transform data back to original space. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame with transformed data to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. 
+ """ + result = X.astype(float).copy() + for col in self.column_parameters.keys(): + if col not in X.columns: + continue + + params = self.column_parameters.get(col, {}) + z_scores = params.get('z_scores', []) + originals = params.get('originals', []) + if len(z_scores) == 0 or len(originals) == 0: + continue + + # Get values to inverse transform + values = X[col].values + min_z, max_z = np.min(z_scores), np.max(z_scores) + min_orig, max_orig = np.min(originals), np.max(originals) + + # For values within the z-score range, use interpolation + within_range = (values >= min_z) & (values <= max_z) + if within_range.any(): + result.loc[within_range, col] = np.interp(values[within_range], z_scores, originals) + + # For values outside the z-score range, use extrapolation if enabled + below_min = values < min_z + above_max = values > max_z + + if below_min.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation below minimum z-score + slope = (originals[1] - originals[0]) / (z_scores[1] - z_scores[0]) + intercept = originals[0] - slope * z_scores[0] + result.loc[below_min, col] = slope * values[below_min] + intercept + else: + # Otherwise clamp to minimum original value + result.loc[below_min, col] = min_orig + + if above_max.any(): + if self.quadratic_extrapolation: + # Use linear extrapolation above maximum z-score + slope = (originals[-1] - originals[-2]) / (z_scores[-1] - z_scores[-2]) + intercept = originals[-1] - slope * z_scores[-1] + result.loc[above_max, col] = slope * values[above_max] + intercept + else: + # Otherwise clamp to maximum original value + result.loc[above_max, col] = max_orig + + return result + + def _randrealgen_optimized(self, nreal): + rval = np.zeros(nreal) + nsamp = 0 + numsort = (nreal + 1) // 2 if nreal % 2 == 0 else nreal // 2 + + while nsamp < self.max_samples: + nsamp += 1 + work1 = pyemu.en.rng.normal(size=nreal) + work1.sort() + + if nsamp > 1: + previous_mean = rval[:numsort] / (nsamp - 1) + rval[:numsort] += 
work1[:numsort] + current_mean = rval[:numsort] / nsamp + max_diff = np.max(np.abs(current_mean - previous_mean)) + + if max_diff <= self.tol: + break + else: + rval[:numsort] = work1[:numsort] + + rval[:numsort] /= nsamp + rval[numsort:] = -rval[:numsort][::-1] if nreal % 2 == 0 else np.concatenate(([-rval[numsort]], -rval[:numsort][::-1])) + return rval + + def _moving_average_with_endpoints(self, y_values): + """Apply a moving average smoothing to an array while preserving endpoints.""" + window_size = 3 + if y_values.shape[0] > 40: + window_size = 5 + if y_values.shape[0] > 90: + window_size = 7 + if y_values.shape[0] > 200: + window_size = 9 + + if window_size % 2 == 0: + raise ValueError("window_size must be odd") + half_window = window_size // 2 + smoothed_y = np.zeros_like(y_values) + + # Handle start points correctly + for i in range(0, half_window): + smoothed_y[i] = np.mean(y_values[:i + half_window + 1]) + + # Handle end points correctly + for i in range(1, half_window + 1): + smoothed_y[-i] = np.mean(y_values[-(i + half_window):]) + + # Middle points + for i in range(half_window, len(y_values) - half_window): + smoothed_y[i] = np.mean(y_values[i - half_window:i + half_window + 1]) + + # Preserve original endpoints exactly + smoothed_y[0] = y_values[0] + smoothed_y[-1] = y_values[-1] + + # Ensure monotonicity + for i in range(1, len(smoothed_y)): + if smoothed_y[i] <= smoothed_y[i - 1]: + smoothed_y[i] = smoothed_y[i - 1] + 1e-16 + + return smoothed_y + +class TransformerPipeline: + """Apply a sequence of transformers in order.""" + + def __init__(self): + self.transformers = [] + self.fitted = False + + def add(self, transformer, columns=None): + """Add a transformer to the pipeline, optionally for specific columns.""" + self.transformers.append((transformer, columns)) + return self + + def fit(self, X): + """Fit all transformers in the pipeline.""" + for transformer, columns in self.transformers: + cols_to_transform = columns if columns is not None 
else X.columns + sub_X = X[cols_to_transform] + transformer.fit(sub_X) + self.fitted = True + return self + + def transform(self, X): + """Transform data using all transformers in the pipeline. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + result = X.copy() + for transformer, columns in self.transformers: + cols_to_transform = columns if columns is not None else X.columns + # Only use columns that exist in the input data + valid_cols = [col for col in cols_to_transform if col in X.columns] + if not valid_cols: + continue + sub_X = result[valid_cols] + result[valid_cols] = transformer.transform(sub_X) + return result + + def fit_transform(self, X): + """Fit all transformers and transform data in one operation.""" + self.fit(X) + return self.transform(X) + + def inverse_transform(self, X): + """Apply inverse transformations in reverse order. + + Parameters + ---------- + X : pandas.DataFrame + The DataFrame to inverse transform. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. 
+ """ + + if isinstance(X, pd.Series): + result = X.copy().to_frame().T + else: + result = X.copy().astype(np.float32) + # Need to reverse the order of transformers for inverse + for transformer, columns in reversed(self.transformers): + cols_to_transform = columns if columns is not None else result.columns + # Only use columns that exist in the input data + valid_cols = [col for col in cols_to_transform if col in result.columns] + if not valid_cols: + continue + sub_X = result[valid_cols].copy() # Create a copy to avoid reference issues + inverted = transformer.inverse_transform(sub_X) + result.loc[:, valid_cols] = np.array(inverted, dtype=np.float32).flatten().reshape(result.loc[:, valid_cols].shape) # Use loc for proper assignment + if isinstance(X, pd.Series): + result = result.iloc[0] + return result + +class AutobotsAssemble: + """Class for transforming features in a DataFrame using a pipeline approach.""" + + def __init__(self, df=None): + self.df = df.copy() if df is not None else None + self.pipeline = TransformerPipeline() + + def apply(self, transform_type, columns=None, **kwargs): + """Apply a transformation to specified columns.""" + transformer = self._create_transformer(transform_type, **kwargs) + if columns is None: + columns = list(self.df.columns) # Convert to list to avoid pandas index issues + + # Fit transformer to data if needed + if hasattr(transformer, 'fit') and callable(transformer.fit): + if self.df is not None: + df_subset = self.df[columns] + transformer.fit(df_subset) + + # Add to pipeline + self.pipeline.add(transformer, columns) + + # Apply transformation to current df if available + if self.df is not None: + # Use transform directly to ensure correct application + df_subset = self.df[columns].copy() + transformed = transformer.transform(df_subset) + self.df[columns] = transformed + + return self + + def transform(self, df): + """Transform an external DataFrame using the pipeline. 
+ + Parameters + ---------- + df : pandas.DataFrame + The DataFrame to transform. + + Returns + ------- + pandas.DataFrame + The transformed DataFrame. + """ + if self.pipeline.transformers: + return self.pipeline.transform(df) + return df.copy() + + def inverse(self, df=None): + """Apply inverse transformations in reverse order.""" + to_transform = df if df is not None else self.df + result = self.pipeline.inverse_transform(to_transform) + if df is None: + self.df = result + return result + + def inverse_on_external_df(self, df, columns=None): + """Apply inverse transformations to an external DataFrame. + + Parameters + ---------- + df : pandas.DataFrame + The DataFrame to inverse transform. + columns : list, optional + Specific columns to inverse transform. If None, all columns are processed. + + Returns + ------- + pandas.DataFrame + The inverse-transformed DataFrame. + """ + to_transform = df.copy() + if columns is not None: + # Ensure we only process specified columns + missing_cols = [col for col in columns if col not in df.columns] + if missing_cols: + raise ValueError(f"Columns not found in DataFrame: {missing_cols}") + + return self.pipeline.inverse_transform(to_transform) + + def _create_transformer(self, transform_type, **kwargs): + """Factory method to create appropriate transformer.""" + if inspect.isclass(transform_type): + return GenericTransformer(transform_type, **kwargs) + elif transform_type == "log10": + return Log10Transformer(**kwargs) + elif transform_type == "normal_score": + return NormalScoreTransformer(**kwargs) + elif transform_type == "row_wise_minmax": + return RowWiseMinMaxScaler(**kwargs) + elif transform_type == "standard_scaler": + return StandardScalerTransformer(**kwargs) + elif transform_type == "minmax_scaler": + return MinMaxScaler(**kwargs) + else: raise ValueError(f"Unknown transform type: {transform_type}") \ No newline at end of file diff --git a/pyemu/en.py b/pyemu/en.py index bcc25ac6f..992d5af5d 100644 --- a/pyemu/en.py 
+++ b/pyemu/en.py @@ -9,7 +9,7 @@ from .pyemu_warnings import PyemuWarning SEED = 358183147 # from random.org on 5 Dec 2016 -np.random.seed(SEED) +rng = np.random.default_rng(SEED) class Loc(object): @@ -130,16 +130,17 @@ def __pow__(self, pow): @staticmethod def reseed(): - """reset the `numpy.random.seed` + """reset the `pyemu.en.rng` local random generator Note: reseeds using the pyemu.en.SEED global variable - The pyemu.en.SEED value is set as the numpy.random.seed on import, so + The pyemu.en.SEED value is used to initialize the `rng` on import, so make sure you know what you are doing if you call this method... """ - np.random.seed(SEED) + global rng + rng = np.random.default_rng(SEED) def copy(self): """get a copy of `Ensemble` @@ -477,7 +478,7 @@ def _gaussian_draw( stds = { name: std for name, std in zip(cov.row_names, np.sqrt(cov.x.flatten())) } - snv = np.random.randn(num_reals, mean_values.shape[0]) + snv = rng.standard_normal((num_reals, mean_values.shape[0])) reals = np.zeros_like(snv) reals[:, :] = np.nan for i, name in enumerate(mean_values.index): @@ -509,7 +510,7 @@ def _gaussian_draw( names = None snames = None idxs = [mv_map[name] for name in cnames] - snv = np.random.randn(num_reals, len(cnames)) + snv = rng.standard_normal((num_reals, len(cnames))) cov_grp = cov.get(cnames) if len(cnames) == 1: std = np.sqrt(cov_grp.x) @@ -540,7 +541,7 @@ def _gaussian_draw( reals[i, idxs] = group_mean_values + np.dot(a, snv[i, :]) else: - snv = np.random.randn(num_reals, cov.shape[0]) + snv = rng.standard_normal((num_reals, cov.shape[0])) if factor == "eigen": a, i = Ensemble._get_eigen_projection_matrix(cov.as_2d) elif factor == "cholesky": @@ -588,7 +589,7 @@ def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None) mu_vec = self._df.loc[:,names].mean() - snv_draws = np.random.standard_normal((num_reals,self.shape[0])) + snv_draws = rng.standard_normal((num_reals,self.shape[0])) noise = 0.0 if include_noise is not False: @@ -601,8 
+602,8 @@ def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None) missing = set(self.columns.to_list()) - set(noise_reals.columns) if len(missing) > 0: raise Exception("the following names are not in `noise_reals`: "+",".join(missing)) - #noise_real_choices = np.random.choice(noise_reals.index,num_reals) - noise_real_choices = np.random.randint(0,noise_reals.shape[0],num_reals) + #noise_real_choices = rng.choice(noise_reals.index,num_reals) + noise_real_choices = rng.integers(0,noise_reals.shape[0],num_reals) noise_back_trans = False if not noise_reals.istransformed: noise_reals.transform() @@ -618,7 +619,7 @@ def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None) reals.append(real) if noise != 0.0: if noise_reals is None: - noise_real = np.random.normal(0.0,noise,real.shape[0]) + noise_real = rng.normal(0.0,noise,real.shape[0]) else: #noise_real = noise * noise_deviations.loc[noise_real_choices[i],names].values noise_real = noise * nmat[noise_real_choices[i],:] @@ -1162,7 +1163,7 @@ def from_triangular_draw(cls, pst, num_reals=100, fill=True): log-transformed parameters are drawn in log space. The returned `ParameterEnsemble` is back transformed (not in log space) - uses numpy.random.triangular + uses pyemu.en.rng.triangular Example:: @@ -1196,7 +1197,7 @@ def from_triangular_draw(cls, pst, num_reals=100, fill=True): for i, pname in enumerate(pst.parameter_data.parnme): # print(pname, lb[pname], ub[pname]) if pname in adj_par_names: - arr[:, i] = np.random.triangular( + arr[:, i] = rng.triangular( lb[pname], pv[pname], ub[pname], size=num_reals ) elif fill: @@ -1230,7 +1231,7 @@ def from_uniform_draw(cls, pst, num_reals, fill=True): log-transformed parameters are drawn in log space. 
The returned `ParameterEnsemble` is back transformed (not in log space) - uses numpy.random.uniform + uses pyemu.en.rng.uniform Example:: @@ -1258,7 +1259,7 @@ def from_uniform_draw(cls, pst, num_reals, fill=True): for i, pname in enumerate(pst.parameter_data.parnme): # print(pname,lb[pname],ub[pname]) if pname in adj_par_names: - arr[:, i] = np.random.uniform(lb[pname], ub[pname], size=num_reals) + arr[:, i] = rng.uniform(lb[pname], ub[pname], size=num_reals) elif fill: arr[:, i] = ( np.zeros((num_reals)) + pst.parameter_data.loc[pname, "parval1"] diff --git a/pyemu/mat/mat_handler.py b/pyemu/mat/mat_handler.py index e0742a04b..d180ccffe 100644 --- a/pyemu/mat/mat_handler.py +++ b/pyemu/mat/mat_handler.py @@ -1,4 +1,5 @@ from __future__ import print_function, division +import pyemu import os import copy import struct @@ -146,7 +147,7 @@ class Matrix(object): Example:: - data = np.random.random((10,10)) + data = pyemu.en.rng.random((10,10)) row_names = ["row_{0}".format(i) for i in range(10)] col_names = ["col_{0}".format(j) for j in range(10)] mat = pyemu.Matrix(x=data,row_names=row_names,col_names=col_names) @@ -2746,7 +2747,7 @@ def from_names( """ if random: return cls( - x=np.random.random((len(row_names), len(col_names))), + x=pyemu.en.rng.random((len(row_names), len(col_names))), row_names=row_names, col_names=col_names, isdiagonal=isdiagonal, @@ -2924,7 +2925,7 @@ class Cov(Matrix): Example:: - data = np.random.random((10,10)) + data = pyemu.en.rng.random((10,10)) names = ["par_{0}".format(i) for i in range(10)] mat = pyemu.Cov(x=data,names=names) mat.to_binary("mat.jco") diff --git a/pyemu/plot/plot_utils.py b/pyemu/plot/plot_utils.py index 20eed1682..57fd4e942 100644 --- a/pyemu/plot/plot_utils.py +++ b/pyemu/plot/plot_utils.py @@ -1795,7 +1795,7 @@ def plot_jac_test( num_obs_plotted = np.min(np.array([maxoutputpages * 32, len(targetobs)])) if num_obs_plotted < len(targetobs): # get random sample - index_plotted = np.random.choice(len(targetobs), 
num_obs_plotted, replace=False) + index_plotted = pyemu.en.rng.choice(len(targetobs), num_obs_plotted, replace=False) obs_plotted = [targetobs[x] for x in index_plotted] real_pages = maxoutputpages else: diff --git a/pyemu/utils/geostats.py b/pyemu/utils/geostats.py index 052265d31..ca3aea105 100644 --- a/pyemu/utils/geostats.py +++ b/pyemu/utils/geostats.py @@ -471,8 +471,8 @@ def draw_arrays(self, num_reals=1, mean_value=1.0): reals = [] for ireal in range(num_reals): - real = np.random.standard_normal(size=self.sqrt_fftc.shape) - imag = np.random.standard_normal(size=self.sqrt_fftc.shape) + real = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) + imag = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) epsilon = real + 1j * imag rand = epsilon * self.sqrt_fftc real = np.real(np.fft.ifftn(rand)) * self.num_pts @@ -691,7 +691,7 @@ def draw_conditional( # read in the base values, Z(x), assume these are not log-transformed values_krige = np.loadtxt(base_values_file) - np.random.seed(int(seed)) + pyemu.en.rng = pyemu.en.rng.default_rng(int(seed)) # draw random fields for num_reals unconditioned = self.draw_arrays(num_reals=num_reals, mean_value=mean_value) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index 71a7a4c19..a2a28b2b0 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -4172,7 +4172,7 @@ def apply_threshold_pars(csv_file): return tarr.mean(), 1.0 # print("WARNING: thresholding array {0} has very low standard deviation, adding noise".format(thresarr_file)) - # tarr += np.random.normal(0, tol*2.0, tarr.shape) + # tarr += pyemu.en.rng.normal(0, tol*2.0, tarr.shape) # a classic: gr = (np.sqrt(5.) + 1.) / 2. 
From 56112fc03929edf773e9c349ef530d0de64468f1 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Tue, 10 Mar 2026 17:50:21 -0600 Subject: [PATCH 02/10] Switch to np.random.RandomState for strict cross-version reproducibility --- autotest/pst_from_tests.py | 2 +- autotest/utils_tests.py | 2 +- pyemu/en.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index 30f9e085f..00a5d8b5a 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -3729,7 +3729,7 @@ def test_usg_freyberg(tmp_path): visited = set() for i in range(num_pp): while True: - idx = pyemu.en.rng.integers(0,len(sr_dict_by_layer[1])) + idx = pyemu.en.rng.randint(0,len(sr_dict_by_layer[1])) if idx not in visited: break x,y = sr_dict_by_layer[1][idx] diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index 7cbc04066..15551cb73 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -2827,7 +2827,7 @@ def thresh_pars_test(): arr = np.ones((dim,dim)) gs = pyemu.geostats.GeoStruct(variograms=[pyemu.geostats.ExpVario(1.0,30.0)]) ss = pyemu.geostats.SpecSim2d(np.ones(dim),np.ones(dim),gs) - #seed = pyemu.en.rng.integers(100000) + #seed = pyemu.en.rng.randint(100000) pyemu.en.rng = pyemu.en.rng.default_rng(9371) #print("seed",seed) arr = 10**(ss.draw_arrays()[0]) diff --git a/pyemu/en.py b/pyemu/en.py index 992d5af5d..7739a45bf 100644 --- a/pyemu/en.py +++ b/pyemu/en.py @@ -9,7 +9,7 @@ from .pyemu_warnings import PyemuWarning SEED = 358183147 # from random.org on 5 Dec 2016 -rng = np.random.default_rng(SEED) +rng = np.random.RandomState(SEED) class Loc(object): @@ -140,7 +140,7 @@ def reseed(): """ global rng - rng = np.random.default_rng(SEED) + rng = np.random.RandomState(SEED) def copy(self): """get a copy of `Ensemble` @@ -603,7 +603,7 @@ def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None) if len(missing) > 0: raise Exception("the following names are not 
in `noise_reals`: "+",".join(missing)) #noise_real_choices = rng.choice(noise_reals.index,num_reals) - noise_real_choices = rng.integers(0,noise_reals.shape[0],num_reals) + noise_real_choices = rng.randint(0,noise_reals.shape[0],num_reals) noise_back_trans = False if not noise_reals.istransformed: noise_reals.transform() From f5b3a3d3847d608740523815b37e29160b10e7e4 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Tue, 10 Mar 2026 22:16:08 -0600 Subject: [PATCH 03/10] Roll back to legacy generator for compability, attempt namespace shenanigans for en.py --- autotest/emulator_tests.py | 22 +++++++++++----------- autotest/mc_tests_ignore.py | 2 +- autotest/metrics_tests.py | 10 +++++----- autotest/pst_from_tests.py | 16 ++++++++-------- autotest/pst_tests.py | 8 ++++---- autotest/transformer_tests.py | 24 ++++++++++++------------ autotest/utils_tests.py | 28 ++++++++++++++-------------- pyemu/emulators/dsiae.py | 2 +- pyemu/en.py | 35 ++++++++++++++++++++++++----------- pyemu/utils/geostats.py | 20 +++++++++++++------- pyemu/utils/helpers.py | 23 +++++++++++++---------- pyemu/utils/pst_from.py | 5 +++-- 12 files changed, 109 insertions(+), 86 deletions(-) diff --git a/autotest/emulator_tests.py b/autotest/emulator_tests.py index 6be78820a..0b6a6e3f1 100644 --- a/autotest/emulator_tests.py +++ b/autotest/emulator_tests.py @@ -450,8 +450,8 @@ def test_autoencoder_basic(): from pyemu.emulators.dsiae import AutoEncoder # Create simple synthetic data - pyemu.en.rng = pyemu.en.rng.default_rng(42) - X = pyemu.en.rng.standard_normal((50, 10,)).astype(np.float32) # 50 samples, 10 features + rng = np.random.RandomState(42) + X = rng.standard_normal((50, 10,)).astype(np.float32) # 50 samples, 10 features # Test initialization ae = AutoEncoder(input_dim=10, latent_dim=3, hidden_dims=(8, 4)) @@ -477,8 +477,8 @@ def test_autoencoder_pandas_input(): from pyemu.emulators.dsiae import AutoEncoder # Create pandas DataFrame - pyemu.en.rng = pyemu.en.rng.default_rng(42) - data = 
pd.DataFrame(pyemu.en.rng.standard_normal((30, 8,)), + rng = np.random.RandomState(42) + data = pd.DataFrame(rng.standard_normal((30, 8,)), columns=[f'feature_{i}' for i in range(8)], index=[f'sample_{i}' for i in range(30)]) @@ -631,8 +631,8 @@ def test_autoencoder_basic(): from pyemu.emulators.dsiae import AutoEncoder # Create simple synthetic data - pyemu.en.rng = pyemu.en.rng.default_rng(42) - X = pyemu.en.rng.standard_normal((50, 10,)).astype(np.float32) # 50 samples, 10 features + rng = np.random.RandomState(42) + X = rng.standard_normal((50, 10,)).astype(np.float32) # 50 samples, 10 features # Test initialization ae = AutoEncoder(input_dim=10, latent_dim=3, hidden_dims=(8, 4)) @@ -658,8 +658,8 @@ def test_autoencoder_pandas_input(): from pyemu.emulators.dsiae import AutoEncoder # Create pandas DataFrame - pyemu.en.rng = pyemu.en.rng.default_rng(42) - data = pd.DataFrame(pyemu.en.rng.standard_normal((30, 8,)), + rng = np.random.RandomState(42) + data = pd.DataFrame(rng.standard_normal((30, 8,)), columns=[f'feature_{i}' for i in range(8)], index=[f'sample_{i}' for i in range(30)]) @@ -994,10 +994,10 @@ def test_lpfa_synth(tmp_path): t = np.linspace(0, 10, 50) data = [] n_real = 30 - pyemu.en.rng = pyemu.en.rng.default_rng(42) + rng = np.random.RandomState(42) for i in range(n_real): - phase = pyemu.en.rng.uniform(0, 2*np.pi) - amp = pyemu.en.rng.uniform(0.8, 1.2) + phase = rng.uniform(0, 2*np.pi) + amp = rng.uniform(0.8, 1.2) # Inputs (history) hist = amp * np.sin(t[:10] + phase) # Outputs (forecast) diff --git a/autotest/mc_tests_ignore.py b/autotest/mc_tests_ignore.py index 5edad531b..3a8167405 100644 --- a/autotest/mc_tests_ignore.py +++ b/autotest/mc_tests_ignore.py @@ -193,7 +193,7 @@ def ensemble_seed_test(): pe1.reseed() pe1.draw(cov,num_reals=10) - #pyemu.en.rng = pyemu.en.rng.default_rng(1111) + #pyemu.en.rng = np.random.RandomState(1111) pe2.reseed() pe2.draw(cov,num_reals=10) assert (pe1-pe2).apply(np.abs).as_matrix().max() == 0.0 diff --git 
a/autotest/metrics_tests.py b/autotest/metrics_tests.py index 3e59dc4a8..3b5db9315 100644 --- a/autotest/metrics_tests.py +++ b/autotest/metrics_tests.py @@ -5,10 +5,10 @@ def res_and_ens_test(): import pyemu # make some fake residuals - pyemu.en.rng = pyemu.en.rng.default_rng(42) + rng = np.random.RandomState(42) t = np.linspace(1,20, 200) obs = t/10 * np.sin(np.pi*t) - mod = obs+pyemu.en.rng.standard_normal((200,))*.5 + mod = obs+rng.standard_normal((200,))*.5 obsnames = ['ob_t_{:03d}'.format(i) for i in range(len(t))] obsgroups = ['start_grp' if i<80 else 'end_grp' for i in range(len(t))] res = pd.DataFrame({'name':obsnames, @@ -18,11 +18,11 @@ def res_and_ens_test(): 'residual':obs-mod, 'weight':np.ones(len(t))}) res.set_index(res['name'], inplace=True) - pyemu.en.rng = pyemu.en.rng.default_rng(98) - res.weight = [float(i>.5) for i in pyemu.en.rng.random(200)] + rng = np.random.RandomState(98) + res.weight = [float(i>.5) for i in rng.random(200)] # and an ensemble version - ens = pd.DataFrame(np.tile(obs,(10,1))+pyemu.en.rng.standard_normal((10,200,))*.5, columns=obsnames) + ens = pd.DataFrame(np.tile(obs,(10,1))+rng.standard_normal((10,200,))*.5, columns=obsnames) ens.loc['base'] = mod # cook up a PEST file for obs and weights diff --git a/autotest/pst_from_tests.py b/autotest/pst_from_tests.py index 00a5d8b5a..381efc1c4 100644 --- a/autotest/pst_from_tests.py +++ b/autotest/pst_from_tests.py @@ -46,9 +46,9 @@ def _gen_dummy_obs_file(ws='.', sep=',', ext=None): else: t.append(text[c]) c += 1 - pyemu.en.rng = pyemu.en.rng.default_rng(314) + rng = np.random.RandomState(314) df = pd.DataFrame( - pyemu.en.rng.random((15,2,))*1000, + rng.random((15,2,))*1000, columns=['no', 'yes'], index=t ) @@ -3723,13 +3723,13 @@ def test_usg_freyberg(tmp_path): zone_array_k2[:,:100] = 4 #gen up some fake pp locs - pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) + rng = np.random.RandomState(pyemu.en.SEED) num_pp = 20 data = {"name":[],"x":[],"y":[],"zone":[]} visited = 
set() for i in range(num_pp): while True: - idx = pyemu.en.rng.randint(0,len(sr_dict_by_layer[1])) + idx = rng.randint(0,len(sr_dict_by_layer[1])) if idx not in visited: break x,y = sr_dict_by_layer[1][idx] @@ -3943,7 +3943,7 @@ def _add_big_obsffile(pf, profile=False, nchar=50000): else: pstfrom_add = True wd = pf.new_d - pyemu.en.rng = pyemu.en.rng.default_rng(314) + rng = np.random.RandomState(314) df = pd.DataFrame(pyemu.en.rng.random([10, nchar]), columns=[hex(c) for c in range(nchar)]) df.index.name = 'time' @@ -4916,7 +4916,7 @@ def mf6_freyberg_thresh_test(tmp_path): org_par = par.copy() num_reals = 30 - pyemu.en.rng = pyemu.en.rng.default_rng() + rng = np.random.RandomState() pe = pf.draw(num_reals, use_specsim=False) pe.enforce() # print(pe.shape) @@ -6446,8 +6446,8 @@ def draw_consistency_test(tmp_path): gpar = par.loc[par.parnme.str.contains("fix"),:] assert gpar.shape[0] == gwf.dis.nrow.data * gwf.dis.ncol.data par.loc[gpar.parnme,"partrans"] = "fixed" - pyemu.en.rng = pyemu.en.rng.default_rng(111) - pe = pf.draw(num_reals=10, use_specsim=True) # draw parameters from the prior distribution + rng = np.random.RandomState(111) + pe = pf.draw(num_reals=10, use_specsim=True, rng=rng) # draw parameters from the prior distribution print("abs max:",np.nanmax(np.abs(pe.values))) # no bs values... 
assert np.nanmax(np.abs(pe.values)) < 100000 diff --git a/autotest/pst_tests.py b/autotest/pst_tests.py index 5ed4ce910..8d2df6c91 100644 --- a/autotest/pst_tests.py +++ b/autotest/pst_tests.py @@ -1374,8 +1374,8 @@ def parrep_test(tmp_path): import numpy as np # make some fake parnames and values parnames = ['p_{0:03}'.format(i) for i in range(20)] - pyemu.en.rng = pyemu.en.rng.default_rng(42) - parvals = pyemu.en.rng.random(20) + 5 + rng = np.random.RandomState(42) + parvals = rng.random(20) + 5 parvals[0] = 0.001 bd = os.getcwd() os.chdir(tmp_path) @@ -1386,8 +1386,8 @@ def parrep_test(tmp_path): [ofp.write('{0:10s} {1:12.6f} 1.00 0.0\n'.format(i,j)) for i,j in zip(parnames,parvals)] # make a fake ensemble parameter file - pyemu.en.rng = pyemu.en.rng.default_rng(99) - parens = pd.DataFrame(np.tile(parvals,(5,1))+pyemu.en.rng.standard_normal((5,20,))*.5, columns=parnames) + rng = np.random.RandomState(99) + parens = pd.DataFrame(np.tile(parvals,(5,1))+rng.standard_normal((5,20,))*.5, columns=parnames) parens.index = list(range(4)) + ['base'] parens.index.name = 'real_name' parens.loc['base'] = parvals[::-1] diff --git a/autotest/transformer_tests.py b/autotest/transformer_tests.py index 50d888112..ab43825c8 100755 --- a/autotest/transformer_tests.py +++ b/autotest/transformer_tests.py @@ -108,19 +108,19 @@ def test_row_wise_minmax_scaler(): def test_normal_score_transformer(): """Test the NormalScoreTransformer functionality""" # Create test data with various distributions - pyemu.en.rng = pyemu.en.rng.default_rng(42) + rng = np.random.RandomState(42) n = 200 # Uniform data - uniform_data = pyemu.en.rng.uniform(0, 10, n) + uniform_data = rng.uniform(0, 10, n) # Log-normal data - lognormal_data = np.exp(pyemu.en.rng.normal(0, 1, n)) + lognormal_data = np.exp(rng.normal(0, 1, n)) # Bimodal data bimodal_data = np.concatenate([ - pyemu.en.rng.normal(-3, 1, n//2), - pyemu.en.rng.normal(3, 1, n//2) + rng.normal(-3, 1, n//2), + rng.normal(3, 1, n//2) ]) df = 
pd.DataFrame({ @@ -355,20 +355,20 @@ def kurtosis(x): def test_normal_score_with_external_data(): """Test NormalScoreTransformer with external already-transformed data""" # Create training data with a specific distribution - pyemu.en.rng = pyemu.en.rng.default_rng(42) + rng = np.random.RandomState(42) n = 100 training_data = pd.DataFrame({ - 'normal': pyemu.en.rng.normal(5, 2, n), - 'lognormal': np.exp(pyemu.en.rng.normal(1, 0.5, n)), - 'uniform': pyemu.en.rng.uniform(0, 10, n) + 'normal': rng.normal(5, 2, n), + 'lognormal': np.exp(rng.normal(1, 0.5, n)), + 'uniform': rng.uniform(0, 10, n) }) # Create "external" data that we'll pretend is already transformed # For this test, we'll generate values in the typical normal score range (-3 to 3) external_transformed = pd.DataFrame({ - 'normal': pyemu.en.rng.normal(0, 1, 1), # Already in normal score space - 'lognormal': pyemu.en.rng.normal(0, 1, 1), - 'uniform': pyemu.en.rng.normal(0, 1, 1) + 'normal': rng.normal(0, 1, 1), # Already in normal score space + 'lognormal': rng.normal(0, 1, 1), + 'uniform': rng.normal(0, 1, 1) }) # Initialize and fit transformer on training data diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index 15551cb73..be0018aa7 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -1069,10 +1069,10 @@ def geostat_draws_test(tmp_path): df = pyemu.pp_utils.pp_tpl_to_dataframe(tpl_file) df.loc[:,"zone"] = np.arange(df.shape[0]) gs = pyemu.geostats.read_struct_file(str_file) - pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) + rng = np.random.RandomState(pyemu.en.SEED) pe = pyemu.helpers.geostatistical_draws(pst_file,{gs:df}, sigma_range=4) - pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) + rng = np.random.RandomState(pyemu.en.SEED) pe2 = pyemu.helpers.geostatistical_draws(pst_file,{gs:df}, sigma_range=4) pe.to_csv(os.path.join(os.path.join("utils","geostat_pe.csv"))) @@ -2284,7 +2284,7 @@ def specsim_test(): variograms = 
[pyemu.geostats.ExpVario(contribution=contrib, a=a, anisotropy=10, bearing=0)] gs = pyemu.geostats.GeoStruct(variograms=variograms, transform="log", nugget=nugget) - pyemu.en.rng = pyemu.en.rng.default_rng(1) + rng = np.random.RandomState(1) ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) mean_value = 15.0 @@ -2302,13 +2302,13 @@ def specsim_test(): assert np.abs(var - theo_var) < 0.1 assert np.abs(mean - mean_value) < 0.1 - pyemu.en.rng = pyemu.en.rng.default_rng(1) + rng = np.random.RandomState(1) variograms = [pyemu.geostats.ExpVario(contribution=contrib, a=a, anisotropy=10, bearing=0)] gs = pyemu.geostats.GeoStruct(variograms=variograms, transform="none", nugget=nugget) ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) mean_value = 25.0 - reals = ss.draw_arrays(num_reals=num_reals,mean_value=mean_value) + reals = ss.draw_arrays(num_reals=num_reals,mean_value=mean_value, rng=rng) assert reals.shape == (num_reals,nrow,ncol) var = np.var(reals,axis=0).mean() mean = reals.mean() @@ -2336,18 +2336,18 @@ def aniso_invest(): variograms = [pyemu.geostats.ExpVario(contribution=2.5,a=2500.0,anisotropy=10,bearing=90)] gs = pyemu.geostats.GeoStruct(variograms=variograms,transform="none",nugget=0.0) - pyemu.en.rng = pyemu.en.rng.default_rng(1) + rng = np.random.RandomState(1) num_reals = 100 start = datetime.now() ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) mean_value = 1.0 - reals1 = ss.draw_arrays(num_reals=num_reals,mean_value=mean_value) + reals1 = ss.draw_arrays(num_reals=num_reals,mean_value=mean_value,rng=rng) print((datetime.now() - start).total_seconds()) variograms = [pyemu.geostats.ExpVario(contribution=2.5, a=2000.0, anisotropy=10, bearing=0)] gs = pyemu.geostats.GeoStruct(variograms=variograms, transform="none", nugget=0.0) ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) - reals2 = ss.draw_arrays(num_reals=num_reals, mean_value=mean_value) + reals2 = ss.draw_arrays(num_reals=num_reals, 
mean_value=mean_value,rng=rng) import matplotlib.pyplot as plt fig,axes = plt.subplots(1,2,figsize=(6,3)) @@ -2709,10 +2709,10 @@ def ac_draw_test(tmp_path): pst.write(os.path.join(tmp_path, "test.pst")) print(pst.observation_data.distance) - pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) - oe = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=100, enforce_bounds=True) - pyemu.en.rng = pyemu.en.rng.default_rng(pyemu.en.SEED) - oe2 = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=100, enforce_bounds=True) + rng = np.random.RandomState(pyemu.en.SEED) + oe = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=100, enforce_bounds=True, rng=rng) + rng = np.random.RandomState(pyemu.en.SEED) + oe2 = pyemu.helpers.autocorrelated_draw(pst, struct_dict, num_reals=100, enforce_bounds=True, rng=rng) diff = oe - oe2 print(diff.max()) assert diff.max().max() == 0.0 @@ -2828,9 +2828,9 @@ def thresh_pars_test(): gs = pyemu.geostats.GeoStruct(variograms=[pyemu.geostats.ExpVario(1.0,30.0)]) ss = pyemu.geostats.SpecSim2d(np.ones(dim),np.ones(dim),gs) #seed = pyemu.en.rng.randint(100000) - pyemu.en.rng = pyemu.en.rng.default_rng(9371) + rng = np.random.RandomState(9371) #print("seed",seed) - arr = 10**(ss.draw_arrays()[0]) + arr = 10**(ss.draw_arrays(rng=rng)[0]) print(arr) inact_arr = np.ones_like(arr,dtype=int) diff --git a/pyemu/emulators/dsiae.py b/pyemu/emulators/dsiae.py index f3dd74a46..5cf255ef5 100755 --- a/pyemu/emulators/dsiae.py +++ b/pyemu/emulators/dsiae.py @@ -861,7 +861,7 @@ def __init__(self, input_dim: int, latent_dim: int = 2, self.random_state = random_state tf.random.set_seed(random_state) - pyemu.en.rng = pyemu.en.rng.default_rng(random_state) + pyemu.en.rng = np.random.RandomState(random_state) self._build_model() # Build encoder/decoder diff --git a/pyemu/en.py b/pyemu/en.py index 7739a45bf..50b950498 100644 --- a/pyemu/en.py +++ b/pyemu/en.py @@ -456,8 +456,10 @@ def from_dataframe(cls, pst, df, 
istransformed=False): @staticmethod def _gaussian_draw( - cov, mean_values, num_reals, grouper=None, fill=True, factor="cholesky" + cov, mean_values, num_reals, grouper=None, fill=True, factor="cholesky", rng=None ): + if rng is None: + rng = pyemu.en.rng factor = factor.lower() if factor not in ["eigen", "svd", "cholesky"]: @@ -558,7 +560,7 @@ def _gaussian_draw( df.dropna(inplace=True, axis=1) return df - def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None): + def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None, rng=None): """Draw a new (potentially larger) Ensemble instance using the realizations in `self`. @@ -572,13 +574,15 @@ def _draw_new_ensemble(self,num_reals,names,include_noise=True,noise_reals=None) noise_reals (Ensemble): other existing realizations (likely prior realizations) that are used as noise realizations in place of IID noise that is used if `include_noise` is True and `noise_reals` is None. + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns Ensemble """ - + if rng is None: + rng = pyemu.en.rng back_trans = False if not self.istransformed: self.transform() @@ -824,8 +828,8 @@ def __init__(self, pst, df, istransformed=False): @classmethod def from_gaussian_draw( - cls, pst, cov=None, num_reals=100, by_groups=True, fill=False, factor="cholesky" - ): + cls, pst, cov=None, num_reals=100, by_groups=True, fill=False, factor="cholesky", + rng=None): """generate an `ObservationEnsemble` from a (multivariate) gaussian distribution @@ -846,6 +850,7 @@ def from_gaussian_draw( be "eigen", "svd", or "cholesky. The "cholesky" option is default and is faster. But for (nearly) singular cov matrices (such as those generated empirically from ensembles), "svd" and/or "eigen" might be required. Ignored for diagonal `cov`. 
+ rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns: `ObservationEnsemble`: the realized `ObservationEnsemble` instance @@ -877,6 +882,8 @@ def from_gaussian_draw( oe3 = pyemu.ObservationEnsemble.from_gaussian_draw(pst,cov=cov) """ + if rng is None: + rng = pyemu.en.rng if cov is None: cov = pyemu.Cov.from_observation_data(pst) obs = pst.observation_data @@ -904,6 +911,7 @@ def from_gaussian_draw( grouper=grouper, fill=fill, factor=factor, + rng=rng ) if fill: df.loc[:, pst.zero_weight_obs_names] = pst.observation_data.loc[ @@ -912,7 +920,7 @@ def from_gaussian_draw( return cls(pst, df, istransformed=False) - def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None): + def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None, rng=None): """Draw a new (potentially larger) ObservationEnsemble instance using the realizations in `self`. @@ -926,6 +934,7 @@ def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None): noise_reals (ObservationEnsemble): other existing realizations (likely prior realizations) that are used as noise realizations in place of IID noise that is used if `include_noise` is True and `noise_reals` is None. 
+ rng (np.random.RandomState): random number generator if not using default from pyemu.en Returns ObservationEnsemble @@ -937,7 +946,7 @@ def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None): names = self.pst.nnz_obs_names return self._draw_new_ensemble(num_reals,names,include_noise=include_noise, - noise_reals=noise_reals) + noise_reals=noise_reals, rng=rng) @property def phi_vector(self): @@ -1065,7 +1074,7 @@ def __init__(self, pst, df, istransformed=False): @classmethod def from_gaussian_draw( - cls, pst, cov=None, num_reals=100, by_groups=True, fill=True, factor="cholesky" + cls, pst, cov=None, num_reals=100, by_groups=True, fill=True, factor="cholesky", rng=None ): """generate a `ParameterEnsemble` from a (multivariate) (log) gaussian distribution @@ -1089,6 +1098,7 @@ def from_gaussian_draw( be "eigen", "svd", or "cholesky". The "cholesky" option is default and is faster. But for (nearly) singular cov matrices (such as those generated empirically from ensembles), "svd" and/or "eigen" might be required. Ignored for diagonal `cov`. + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns: `ParameterEnsemble`: the parameter ensemble realized from the gaussian @@ -1138,13 +1148,14 @@ def from_gaussian_draw( num_reals=num_reals, grouper=grouper, fill=fill, - factor=factor + factor=factor, + rng=rng ) df.loc[:, li] = 10.0 ** df.loc[:, li] return cls(pst, df, istransformed=False) @classmethod - def from_triangular_draw(cls, pst, num_reals=100, fill=True): + def from_triangular_draw(cls, pst, num_reals=100, fill=True, rng=None): """generate a `ParameterEnsemble` from a (multivariate) (log) triangular distribution Args: @@ -1152,6 +1163,7 @@ def from_triangular_draw(cls, pst, num_reals=100, fill=True): num_reals (`int`, optional): number of realizations to generate. Default is 100 fill (`bool`): flag to fill in fixed and/or tied parameters with control file values. Default is True. 
+ rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns: `ParameterEnsemble`: a parameter ensemble drawn from the multivariate (log) triangular @@ -1212,7 +1224,7 @@ def from_triangular_draw(cls, pst, num_reals=100, fill=True): return new_pe @classmethod - def from_uniform_draw(cls, pst, num_reals, fill=True): + def from_uniform_draw(cls, pst, num_reals, fill=True, rng=None): """generate a `ParameterEnsemble` from a (multivariate) (log) uniform distribution @@ -1221,6 +1233,7 @@ def from_uniform_draw(cls, pst, num_reals, fill=True): num_reals (`int`, optional): number of realizations to generate. Default is 100 fill (`bool`): flag to fill in fixed and/or tied parameters with control file values. Default is True. + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns: `ParameterEnsemble`: a parameter ensemble drawn from the multivariate (log) uniform diff --git a/pyemu/utils/geostats.py b/pyemu/utils/geostats.py index ca3aea105..340e5fb14 100644 --- a/pyemu/utils/geostats.py +++ b/pyemu/utils/geostats.py @@ -453,12 +453,13 @@ def initialize(self): self.num_pts = np.prod(xgrid.shape) self.sqrt_fftc = np.sqrt(fftc / self.num_pts) - def draw_arrays(self, num_reals=1, mean_value=1.0): + def draw_arrays(self, num_reals=1, mean_value=1.0, rng=None): """draw realizations Args: num_reals (`int`): number of realizations to generate mean_value (`float`): the mean value of the realizations + rng (`numpy.random.RandomState`): random number generator if not using default from pyemu.en Returns: `numpy.ndarray`: a 3-D array of realizations. 
Shape @@ -471,8 +472,12 @@ def draw_arrays(self, num_reals=1, mean_value=1.0): reals = [] for ireal in range(num_reals): - real = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) - imag = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) + if rng is None: + real = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) + imag = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) + else: + real = rng.standard_normal(size=self.sqrt_fftc.shape) + imag = rng.standard_normal(size=self.sqrt_fftc.shape) epsilon = real + 1j * imag rand = epsilon * self.sqrt_fftc real = np.real(np.fft.ifftn(rand)) * self.num_pts @@ -488,7 +493,7 @@ def draw_arrays(self, num_reals=1, mean_value=1.0): return reals def grid_par_ensemble_helper( - self, pst, gr_df, num_reals, sigma_range=6, logger=None + self, pst, gr_df, num_reals, sigma_range=6, logger=None, rng=None ): """wrapper around `SpecSim2d.draw()` designed to support `PstFromFlopy` and `PstFrom` grid-based parameters @@ -501,6 +506,7 @@ def grid_par_ensemble_helper( sigma_range (`float` (optional)): number of standard deviations implied by parameter bounds in control file. 
Default is 6 logger (`pyemu.Logger` (optional)): a logger instance for logging + rng (`numpy.random.RandomState` (optional)): random number generator if not using default from pyemu.en Returns: `pyemu.ParameterEnsemble`: an untransformed parameter ensemble of @@ -581,7 +587,7 @@ def grid_par_ensemble_helper( ) ) self.initialize() - reals = self.draw_arrays(num_reals=num_reals, mean_value=mean_arr) + reals = self.draw_arrays(num_reals=num_reals, mean_value=mean_arr, rng=rng) # put the pieces into the par en reals = reals[:, gp_df.i, gp_df.j].reshape(num_reals, gp_df.shape[0]) real_arrs.append(reals) @@ -691,10 +697,10 @@ def draw_conditional( # read in the base values, Z(x), assume these are not log-transformed values_krige = np.loadtxt(base_values_file) - pyemu.en.rng = pyemu.en.rng.default_rng(int(seed)) + rng = np.random.RandomState(int(seed)) # draw random fields for num_reals - unconditioned = self.draw_arrays(num_reals=num_reals, mean_value=mean_value) + unconditioned = self.draw_arrays(num_reals=num_reals, mean_value=mean_value, rng=rng) # If geostruct is log transformed, then work with log10 of field if self.geostruct.transform == "log": diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index a2a28b2b0..b4b28d63b 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -108,7 +108,7 @@ def _try_pdcol_numeric(x, first=True, intadj=0, **kwargs): def autocorrelated_draw(pst,struct_dict,time_distance_col="distance",num_reals=100,verbose=True, - enforce_bounds=False, draw_ineq=False): + enforce_bounds=False, draw_ineq=False, rng=None): """construct an autocorrelated observation noise ensemble from covariance matrices implied by geostatistical structure(s). @@ -129,6 +129,7 @@ def autocorrelated_draw(pst,struct_dict,time_distance_col="distance",num_reals=1 these are present in `* observation data`. Default is False draw_ineq (`bool`, optional): flag to generate noise realizations for inequality observations. 
If False, noise will not be added inequality observations in the ensemble. Default is False + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns @@ -178,7 +179,7 @@ def autocorrelated_draw(pst,struct_dict,time_distance_col="distance",num_reals=1 fcov_dict = {o:np.sqrt(fcov.x[i]) for i,o in enumerate(fcov.names)} if verbose: print("-->draw full obs en from diagonal cov") - full_oe = pyemu.ObservationEnsemble.from_gaussian_draw(pst,fcov,num_reals=num_reals,fill=True) + full_oe = pyemu.ObservationEnsemble.from_gaussian_draw(pst,fcov,num_reals=num_reals,fill=True,rng=rng) keys = list(struct_dict.keys()) keys.sort() #for gs,onames in struct_dict.items(): @@ -195,7 +196,7 @@ def autocorrelated_draw(pst,struct_dict,time_distance_col="distance",num_reals=1 gcov.x[i, :] *= fcov_dict[name] if verbose: print("...draw") - oe = pyemu.ObservationEnsemble.from_gaussian_draw(pst,gcov,num_reals=num_reals,fill=False,by_groups=False) + oe = pyemu.ObservationEnsemble.from_gaussian_draw(pst,gcov,num_reals=num_reals,fill=False,by_groups=False,rng=rng) oe = oe.loc[:,gcov.names] full_oe.loc[:,gcov.names] = oe._df.values @@ -245,7 +246,7 @@ def autocorrelated_draw(pst,struct_dict,time_distance_col="distance",num_reals=1 def draw_by_group(pst, num_reals=100, sigma_range=6, use_specsim=False, struct_dict=None, delr=None, delc=None, scale_offset=True, - echo=True, logger=False): + echo=True, logger=False, rng=None): """Draw a parameter ensemble from the distribution implied by the initial parameter values in the control file and a prior parameter covariance matrix derived from grouped geostructures. Previously in pst_from. 
@@ -283,6 +284,7 @@ def draw_by_group(pst, num_reals=100, sigma_range=6, use_specsim=False, echo (`bool`): Verbosity flag passed to new Logger instance if `logger`is None logger (`pyemu.Logger`, optional): Object for logging process + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns: `pyemu.ParameterEnsemble`: a prior parameter ensemble @@ -397,7 +399,7 @@ def draw_by_group(pst, num_reals=100, sigma_range=6, use_specsim=False, def geostatistical_draws( pst, struct_dict, num_reals=100, sigma_range=4, verbose=True, - scale_offset=True, subset=None + scale_offset=True, subset=None, rng=None ): """construct a parameter ensemble from a prior covariance matrix implied by geostatistical structure(s) and parameter bounds. @@ -421,6 +423,7 @@ def geostatistical_draws( Default is True. subset (`array-like`, optional): list, array, set or pandas index defining subset of parameters for draw. + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns **pyemu.ParameterEnsemble**: the realized parameter ensemble. 
@@ -558,7 +561,7 @@ def geostatistical_draws( cov.x[i, i] = full_cov_dict[name] # no fixed values here pe = pyemu.ParameterEnsemble.from_gaussian_draw( - pst=pst, cov=cov, num_reals=num_reals, by_groups=False, fill=False + pst=pst, cov=cov, num_reals=num_reals, by_groups=False, fill=False, rng=rng ) par_ens.append(pe._df) pars_in_cov.update(set(pe.columns)) @@ -575,7 +578,7 @@ def geostatistical_draws( # cov = full_cov.get(diff,diff) # here we fill in the fixed values pe = pyemu.ParameterEnsemble.from_gaussian_draw( - pst, cov, num_reals=num_reals, fill=False + pst, cov, num_reals=num_reals, fill=False, rng=rng ) par_ens.append(pe._df) par_ens = pd.concat(par_ens, axis=1) @@ -3850,7 +3853,7 @@ def _maha(delta,v,x,z,lower_inv): return d_m -def get_maha_obs_summary(sim_en, l1_crit_val=6.34, l2_crit_val=9.2): +def get_maha_obs_summary(sim_en, l1_crit_val=6.34, l2_crit_val=9.2, rng=None): """calculate the 1-D and 2-D mahalanobis distance between simulated ensemble and observed values. Used for detecting prior-data conflict @@ -3860,6 +3863,7 @@ def get_maha_obs_summary(sim_en, l1_crit_val=6.34, l2_crit_val=9.2): mahalanobis distance. Default is 6.4 (p=0.01,df=1) l2_crit_val (`float`): the chi squared critical value for the 2-D mahalanobis distance. Default is 9.2 (p=0.01,df=2) + rng (np.random.RandomState): random number generator if not using default from pyemu.en Returns: @@ -3875,7 +3879,6 @@ def get_maha_obs_summary(sim_en, l1_crit_val=6.34, l2_crit_val=9.2): noise. 
""" - if not isinstance(sim_en, pyemu.ObservationEnsemble): raise Exception("'sim_en' must be a " + " pyemu.ObservationEnsemble instance") if sim_en.pst.nnz_obs < 1: @@ -3892,7 +3895,7 @@ def get_maha_obs_summary(sim_en, l1_crit_val=6.34, l2_crit_val=9.2): nnz_en.reseed() obsmean = obs.loc[nnz_en.columns.values, "obsval"] noise_en = pyemu.ObservationEnsemble.from_gaussian_draw( - sim_en.pst, num_reals=sim_en.shape[0] + sim_en.pst, num_reals=sim_en.shape[0], rng=rng ) noise_en -= obsmean # subtract off the obs val bc we just want the noise noise_en.index = nnz_en.index diff --git a/pyemu/utils/pst_from.py b/pyemu/utils/pst_from.py index 2f652ec70..ab733f1f3 100644 --- a/pyemu/utils/pst_from.py +++ b/pyemu/utils/pst_from.py @@ -656,7 +656,7 @@ def build_prior( self.logger.log("building prior covariance matrix") return cov - def draw(self, num_reals=100, sigma_range=6, use_specsim=False, scale_offset=True): + def draw(self, num_reals=100, sigma_range=6, use_specsim=False, scale_offset=True, rng=None): """Draw a parameter ensemble from the distribution implied by the initial parameter values in the control file and the prior parameter covariance matrix. @@ -669,6 +669,7 @@ def draw(self, num_reals=100, sigma_range=6, use_specsim=False, scale_offset=Tru scale_offset (`bool`): flag to apply scale and offset to parameter bounds before calculating prior variance. Dfault is True. If you are using non-default scale and/or offset and you get an exception during draw, try changing this value to False. 
+ rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns: `pyemu.ParameterEnsemble`: a prior parameter ensemble @@ -700,7 +701,7 @@ def draw(self, num_reals=100, sigma_range=6, use_specsim=False, scale_offset=Tru pe = pyemu.helpers.draw_by_group(self.pst, num_reals=num_reals, sigma_range=sigma_range, use_specsim=use_specsim, scale_offset=scale_offset, struct_dict=struct_dict, delr=delr, delc=delc, - logger=self.logger) + logger=self.logger, rng=rng) return pe def build_pst(self, filename=None, update=False, version=1): From 26c8eff8ff4eeef71599879ced67597f08f107bf Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Tue, 10 Mar 2026 23:02:23 -0600 Subject: [PATCH 04/10] Fix up some broken references, fix rng assignment to some tests --- autotest/utils_tests.py | 4 ++-- pyemu/en.py | 3 ++- pyemu/utils/geostats.py | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index be0018aa7..f72d34226 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -1071,10 +1071,10 @@ def geostat_draws_test(tmp_path): gs = pyemu.geostats.read_struct_file(str_file) rng = np.random.RandomState(pyemu.en.SEED) pe = pyemu.helpers.geostatistical_draws(pst_file,{gs:df}, - sigma_range=4) + sigma_range=4,rng=rng) rng = np.random.RandomState(pyemu.en.SEED) pe2 = pyemu.helpers.geostatistical_draws(pst_file,{gs:df}, - sigma_range=4) + sigma_range=4,rng=rng) pe.to_csv(os.path.join(os.path.join("utils","geostat_pe.csv"))) diff = pe - pe2 diff --git a/pyemu/en.py b/pyemu/en.py index 50b950498..0951a8430 100644 --- a/pyemu/en.py +++ b/pyemu/en.py @@ -1184,7 +1184,8 @@ def from_triangular_draw(cls, pst, num_reals=100, fill=True, rng=None): pe.to_csv("my_tri_pe.csv") """ - + if rng is None: + rng = pyemu.en.rng li = pst.parameter_data.partrans == "log" ub = pst.parameter_data.parubnd.copy() ub.loc[li] = ub.loc[li].apply(np.log10) diff --git 
a/pyemu/utils/geostats.py b/pyemu/utils/geostats.py index 340e5fb14..4186b937e 100644 --- a/pyemu/utils/geostats.py +++ b/pyemu/utils/geostats.py @@ -14,7 +14,7 @@ from pyemu.mat.mat_handler import Cov from pyemu.utils.pp_utils import pp_file_to_dataframe from ..pyemu_warnings import PyemuWarning - +from pyemu import en EPSILON = 1.0e-7 # class KrigeFactors(pd.DataFrame): @@ -473,8 +473,8 @@ def draw_arrays(self, num_reals=1, mean_value=1.0, rng=None): for ireal in range(num_reals): if rng is None: - real = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) - imag = pyemu.en.rng.standard_normal(size=self.sqrt_fftc.shape) + real = en.rng.standard_normal(size=self.sqrt_fftc.shape) + imag = en.rng.standard_normal(size=self.sqrt_fftc.shape) else: real = rng.standard_normal(size=self.sqrt_fftc.shape) imag = rng.standard_normal(size=self.sqrt_fftc.shape) From 675ec2028766cb7b9f5035a65de3d47b02bc0a37 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Tue, 10 Mar 2026 23:35:43 -0600 Subject: [PATCH 05/10] I wish my local runners worked --- pyemu/en.py | 9 ++++++--- pyemu/utils/helpers.py | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pyemu/en.py b/pyemu/en.py index 0951a8430..360cc7be8 100644 --- a/pyemu/en.py +++ b/pyemu/en.py @@ -1255,7 +1255,8 @@ def from_uniform_draw(cls, pst, num_reals, fill=True, rng=None): """ - + if rng is None: + rng = pyemu.en.rng li = pst.parameter_data.partrans == "log" ub = pst.parameter_data.parubnd.copy() ub.loc[li] = ub.loc[li].apply(np.log10) @@ -1298,6 +1299,7 @@ def from_mixed_draws( enforce_bounds=True, partial=False, fill=True, + rng=None ): """generate a `ParameterEnsemble` using a mixture of distributions. Available distributions include (log) "uniform", (log) "triangular", @@ -1323,6 +1325,7 @@ def from_mixed_draws( Default is `False`. fill (`bool`): flag to fill in fixed and/or tied parameters with control file values. Default is True. 
+ rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Example:: @@ -1396,9 +1399,9 @@ def from_mixed_draws( ) else: - cov = pyemu.Cov.from_parameter_data(pst, sigma_range=sigma_range) + cov = pyemu.Cov.from_parameter_data(pst, sigma_range=sigma_range, rng=rng) pe_gauss = ParameterEnsemble.from_gaussian_draw( - pst, cov, num_reals=num_reals + pst, cov, num_reals=num_reals, rng=rng ) pes.append(pe_gauss) diff --git a/pyemu/utils/helpers.py b/pyemu/utils/helpers.py index b4b28d63b..ea8391626 100644 --- a/pyemu/utils/helpers.py +++ b/pyemu/utils/helpers.py @@ -354,6 +354,7 @@ def draw_by_group(pst, num_reals=100, sigma_range=6, use_specsim=False, num_reals=num_reals, sigma_range=sigma_range, logger=logger, + rng=rng ) # append to list of specsim drawn pars gr_pe_l.append(gr_pe1) @@ -381,7 +382,8 @@ def draw_by_group(pst, num_reals=100, sigma_range=6, use_specsim=False, num_reals=num_reals, sigma_range=sigma_range, scale_offset=scale_offset, - subset=subset + subset=subset, + rng=rng ) logger.log(f"Drawing {len(subset)} non-specsim pars") if gr_par_pe is not None: From 666999c1a7a579f6d5fbb5ff97b8db92dd928b62 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Tue, 10 Mar 2026 23:54:36 -0600 Subject: [PATCH 06/10] Add missing rng references --- pyemu/en.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pyemu/en.py b/pyemu/en.py index 360cc7be8..a0691a51e 100644 --- a/pyemu/en.py +++ b/pyemu/en.py @@ -1336,7 +1336,8 @@ def from_mixed_draws( pe.to_csv("my_mixed_pe.csv") """ - + if rng is None: + rng = pyemu.en.rng # error checking accept = {"uniform", "triangular", "gaussian"} assert ( @@ -1399,7 +1400,7 @@ def from_mixed_draws( ) else: - cov = pyemu.Cov.from_parameter_data(pst, sigma_range=sigma_range, rng=rng) + cov = pyemu.Cov.from_parameter_data(pst, sigma_range=sigma_range) pe_gauss = ParameterEnsemble.from_gaussian_draw( pst, cov, num_reals=num_reals, rng=rng ) @@ -1410,7 +1411,7 
@@ def from_mixed_draws( # par_uniform.sort_values(by="parnme",inplace=True) par_uniform.sort_index(inplace=True) pst.parameter_data = par_uniform - pe_uniform = ParameterEnsemble.from_uniform_draw(pst, num_reals=num_reals) + pe_uniform = ParameterEnsemble.from_uniform_draw(pst, num_reals=num_reals, rng=rng) pes.append(pe_uniform) if len(how_groups["triangular"]) > 0: @@ -1418,7 +1419,7 @@ def from_mixed_draws( # par_tri.sort_values(by="parnme", inplace=True) par_tri.sort_index(inplace=True) pst.parameter_data = par_tri - pe_tri = ParameterEnsemble.from_triangular_draw(pst, num_reals=num_reals) + pe_tri = ParameterEnsemble.from_triangular_draw(pst, num_reals=num_reals, rng=rng) pes.append(pe_tri) df = pd.DataFrame(index=np.arange(num_reals), columns=par_org.parnme.values) @@ -1519,7 +1520,7 @@ def from_parfiles(cls, pst, parfile_names, real_names=None): return ParameterEnsemble(pst=pst, df=df_all) - def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None): + def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None, rng=None): """Draw a new (potentially larger) ParameterEnsemble instance using the realizations in `self`. @@ -1533,6 +1534,7 @@ def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None): noise_reals (ParameterEnsemble): other existing realizations (likely prior realizations) that are used as noise realizations in place of IID noise that is used if `include_noise` is True and `noise_reals` is None. + rng (`numpy.random.RandomState`, optional): random number generator if not using default from pyemu.en Returns ParameterEnsemble @@ -1544,7 +1546,7 @@ def draw_new_ensemble(self,num_reals,include_noise=True,noise_reals=None): names = self.pst.adj_par_names return self._draw_new_ensemble(num_reals,names,include_noise=include_noise, - noise_reals=noise_reals) + noise_reals=noise_reals,rng=rng) def back_transform(self): """back transform parameters with respect to `partrans` value. 
From 032ca46dfe317f2045067c2b673f0107873e2ea6 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Wed, 11 Mar 2026 00:21:46 -0600 Subject: [PATCH 07/10] Add rng pass to specsim test --- autotest/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotest/utils_tests.py b/autotest/utils_tests.py index f72d34226..6c4e4c538 100644 --- a/autotest/utils_tests.py +++ b/autotest/utils_tests.py @@ -2288,7 +2288,7 @@ def specsim_test(): ss = pyemu.geostats.SpecSim2d(geostruct=gs, delx=delr, dely=delc) mean_value = 15.0 - reals = ss.draw_arrays(num_reals=num_reals, mean_value=mean_value) + reals = ss.draw_arrays(num_reals=num_reals, mean_value=mean_value, rng=rng) assert reals.shape == (num_reals, nrow, ncol),reals.shape reals = np.log10(reals) mean_value = np.log10(mean_value) From 70bbbeecdf036f6f89ee20ce254a004dbd227e5f Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Wed, 11 Mar 2026 08:51:15 -0600 Subject: [PATCH 08/10] Modify example notebooks, revert dsiae to using the old global seeds --- examples/MatrixCovariance_demo.ipynb | 29 +- examples/helpers.py | 2 +- examples/pstfrom_mf6.ipynb | 14 +- examples/pstfrom_mf6_ppu.ipynb | 504 ++++++++++-------- examples/pstfrom_relaxed.ipynb | 8 +- .../understanding_array_thresholding.ipynb | 8 +- pyemu/emulators/dsiae.py | 2 +- 7 files changed, 312 insertions(+), 255 deletions(-) diff --git a/examples/MatrixCovariance_demo.ipynb b/examples/MatrixCovariance_demo.ipynb index c9a92b5bc..eb7c30f33 100644 --- a/examples/MatrixCovariance_demo.ipynb +++ b/examples/MatrixCovariance_demo.ipynb @@ -12,14 +12,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pyemu'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpyemu\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Matrix, Cov\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpyemu\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m en\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pyemu'" + ] + } + ], "source": [ "from __future__ import print_function\n", "import os\n", "import numpy as np\n", - "from pyemu import Matrix, Cov" + "from pyemu import Matrix, Cov\n", + "from pyemu import en" ] }, { @@ -51,7 +64,7 @@ "metadata": {}, "outputs": [], "source": [ - "a = np.random.random((5, 5))\n", + "a = en.rng.random((5, 5))\n", "row_names = []\n", "[row_names.append(\"row_{0:02d}\".format(i)) for i in range(5)]\n", "col_names = []\n", @@ -209,7 +222,7 @@ "#a new matrix object that is not \"aligned\" with m\n", "row_names = [\"row_03\",\"row_02\",\"row_00\"]\n", "col_names = [\"col_01\",\"col_10\",\"col_100\"]\n", - "m_mix = Matrix(x=np.random.random((3,3)),row_names=row_names,col_names=col_names)\n", + "m_mix = Matrix(x=en.rng.random((3,3)),row_names=row_names,col_names=col_names)\n", "m_mix.to_dataframe()\n" ] }, @@ -349,7 +362,7 @@ "metadata": { "anaconda-cloud": {}, "kernelspec": { - "display_name": "Python 3", + "display_name": "crmass", "language": "python", "name": "python3" }, @@ -363,7 +376,7 @@ "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/examples/helpers.py b/examples/helpers.py index 2f23654b8..3cbf318a3 100644 --- a/examples/helpers.py +++ b/examples/helpers.py @@ -4,7 +4,7 @@ def process_model_outputs(): import numpy as np print("processing model outputs") - arr = np.random.random(100) + arr = pyemu.en.rng.random(100) np.savetxt("special_outputs.dat",arr) return arr diff --git a/examples/pstfrom_mf6.ipynb b/examples/pstfrom_mf6.ipynb index c2e21f67a..16a7a89b9 100644 --- a/examples/pstfrom_mf6.ipynb +++ b/examples/pstfrom_mf6.ipynb @@ -400,8 +400,8 @@ "ymx = m.modelgrid.yvertices.max()\n", "\n", "numpp = 20\n", - "xvals = np.random.uniform(xmn,xmx,numpp)\n", - "yvals = np.random.uniform(ymn, ymx, numpp)\n", + "xvals = pyemu.en.rng.random.uniform(xmn,xmx,numpp)\n", + "yvals = pyemu.en.rng.random.uniform(ymn, ymx, numpp)\n", "pp_locs = pd.DataFrame({\"x\":xvals,\"y\":yvals})\n", "pp_locs.loc[:,\"zone\"] = 1\n", "pp_locs.loc[:,\"name\"] = [\"pp_{0}\".format(i) for i in range(numpp)]\n", @@ -911,10 +911,10 @@ ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# load the mf6 model with flopy to get the spatial reference\n", "sim = flopy.mf6.MFSimulation.load(sim_ws=tmp_model_ws)\n", @@ -1092,11 +1092,11 @@ ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "" + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/pstfrom_mf6_ppu.ipynb b/examples/pstfrom_mf6_ppu.ipynb index d4fd4414b..ec1c5a3bc 100644 --- a/examples/pstfrom_mf6_ppu.ipynb +++ b/examples/pstfrom_mf6_ppu.ipynb @@ -1,16 +1,19 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", - "source": "# Setting up a PEST interface from MODFLOW6 using the `PstFrom` class with `PyPestUtils` for advanced pilot point 
parameterization", - "id": "597647f9253af23f" + "id": "597647f9253af23f", + "metadata": {}, + "source": [ + "# Setting up a PEST interface from MODFLOW6 using the `PstFrom` class with `PyPestUtils` for advanced pilot point parameterization" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "9dd3398b66b0b998", + "metadata": {}, + "outputs": [], "source": [ "import os\n", "import shutil\n", @@ -19,293 +22,315 @@ "import matplotlib.pyplot as plt\n", "import pyemu\n", "import flopy" - ], - "id": "9dd3398b66b0b998" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "b04bf3f138f02ffc", + "metadata": {}, + "outputs": [], "source": [ "import sys\n", "sys.path.append(os.path.join(\"..\",\"..\",\"pypestutils\"))" - ], - "id": "b04bf3f138f02ffc" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "import pypestutils as ppu", - "id": "53f26ecf6ecf8e74" + "id": "53f26ecf6ecf8e74", + "metadata": {}, + "outputs": [], + "source": [ + "import pypestutils as ppu" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "An existing MODFLOW6 model is in the directory `freyberg_mf6`. Lets check it out:", - "id": "98769e861ab3f840" + "id": "98769e861ab3f840", + "metadata": {}, + "source": [ + "An existing MODFLOW6 model is in the directory `freyberg_mf6`. Lets check it out:" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "c007d8b59aa1e92f", + "metadata": {}, + "outputs": [], "source": [ "org_model_ws = os.path.join('freyberg_mf6')\n", "os.listdir(org_model_ws)" - ], - "id": "c007d8b59aa1e92f" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "bfcb10c3e3999ab2", + "metadata": {}, "source": [ "You can see that all the input array and list data for this model have been written \"externally\" - this is key to using the `PstFrom` class. 
\n", "\n", "Let's quickly viz the model top just to remind us of what we are dealing with:" - ], - "id": "bfcb10c3e3999ab2" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "bc1bfe933fa33b1", + "metadata": {}, + "outputs": [], "source": [ "id_arr = np.loadtxt(os.path.join(org_model_ws,\"freyberg6.dis_idomain_layer3.txt\"))\n", "top_arr = np.loadtxt(os.path.join(org_model_ws,\"freyberg6.dis_top.txt\"))\n", "top_arr[id_arr==0] = np.nan\n", "plt.imshow(top_arr)" - ], - "id": "bc1bfe933fa33b1" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Now let's copy those files to a temporary location just to make sure we don't goof up those original files:", - "id": "2d0a09c8caa969c6" + "id": "2d0a09c8caa969c6", + "metadata": {}, + "source": [ + "Now let's copy those files to a temporary location just to make sure we don't goof up those original files:" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "a074d2977f031532", + "metadata": {}, + "outputs": [], "source": [ "tmp_model_ws = \"temp_pst_from_ppu\"\n", "if os.path.exists(tmp_model_ws):\n", " shutil.rmtree(tmp_model_ws)\n", "shutil.copytree(org_model_ws,tmp_model_ws)\n", "os.listdir(tmp_model_ws)" - ], - "id": "a074d2977f031532" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b7448da37fb0d0db", + "metadata": {}, "source": [ "Now we need just a tiny bit of info about the spatial discretization of the model - this is needed to work out separation distances between parameters for build a geostatistical prior covariance matrix later.\n", "\n", "Here we will load the flopy sim and model instance just to help us define some quantities later - flopy is not required to use the `PstFrom` class." 
- ], - "id": "b7448da37fb0d0db" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "6d5e3c75d4ccff87", + "metadata": {}, + "outputs": [], "source": [ "sim = flopy.mf6.MFSimulation.load(sim_ws=tmp_model_ws)\n", "m = sim.get_model(\"freyberg6\")\n" - ], - "id": "6d5e3c75d4ccff87" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Here we use the simple `SpatialReference` pyemu implements to help us spatially locate parameters", - "id": "791e3a32322ba060" + "id": "791e3a32322ba060", + "metadata": {}, + "source": [ + "Here we use the simple `SpatialReference` pyemu implements to help us spatially locate parameters" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "3348e2c5449e3065", + "metadata": {}, + "outputs": [], "source": [ "sr = pyemu.helpers.SpatialReference.from_namfile(\n", " os.path.join(tmp_model_ws, \"freyberg6.nam\"),\n", " delr=m.dis.delr.array, delc=m.dis.delc.array)\n", "sr" - ], - "id": "3348e2c5449e3065" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Now we can instantiate a `PstFrom` class instance", - "id": "51fd4fad812b4a45" + "id": "51fd4fad812b4a45", + "metadata": {}, + "source": [ + "Now we can instantiate a `PstFrom` class instance" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "38b193a142016006", + "metadata": {}, + "outputs": [], "source": [ "template_ws = \"freyberg6_template\"\n", "pf = pyemu.utils.PstFrom(original_d=tmp_model_ws, new_d=template_ws,\n", " remove_existing=True,\n", " longnames=True, spatial_reference=sr,\n", " zero_based=False,start_datetime=\"1-1-2018\")\n" - ], - "id": "38b193a142016006" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "4d34cbe82af7d566", + "metadata": {}, "source": [ "## Observations\n", "\n", "So now that we have a `PstFrom` instance, but its just an empty container at this point, so we need to add some PEST 
interface \"observations\" and \"parameters\". Let's start with observations using MODFLOW6 head. These are stored in `heads.csv`:" - ], - "id": "4d34cbe82af7d566" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "12f8b4718e9d6cbd", + "metadata": {}, + "outputs": [], "source": [ "df = pd.read_csv(os.path.join(tmp_model_ws,\"heads.csv\"),index_col=0)\n", "df" - ], - "id": "12f8b4718e9d6cbd" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "The main entry point for adding observations is (surprise) `PstFrom.add_observations()`. This method works on the list-type observation output file. We need to tell it what column is the index column (can be string if there is a header or int if no header) and then what columns contain quantities we want to monitor (e.g. \"observe\") in the control file - in this case we want to monitor all columns except the index column:", - "id": "90444d9d7ea6b006" + "id": "90444d9d7ea6b006", + "metadata": {}, + "source": [ + "The main entry point for adding observations is (surprise) `PstFrom.add_observations()`. This method works on the list-type observation output file. We need to tell it what column is the index column (can be string if there is a header or int if no header) and then what columns contain quantities we want to monitor (e.g. 
\"observe\") in the control file - in this case we want to monitor all columns except the index column:" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "139e2128db6100c", + "metadata": {}, + "outputs": [], "source": [ "hds_df = pf.add_observations(\"heads.csv\",insfile=\"heads.csv.ins\",index_cols=\"time\",\n", " use_cols=list(df.columns.values),prefix=\"hds\",)\n", "hds_df" - ], - "id": "139e2128db6100c" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "We can see that it returned a dataframe with lots of useful info: the observation names that were formed (`obsnme`), the values that were read from `heads.csv` (`obsval`) and also some generic weights and group names. At this point, no control file has been created, we have simply prepared to add this observations to the control file later. ", - "id": "4935231d1ffd7d8e" + "id": "4935231d1ffd7d8e", + "metadata": {}, + "source": [ + "We can see that it returned a dataframe with lots of useful info: the observation names that were formed (`obsnme`), the values that were read from `heads.csv` (`obsval`) and also some generic weights and group names. At this point, no control file has been created, we have simply prepared to add this observations to the control file later. " + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "[f for f in os.listdir(template_ws) if f.endswith(\".ins\")]", - "id": "4e4baaae9a812573" + "id": "4e4baaae9a812573", + "metadata": {}, + "outputs": [], + "source": [ + "[f for f in os.listdir(template_ws) if f.endswith(\".ins\")]" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "d56c174ee65114a7", + "metadata": {}, "source": [ "Nice! 
We also have a PEST-style instruction file for those obs.\n", "\n", "Now lets do the same for SFR observations:" - ], - "id": "d56c174ee65114a7" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "e72eb48997a89ccd", + "metadata": {}, + "outputs": [], "source": [ "df = pd.read_csv(os.path.join(tmp_model_ws, \"sfr.csv\"), index_col=0)\n", "sfr_df = pf.add_observations(\"sfr.csv\", insfile=\"sfr.csv.ins\", index_cols=\"time\", use_cols=list(df.columns.values))\n", "sfr_df" - ], - "id": "e72eb48997a89ccd" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b8c36e9db08b7a1a", + "metadata": {}, "source": [ "Sweet as! Now that we have some observations, let's add parameters!\n", "\n", "## Pilot points and `PyPestUtils`\n", "\n", "This notebook is mostly meant to demonstrate some advanced pilot point parameterization that is possible with `PyPestUtils`, so we will only focus on HK and VK pilot point parameters. This is just to keep the example short. In practice, please please please parameterize boundary conditions too!" 
- ], - "id": "b8c36e9db08b7a1a" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "e2fce4e4ae545c61", + "metadata": {}, + "outputs": [], "source": [ "v = pyemu.geostats.ExpVario(contribution=1.0,a=5000,bearing=0,anisotropy=5)\n", "pp_gs = pyemu.geostats.GeoStruct(variograms=v, transform='log')" - ], - "id": "e2fce4e4ae545c61" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "4cf91a232ef26e6e", + "metadata": {}, + "outputs": [], "source": [ "pp_gs.plot()\n", "print(\"spatial variogram\")" - ], - "id": "4cf91a232ef26e6e" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Now let's get the idomain array to use as a zone array - this keeps us from setting up parameters in inactive model cells:", - "id": "f7de7f611542a0d9" + "id": "f7de7f611542a0d9", + "metadata": {}, + "source": [ + "Now let's get the idomain array to use as a zone array - this keeps us from setting up parameters in inactive model cells:" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "ib = m.dis.idomain[0].array", - "id": "87d94be55152e1e0" + "id": "87d94be55152e1e0", + "metadata": {}, + "outputs": [], + "source": [ + "ib = m.dis.idomain[0].array" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Find HK files for the upper and lower model layers (assuming model layer 2 is a semi-confining unit)", - "id": "8cf040c4ce99d692" + "id": "8cf040c4ce99d692", + "metadata": {}, + "source": [ + "Find HK files for the upper and lower model layers (assuming model layer 2 is a semi-confining unit)" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "459ed4d1c3139dde", + "metadata": {}, + "outputs": [], "source": [ "hk_arr_files = [f for f in os.listdir(tmp_model_ws) if \"npf_k_\" in f and f.endswith(\".txt\") and \"layer2\" not in f]\n", "hk_arr_files" - ], - "id": "459ed4d1c3139dde" + ] }, 
{ - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "3292bf885ab36a90", + "metadata": {}, + "outputs": [], "source": [ "arr_file = \"freyberg6.npf_k_layer1.txt\"\n", "tag = arr_file.split('.')[1].replace(\"_\",\"-\")\n", @@ -317,57 +342,61 @@ "# so we can make easy plots later...\n", "pf.add_observations(arr_file,prefix=tag,\n", " obsgp=tag,zone_array=ib)" - ], - "id": "3292bf885ab36a90" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "If you are familiar with how `PstFrom` has worked historically, we handed off the process to solve for the factor file (which requires solving the kriging equations for each active node) to a pure python (well, with pandas and numpy). This was ok for toy models, but hella slow for big ugly models. If you look at the log entries above, you should see that the instead, `PstFrom` successfully handed off the solve to `PyPestUtils`, which is exponentially faster for big models. sweet ez! ", - "id": "4a892258c71c90ce" + "id": "4a892258c71c90ce", + "metadata": {}, + "source": [ + "If you are familiar with how `PstFrom` has worked historically, we handed off the process to solve for the factor file (which requires solving the kriging equations for each active node) to a pure python (well, with pandas and numpy). This was ok for toy models, but hella slow for big ugly models. If you look at the log entries above, you should see that the instead, `PstFrom` successfully handed off the solve to `PyPestUtils`, which is exponentially faster for big models. sweet ez! 
" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "d62cca5a36767595", + "metadata": {}, + "outputs": [], "source": [ "tpl_files = [f for f in os.listdir(template_ws) if f.endswith(\".tpl\")]\n", "tpl_files" - ], - "id": "d62cca5a36767595" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "b8f9f52c986dbe6", + "metadata": {}, + "outputs": [], "source": [ "with open(os.path.join(template_ws,tpl_files[0]),'r') as f:\n", " for _ in range(2):\n", " print(f.readline().strip())\n", " " - ], - "id": "b8f9f52c986dbe6" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "b2491117265b8653", + "metadata": {}, "source": [ "\n", "So those might look like pretty redic parameter names, but they contain heaps of metadata to help you post process things later..." - ], - "id": "b2491117265b8653" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "So those are you standard pilot points for HK in layer 1 - same as it ever was...", - "id": "4740988747c39978" + "id": "4740988747c39978", + "metadata": {}, + "source": [ + "So those are you standard pilot points for HK in layer 1 - same as it ever was..." + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "572d96a02212dfa6", + "metadata": {}, "source": [ "## Geostatistical hyper-parameters\n", "\n", @@ -376,27 +405,27 @@ "In `PyPestUtils`, we can supply the pilot-point-to-grid interpolation process with arrays of hyper-parameter values, one array for each variogram property. The result of this hyper parameter mess is referred to as a non-stationary spatial parameterization. 
buckle up...\n", "\n", "First let's define some additional geostatistical structures:" - ], - "id": "572d96a02212dfa6" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "343f86f826327d3f", + "metadata": {}, + "outputs": [], "source": [ "value_v = pyemu.geostats.ExpVario(contribution=1, a=5000, anisotropy=5, bearing=0.0)\n", "value_gs = pyemu.geostats.GeoStruct(variograms=value_v)\n", "bearing_v = pyemu.geostats.ExpVario(contribution=1,a=10000,anisotropy=5,bearing=0.0)\n", "bearing_gs = pyemu.geostats.GeoStruct(variograms=bearing_v)" - ], - "id": "343f86f826327d3f" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "be859202f72172fd", + "metadata": {}, + "outputs": [], "source": [ "arr_file = \"freyberg6.npf_k_layer3.txt\"\n", "tag = arr_file.split('.')[1].replace(\"_\",\"-\")\n", @@ -407,37 +436,37 @@ " apply_order=2)\n", "pf.add_observations(arr_file,prefix=tag,\n", " obsgp=tag,zone_array=ib)" - ], - "id": "be859202f72172fd" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "cbfcf6fe35fc9fa0", + "metadata": {}, + "outputs": [], "source": [ "hyperpar_files = [f for f in os.listdir(pf.new_d) if tag in f]\n", "hyperpar_files" - ], - "id": "cbfcf6fe35fc9fa0" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "4b16a43536e4689a", + "metadata": {}, "source": [ "when we supplied the \"prep_hyperpars\" as `True` above, that triggered `PstFrom` to do something different. Instead of solving for the pilot point kriging factors as before, now, we have array-based files for the geostatistical hyper parameters, as well as some additional quantities we need to \"apply\" these hyper parameter at runtime. This is a key difference: When the pilot point variogram is changing for each model run, we need to re-solve for the kriging factors for each model run...\n", "\n", "We snuck in something else too - see that `apply_order` argument? 
That is how we can control what order of files being processed by the run-time multiplier parameter function. Since we are going to parameterize the hyper parameters and there is an implicit order between these hyper parameters and the underlying pilot points, we need to make sure the hyper parameters are processed first. \n", "\n", "Lets setup some hyper parameters for estimation. We will use a constant for the anisotropy ratio, but use pilot points for the bearing:" - ], - "id": "4b16a43536e4689a" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "c72bac997d4c0a41", + "metadata": {}, + "outputs": [], "source": [ "afile = 'npf-k-layer3.aniso.dat'\n", "tag = afile.split('.')[0].replace(\"_\",\"-\")+\"-aniso\"\n", @@ -454,18 +483,20 @@ " pp_options={\"try_use_ppu\":True},\n", " apply_order=1,geostruct=bearing_gs)\n", "pf.add_observations(bfile, prefix=tag, obsgp=tag) " - ], - "id": "c72bac997d4c0a41" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Notice that the `apply_order` for these hyper pars is 1 so that any processing for these quantities happens before the actual underlying pilot points are processed", - "id": "17a6f9a495cc9976" + "id": "17a6f9a495cc9976", + "metadata": {}, + "source": [ + "Notice that the `apply_order` for these hyper pars is 1 so that any processing for these quantities happens before the actual underlying pilot points are processed" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "649aa96d83db3844", + "metadata": {}, "source": [ "## \"These go to 11\" - amp'ing things up with categorization\n", "\n", @@ -476,14 +507,14 @@ "lets setup non-stationary categorical parameterization for the VK of layer 2 (the semi confining unit). We can conceptualize this as a semi-confining unit that has \"windows\" in it that connects the two aquifers. Where there is not a window, the Vk will be very low, where there is a window, the VK will be much higher. 
Let's also assume the windows in the confining unit where created when a stream eroded thru it, so the shape of these windows will be higher-order (not derived from a standard geostatistical 2-point process), but rather from connected features.\n", "\n", "In what follows, we setup this complex parameterization. We also add lots of aux observations to lets plot and viz the steps in this parameterization process." - ], - "id": "649aa96d83db3844" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "42067ca6bd431a2", + "metadata": {}, + "outputs": [], "source": [ "arr_file = \"freyberg6.npf_k33_layer2.txt\"\n", "print(arr_file)\n", @@ -564,54 +595,56 @@ "df = pd.read_csv(threshcsv.replace(\".csv\",\"_results.csv\"),index_col=0)\n", "pf.add_observations(os.path.split(threshcsv)[1].replace(\".csv\",\"_results.csv\"),index_cols=\"threshcat\",use_cols=df.columns.tolist(),prefix=prefix+\"-results_k:{0}\".format(k),\n", " obsgp=prefix+\"-results_k:{0}\".format(k),ofile_sep=\",\")\n" - ], - "id": "42067ca6bd431a2" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "324fd56e3a4a8d8d", + "metadata": {}, "source": [ "### build the control file, pest interface files, and forward run script\n", "At this point, we have some parameters and some observations, so we can create a control file:" - ], - "id": "324fd56e3a4a8d8d" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "e8390ebb7f31243e", + "metadata": {}, + "outputs": [], "source": [ "pf.mod_sys_cmds.append(\"mf6\")\n", "pf.pre_py_cmds.insert(0,\"import sys\")\n", "pf.pre_py_cmds.insert(1,\"sys.path.append(os.path.join('..','..','..','pypestutils'))\")\n", "pst = pf.build_pst()" - ], - "id": "e8390ebb7f31243e" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "_ = [print(line.rstrip()) for line in open(os.path.join(template_ws,\"forward_run.py\"))]", - "id": "27a01a3a59425112" 
+ "id": "27a01a3a59425112", + "metadata": {}, + "outputs": [], + "source": [ + "_ = [print(line.rstrip()) for line in open(os.path.join(template_ws,\"forward_run.py\"))]" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "15c49fe430cd1d0a", + "metadata": {}, "source": [ "## Setting initial parameter bounds and values\n", "\n", "Here is some gory detail regarding defining the hyper parameters for both layer 3 HK and layer 2 VK..." - ], - "id": "15c49fe430cd1d0a" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "7e131502d074f897", + "metadata": {}, + "outputs": [], "source": [ "#set the initial and bounds for the fill values\n", "par = pst.parameter_data\n", @@ -674,108 +707,114 @@ "par.loc[cat2par, \"parubnd\"] = 1\n", "par.loc[cat2par, \"parlbnd\"] = 1\n", "par.loc[cat2par,\"partrans\"] = \"fixed\"\n" - ], - "id": "7e131502d074f897" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "# Generating a prior parameter ensemble, then run and viz a real", - "id": "c2154e42f31ec8dd" + "id": "c2154e42f31ec8dd", + "metadata": {}, + "source": [ + "# Generating a prior parameter ensemble, then run and viz a real" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "f09a17cf6493216f", + "metadata": {}, + "outputs": [], "source": [ - "np.random.seed(122341)\n", - "pe = pf.draw(num_reals=100)" - ], - "id": "f09a17cf6493216f" + "rng = np.random.RandomState(122341)\n", + "pe = pf.draw(num_reals=100, rng=rng)" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "pe.to_csv(os.path.join(template_ws,\"prior.csv\"))", - "id": "89c5c4280a568acc" + "id": "89c5c4280a568acc", + "metadata": {}, + "outputs": [], + "source": [ + "pe.to_csv(os.path.join(template_ws,\"prior.csv\"))" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "a5bf620e94c93da4", + "metadata": {}, + "outputs": 
[], "source": [ "real = 0\n", "pst_name = \"real_{0}.pst\".format(real)\n", "pst.parameter_data.loc[pst.adj_par_names,\"parval1\"] = pe.loc[real,pst.adj_par_names].values" - ], - "id": "a5bf620e94c93da4" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "60804fce3b5ab3a6", + "metadata": {}, + "outputs": [], "source": [ "pst.control_data.noptmax = 0\n", "pst.write(os.path.join(pf.new_d,pst_name))" - ], - "id": "60804fce3b5ab3a6" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "pyemu.os_utils.run(\"pestpp-ies {0}\".format(pst_name),cwd=pf.new_d)", - "id": "b0724f47a40afc38" + "id": "b0724f47a40afc38", + "metadata": {}, + "outputs": [], + "source": [ + "pyemu.os_utils.run(\"pestpp-ies {0}\".format(pst_name),cwd=pf.new_d)" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "e7d925c78449818d", + "metadata": {}, + "outputs": [], "source": [ "pst.set_res(os.path.join(pf.new_d,pst_name.replace(\".pst\",\".base.rei\")))\n", "res = pst.res\n", "obs = pst.observation_data\n", "grps = [o for o in obs.obgnme.unique() if o.startswith(\"npf\") and \"result\" not in o and \"aniso\" not in o]\n", "grps" - ], - "id": "e7d925c78449818d" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "1bffb643bfe5b4b2", + "metadata": {}, + "outputs": [], "source": [ "gobs = obs.loc[obs.obgnme.isin(grps),:].copy()\n", "gobs[\"i\"] = gobs.i.astype(int)\n", "gobs[\"j\"] = gobs.j.astype(int)\n", "gobs[\"k\"] = gobs.obgnme.apply(lambda x: int(x.split('-')[2].replace(\"layer\",\"\")) - 1)" - ], - "id": "1bffb643bfe5b4b2" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "id": "6107397c97e67caf", + "metadata": {}, + "outputs": [], "source": [ "uk = gobs.k.unique()\n", "uk.sort()" - ], - "id": "6107397c97e67caf" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": 
[], "execution_count": null, + "id": "55c643ac16ae39b2", + "metadata": {}, + "outputs": [], "source": [ "for k in uk:\n", " kobs = gobs.loc[gobs.k==k,:]\n", @@ -804,25 +843,30 @@ " plt.tight_layout()\n", " plt.show()\n", " plt.close(fig)" - ], - "id": "55c643ac16ae39b2" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "Stunning isn't it?! There is clearly a lot subjectivity in the form of defining the prior for the hyper parameters required to use these non-stationary geostats, but they do afford more opportunities to express (stochastic) expert knowledge. To be honest, there was a lot of experimenting with this notebook to get these figures to look this way - playing with variograms and parameter initial values and bounds a lot. You encouraged to do the same! scroll back up, change things, and \"restart kernel and run all\" - this will help build some better intution, promise....", - "id": "f61492b41877971d" + "id": "f61492b41877971d", + "metadata": {}, + "source": [ + "Stunning isn't it?! There is clearly a lot of subjectivity in the form of defining the prior for the hyper parameters required to use these non-stationary geostats, but they do afford more opportunities to express (stochastic) expert knowledge. To be honest, there was a lot of experimenting with this notebook to get these figures to look this way - playing with variograms and parameter initial values and bounds a lot. You are encouraged to do the same! Scroll back up, change things, and \"restart kernel and run all\" - this will help build some better intuition, promise...." 
+ ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "", - "id": "f7133475a2328689" + "id": "f7133475a2328689", + "metadata": {}, + "outputs": [], + "source": [] } ], - "metadata": {}, + "metadata": { + "language_info": { + "name": "python" + } + }, "nbformat": 4, "nbformat_minor": 5 } diff --git a/examples/pstfrom_relaxed.ipynb b/examples/pstfrom_relaxed.ipynb index efef8bf67..61d7a5eee 100644 --- a/examples/pstfrom_relaxed.ipynb +++ b/examples/pstfrom_relaxed.ipynb @@ -451,8 +451,8 @@ "metadata": {}, "outputs": [], "source": [ - "np.random.seed(122341)\n", - "pe = pf.draw(num_reals=100)" + "rng = np.random.RandomState(122341)\n", + "pe = pf.draw(num_reals=100, rng=rng)" ] }, { @@ -535,8 +535,8 @@ "metadata": {}, "outputs": [], "source": [ - "np.random.seed(122341)\n", - "pe_dialated = pf.draw(num_reals=100)" + "rng = np.random.RandomState(122341)\n", + "pe_dialated = pf.draw(num_reals=100, rng=rng)" ] }, { diff --git a/examples/understanding_array_thresholding.ipynb b/examples/understanding_array_thresholding.ipynb index 34d5baa98..535aa3d8e 100644 --- a/examples/understanding_array_thresholding.ipynb +++ b/examples/understanding_array_thresholding.ipynb @@ -46,8 +46,8 @@ "v = pyemu.geostats.ExpVario(contribution=1.0,a=500)\n", "gs = pyemu.geostats.GeoStruct(variograms=v)\n", "ss = pyemu.geostats.SpecSim2d(delx=delx,dely=dely,geostruct=gs)\n", - "np.random.seed(122341)\n", - "org_arr = ss.draw_arrays(1,mean_value=10)[0,:,:]\n", + "rng = np.random.RandomState(122341)\n", + "org_arr = ss.draw_arrays(1,mean_value=10, rng=rng)[0,:,:]\n", "assert org_arr.min() > 0.0\n", "cb = plt.imshow(org_arr)\n", "_ = plt.colorbar(cb)" @@ -352,7 +352,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "pyemu", "language": "python", "name": "python3" }, @@ -366,7 +366,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": 
"3.10.14" } }, "nbformat": 4, diff --git a/pyemu/emulators/dsiae.py b/pyemu/emulators/dsiae.py index 5cf255ef5..98671de5d 100755 --- a/pyemu/emulators/dsiae.py +++ b/pyemu/emulators/dsiae.py @@ -861,7 +861,7 @@ def __init__(self, input_dim: int, latent_dim: int = 2, self.random_state = random_state tf.random.set_seed(random_state) - pyemu.en.rng = np.random.RandomState(random_state) + np.random.seed(random_state) self._build_model() # Build encoder/decoder From 63b6f253b61283b216f72e0c3c48004c3b4badd5 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Wed, 11 Mar 2026 09:19:07 -0600 Subject: [PATCH 09/10] Remove unneeded import now that dsiae is reverted --- pyemu/emulators/dsiae.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyemu/emulators/dsiae.py b/pyemu/emulators/dsiae.py index 98671de5d..f3c1c1c09 100755 --- a/pyemu/emulators/dsiae.py +++ b/pyemu/emulators/dsiae.py @@ -2,7 +2,6 @@ Data Space Inversion (DSI) Autoencoder (AE) emulator implementation. """ from __future__ import print_function, division -import pyemu from typing import Optional, List, Dict, Any, Union import numpy as np import pandas as pd From 6059a0060963f006d50f474783c6ae2707b0fc07 Mon Sep 17 00:00:00 2001 From: Michael Morphew Date: Wed, 11 Mar 2026 09:56:45 -0600 Subject: [PATCH 10/10] Correct reference to uniform in pstfrom_mf6 example notebook --- examples/pstfrom_mf6.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pstfrom_mf6.ipynb b/examples/pstfrom_mf6.ipynb index 16a7a89b9..b5cee3bbe 100644 --- a/examples/pstfrom_mf6.ipynb +++ b/examples/pstfrom_mf6.ipynb @@ -400,8 +400,8 @@ "ymx = m.modelgrid.yvertices.max()\n", "\n", "numpp = 20\n", - "xvals = pyemu.en.rng.random.uniform(xmn,xmx,numpp)\n", - "yvals = pyemu.en.rng.random.uniform(ymn, ymx, numpp)\n", + "xvals = pyemu.en.rng.uniform(xmn,xmx,numpp)\n", + "yvals = pyemu.en.rng.uniform(ymn, ymx, numpp)\n", "pp_locs = pd.DataFrame({\"x\":xvals,\"y\":yvals})\n", "pp_locs.loc[:,\"zone\"] = 1\n", 
"pp_locs.loc[:,\"name\"] = [\"pp_{0}\".format(i) for i in range(numpp)]\n",