Merge pull request #12 from brandynlucca:updated_biodata_loading

brandynlucca · web-flow · commit da78a47e8d40 · 2025-04-29T14:33:32.000-07:00
Updated_biodata_loading
diff --git a/config_files/survey_year_2011_single_biodata_config.yml b/config_files/survey_year_2011_single_biodata_config.yml
@@ -0,0 +1,82 @@
+# This YAML file is a configuration file specifying
+# input filenames & some process parameter settings.
+# Relative file paths defined below are concatenated
+# with the data_root_dir path also set below.
+
+---
+##############################################################################
+# Parameters
+
+survey_year: 2011            # survey year being considered
+
+ship_id:
+  160:
+    survey: 201103
+    name: NOAA Ship Bell M Shimada
+    description: NOAA Fisheries Survey Vessel
+  499:
+    survey: 201103
+    name: W.E. Ricker
+    description: Canadian Coast Guard
+    haul_offset: 100
+species:
+  text_code: pacific_hake    # target species for the survey year -- species name
+  number_code: 22500         # target species for the survey year -- numeric code
+CAN_haul_offset: 100         # Offset added to Canadian haul numbers
+
+##############################################################################
+# Report generation
+###################
+# Where the reports are saved
+report_path: C:/Users/Brandyn/Documents/GitHub/echopop_2011/reports
+
+##############################################################################
+# Directory path that contains all input data needed
+data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/echopop_2011
+
+##############################################################################
+# Input data files
+
+biological:
+  filename: Biological/1995-2023_biodata_redo.xlsx
+  sheetname:
+    catch: biodata_catch
+    length: biodata_length
+    specimen: biodata_specimen
+stratification:
+  strata:
+    # The two stratification types are found in two sheets: "stratification #0 (INPFC)" and "stratification by haul #1"
+    filename: Stratification/US&CAN strata 2011.xlsx
+    sheetname: ["stratification #0 (INPFC)", "stratification by haul #1"]
+  geo_strata:
+    # The two stratification types are found in two sheets: "stratification #0 (INPFC)" and "stratification #1"
+    filename: Stratification/Stratification_geographic_Lat_rev.xlsx
+    sheetname: ["stratification #0 (INPFC)", "stratification #1"]
+NASC:
+  # NASC values
+  no_age1:
+    # file that excludes age1 values
+    filename: Exports/US_CAN_NASC_2011_table_no_age1.xlsx
+    sheetname: Sheet1
+  all_ages:
+    # file that includes all ages
+    filename: Exports/US_CAN_NASC_2011_table_all_ages.xlsx
+    sheetname: Sheet1
+transect_filter:
+  # Transect interval filtering
+  filename: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/echopop_2011/Kriging_files/Kriging_grid_files/Transect Bounds to 2011.xlsx
+  sheetname: "1995-2011"
+export_regions:
+  filename: Stratification/US&CAN_T_reg_haul_final.csv
+kriging:
+  mesh:
+    filename: Kriging_files/Kriging_grid_files/krig_grid2_5nm_cut_centroids_2013.xlsx
+    sheetname: krigedgrid2_5nm_forChu
+  isobath_200m:
+    filename: Kriging_files/Kriging_grid_files/transformation_isobath_coordinates.xlsx
+    sheetname: Smoothing_EasyKrig
+  vario_krig_para:
+    # NOTE: This file is not currently used
+    filename: Kriging_files/default_vario_krig_settings_final.xlsx
+    sheetname: Sheet1
+...
diff --git a/config_files/survey_year_2015_config.yml b/config_files/survey_year_2015_config.yml
@@ -36,7 +36,7 @@ biological:
       sheetname: biodata_length_CAN
   specimen:
     US:
-      filename: Biological/aged shoreside US/2015_biodata_specimen.xlsx
+      filename: Biological/aged shoreside US/2015_biodata_specimen_bml.xlsx
       sheetname: 2015_biodata_specimen
     CAN:
       filename: Biological/aged shoreside CAN/2015_biodata_specimen_CAN.xlsx
diff --git a/config_files/survey_year_2019_single_biodata_config.yml b/config_files/survey_year_2019_single_biodata_config.yml
@@ -32,8 +32,8 @@ report_path: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/Data/reports
 ##############################################################################
 # Directory path that contains all input data needed
 
-data_root_dir: C:/Users/Brandyn Lucca/Documents/Data/echopop_2019
-# data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/Data/
+# data_root_dir: C:/Users/Brandyn Lucca/Documents/Data/echopop_2019
+data_root_dir: C:/Users/Brandyn/Documents/GitHub/EchoPro_data/Data/
 
 ##############################################################################
 # Input data files
@@ -48,7 +48,7 @@ stratification:
   strata:
     # The two stratification types are found in two sheets: "Base KS" and "INPFC"
     filename: Stratification/US_CAN strata 2019_final.xlsx
-    sheetname: Base KS
+    sheetname: [INPFC, Base KS]
   geo_strata:
     # The two stratification types are found in two sheets: "stratification1" and "INPFC"
     filename: Stratification/Stratification_geographic_Lat_2019_final.xlsx
diff --git a/echopop/test_survey.py b/echopop/test_survey.py
@@ -18,7 +18,25 @@
 import json
 import os
 import sys
+import copy
+from pathlib import Path
+from typing import List, Literal, Optional, Union
+
+import numpy as np
+import pandas as pd
+import yaml
+
+from echopop.core import BIODATA_HAUL_MAP, DATA_STRUCTURE, LAYER_NAME_MAP, NAME_CONFIG
+from echopop.utils.data_structure_utils import map_imported_datasets
+from echopop.utils.validate_df import DATASET_DF_MODEL
+from echopop.utils.validate_dict import CONFIG_DATA_MODEL, CONFIG_INIT_MODEL
+import copy
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Union
 
+import numpy as np
+from IPython.display import display
+import echopop.utils.load
 ####################################################################################################
 # CURRENT SURVEY YEAR BEING TESTED: 2019
 ####################################################################################################
@@ -31,7 +49,7 @@
 
 # Filepath/dataset configuration
 survey_year_config_path = f"C:/Users/Brandyn/Documents/GitHub/echopop/config_files\
-/survey_year_{SURVEY_YEAR}_config.yml"
+/survey_year_{SURVEY_YEAR}_single_biodata_config.yml"
 
 # Load json settings
 # ---- File open
diff --git a/echopop/utils/load.py b/echopop/utils/load.py
@@ -305,9 +305,13 @@ def read_validated_data(
     # Step 2: Determine whether the dataframe already exists
     if sub_attribute in ["biology", "statistics", "spatial"]:
         # ---- A single dataframe per entry is expected, so no other fancy operations are needed
-        if sheet_name.lower() == "inpfc":
-            df_list = [input_dict[sub_attribute]["inpfc_strata_df"], df]
-            input_dict[sub_attribute]["inpfc_strata_df"] = pd.concat(df_list)
+        if "inpfc" in sheet_name.lower():
+            # ---- Create the full key name
+            keyname = "inpfc_" + config_map[-1] + "_df"
+            # ---- Create DataFrame list
+            df_list = [input_dict[sub_attribute][keyname], df]
+            # ---- Concatenate/update
+            input_dict[sub_attribute][keyname] = pd.concat(df_list, ignore_index=True)
         else:
             if config_map[0] == "kriging" and config_map[1] == "vario_krig_para":
                 df_list = [input_dict[sub_attribute]["kriging"][config_map[1] + "_df"], df]
@@ -356,7 +360,6 @@ def read_validated_data(
             "the configuration YAML and core.py."
         )
 
-
 def write_haul_to_transect_key(configuration_dict: dict, verbose: bool):
     """
     Function for writing the haul-transect mapping key .xlsx file.
diff --git a/echopop/utils/load_nasc.py b/echopop/utils/load_nasc.py
@@ -10,7 +10,7 @@
 
 from ..core import ECHOVIEW_TO_ECHOPOP_NAMES, NAME_CONFIG, REGION_EXPORT_MAP
 from ..spatial.transect import export_transect_layers, export_transect_spacing
-from ..utils.validate_df import ECHOVIEW_DF_MODEL, KSStrata
+from .validate_df import ECHOVIEW_DF_MODEL, KSStrata
 from .operations import compile_patterns, extract_parts_and_labels, group_merge
 
 
diff --git a/echopop/utils/validate_dict.py b/echopop/utils/validate_dict.py
@@ -43,6 +43,24 @@ def create(cls, **kwargs):
 
         return cls.judge(**kwargs).model_dump(exclude_none=True)
 
+class CSVFile(InputModel, title="*.csv file tree"):
+    """
+    .csv file tree structure
+
+    Parameters
+    ----------
+    filename: str
+        Filename (as a string) with a *.csv file extension.
+    """
+
+    filename: str
+
+    @field_validator("filename", mode="before")
+    def validate_file_extension(cls, v):
+        # ---- Runs before type coercion: reject non-strings and non-'.csv' names (any case)
+        if not isinstance(v, str) or not v.lower().endswith(".csv"):
+            raise ValueError(f"The file '{v}' must be a '.csv'.")
+        return v
 
 class XLSXFile(InputModel, title="*.xlsx file tree"):
     """
@@ -59,6 +77,13 @@ class XLSXFile(InputModel, title="*.xlsx file tree"):
     filename: str
     sheetname: Union[str, List[str], Dict[str, str]]
 
+    @field_validator("filename", mode="before")
+    def validate_file_extension(cls, v):
+        # ---- Runs before type coercion: reject non-strings and non-'.xlsx' names (any case)
+        if not isinstance(v, str) or not v.lower().endswith(".xlsx"):
+            raise ValueError(f"The file '{v}' must be a '.xlsx'.")
+        # ---- Return value
+        return v
 
 class FileSettings(InputModel, title="parameter file settings"):
     """
@@ -74,8 +99,6 @@ class FileSettings(InputModel, title="parameter file settings"):
 
     directory: str
     sheetname: str
-
-
 class StratifiedSurveyMeanParameters(
     InputModel, title="stratified survey parameters", arbitrary_types_allowed=True
 ):
@@ -290,7 +313,6 @@ def validate_save_file_template(cls, v):
         # ---- Return value
         return v
 
-
 class TSLRegressionParameters(InputModel, title="TS-length regression parameters"):
     """
     Target strength - length regression parameters
@@ -493,7 +515,6 @@ class BiologicalFile(InputModel, title="consolidated biological file input"):
     filename: str
     sheetname: BiologicalSheets
 
-
 class BiologicalFiles(InputModel, title="biological file inputs"):
     """
     Biological data files
@@ -576,7 +597,9 @@ class CONFIG_DATA_MODEL(InputModel):
     data_root_dir: Optional[str] = None
     CAN_haul_offset: Optional[int] = None
     ship_id: Optional[Union[int, str, float, Dict[Any, Any]]] = None
-    export_regions: Optional[Dict[str, XLSXFile]] = None
+    transect_filter: Optional[XLSXFile] = None
+    export_regions: Optional[Union[Union[CSVFile, XLSXFile], 
+                                    Dict[str, Union[CSVFile, XLSXFile]]]] = None
 
     def __init__(self, filename, **kwargs):
         try: