Merged · Changes from 2 commits
39 changes: 35 additions & 4 deletions echopop/core.py
@@ -72,6 +72,37 @@
},
}

# Name configuration dictionary
NAME_CONFIG = {
"Age": "age",
"Cell portion": "fraction_cell_in_polygon",
"Frequency": "length_count",
"haul": "haul_num",
"haul end": "haul_end",
"haul start": "haul_start",
"Haul": "haul_num",
"Latitude": "latitude",
"Latitude (upper limit)": "northlimit_latitude",
"Latitude of centroid": "centroid_latitude",
"Length": "length",
"Longitude": "longitude",
"Longitude of centroid": "centroid_longitude",
"strata": "stratum_num",
"Sex": "sex",
"Ship": "ship_id",
"Spacing": "transect_spacing",
"Species_Code": "species_id",
"Species_Name": "species_name",
"Strata Index": "stratum_num",
"Stratum": "stratum_num",
"Transect": "transect_num",
"VL start": "vessel_log_start",
"VL end": "vessel_log_end",
"wt": "fraction_hake",
"Weight": "weight",
"Weight_In-Haul": "haul_weight",
}

# `Survey` object data structure
CONFIG_MAP = {
"biological": {
@@ -115,14 +146,14 @@
"geo_strata": {
"stratum_num": int,
"northlimit_latitude": np.float64,
"haul start": int,
"haul end": int,
"haul_start": int,
"haul_end": int,
},
"inpfc_strata": {
"stratum_num": int,
"northlimit_latitude": np.float64,
"haul start": int,
"haul end": int,
"haul_start": int,
"haul_end": int,
},
},
"NASC": {
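As a rough illustration of how the new `NAME_CONFIG` mapping is meant to be consumed, the sketch below renames legacy Excel headers to the standardized snake_case names and then keeps only the columns a given dtype specification expects. Only `NAME_CONFIG` itself comes from this diff; the toy DataFrame and the `expected_dtypes` mapping are made-up placeholders standing in for one `CONFIG_MAP` entry.

```python
import pandas as pd

# Added to echopop/core.py in this change
from echopop.core import NAME_CONFIG

# Hypothetical raw sheet with legacy headers (illustration only)
raw = pd.DataFrame(
    {"Haul": [1, 2], "Transect": [10, 10], "Weight": [3.2, 4.1], "Ship": [584, 584]}
)

# Hypothetical dtype spec, standing in for one CONFIG_MAP entry
expected_dtypes = {"haul_num": int, "transect_num": int, "weight": float}

# Rename legacy headers, then keep only the configured columns
df = raw.rename(columns=NAME_CONFIG).filter(expected_dtypes)

# Mirror the new in-read validation: fail loudly if anything is still missing
missing = set(expected_dtypes) - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in the Excel file: {missing}")

# Coerce dtypes column by column, as read_validated_data does
df = df.apply(lambda col: col.astype(expected_dtypes.get(col.name, type(col.iloc[0]))))
print(df.dtypes)
```

With this mapping in place, "Haul", "Transect", and "Weight" become `haul_num`, `transect_num`, and `weight`, while the unconfigured `ship_id` column is dropped by the filter step.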
87 changes: 19 additions & 68 deletions echopop/utils/load.py
@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
import yaml
from openpyxl import load_workbook

from ..core import (
BIODATA_HAUL_MAP,
@@ -14,6 +13,7 @@
CONFIG_MAP,
DATA_STRUCTURE,
LAYER_NAME_MAP,
NAME_CONFIG,
)
from .data_structure_utils import map_imported_datasets

@@ -199,9 +199,6 @@ def load_dataset(
else:
config_map[2] = region_id

# Validate column names of this iterated file
validate_data_columns(file_name, sheet_name, config_map, validation_settings)

# Validate datatypes within dataset and make appropriate changes to dtypes
# ---- This first enforces the correct dtype for each imported column
# ---- This then assigns the imported data to the correct class attribute
@@ -237,12 +234,6 @@
# Update configuration key map
config_map = [dataset, datalayer]

# Validate datatypes within dataset and make appropriate changes to dtypes
# (if necessary)
# ---- This first enforces the correct dtype for each imported column
# ---- This then assigns the imported data to the correct class attribute
validate_data_columns(file_name, sheets, config_map, validation_settings)

# Read in data and add to `Survey` object
read_validated_data(
input_dict,
@@ -296,11 +287,14 @@ def read_validated_data(
df_initial = df_initial.drop(0)

# Slice only the columns that are relevant to the echopop module functionality
valid_columns = list(set(validation_settings.keys()).intersection(set(df_initial.columns)))
df_filtered = df_initial[valid_columns]
df_filtered = df_initial.filter(validation_settings)

# Ensure the order of columns in df_filtered matches df_initial
df_filtered = df_filtered[df_initial.columns]
# Error evaluation and print message (if applicable)
if not set(validation_settings).issubset(set(df_filtered)):
missing_columns = set(validation_settings.keys()) - set(df_filtered)
raise ValueError(
f"Missing kriging/variogram parameters in the Excel file: {missing_columns}"
)

# Apply data types from validation_settings to the filtered DataFrame
df = df_filtered.apply(
@@ -311,7 +305,15 @@

else:
# Read Excel file into memory -- this only reads in the required columns
df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=validation_settings.keys())
# df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=validation_settings.keys())
df = pd.read_excel(file_name, sheet_name=sheet_name)
# ---- Rename the columns, if needed, and then filter them
df = df.rename(columns=NAME_CONFIG).filter(validation_settings)

# Error evaluation and print message (if applicable)
if not set(validation_settings).issubset(set(df)):
missing_columns = set(validation_settings.keys()) - set(df)
raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Apply data types from validation_settings to the filtered DataFrame
df = df.apply(lambda col: col.astype(validation_settings.get(col.name, type(col[0]))))
@@ -373,62 +375,11 @@ def read_validated_data(
input_dict["acoustics"]["nasc_df"][column_to_add] = df[column_to_add]
else:
raise ValueError(
"""Unexpected data attribute structure. Check API settings located in"""
"""the configuration YAML and core.py"""
"Unexpected data attribute structure. Check API settings located in "
"the configuration YAML and core.py."
)


def validate_data_columns(
file_name: Path, sheet_name: str, config_map: list, validation_settings: dict
):
"""
Opens a virtual instance of each .xlsx file to validate the presence
of required data column/variable names

Parameters
----------
file_name: Path
File path of data
sheet_name: str
Name of Excel sheet containing data
config_map: list
A list parsed from the file name that indicates how data attributes
within `self` are organized
validation_settings: dict
The subset CONFIG_MAP settings that contain the target column names
"""

# Open connection with the workbook and specific sheet
# This is useful for not calling the workbook into memory and allows for parsing
# only the necessary rows/column names
try:
workbook = load_workbook(file_name, read_only=True)

# If multiple sheets, iterate through
sheet_name = [sheet_name] if isinstance(sheet_name, str) else sheet_name

for sheets in sheet_name:
sheet = workbook[sheets]

# Validate that the expected columns are contained within the parsed
# column names of the workbook
if "vario_krig_para" in config_map:
data_columns = [list(row) for row in zip(*sheet.iter_rows(values_only=True))][0]
else:
data_columns = {col.value for col in sheet[1]}

# Error evaluation and print message (if applicable)
if not set(validation_settings.keys()).issubset(set(data_columns)):
missing_columns = set(validation_settings.keys()) - set(data_columns)
raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Close connection to the work book
workbook.close()

except Exception as e:
print(f"Error reading file '{str(file_name)}': {e}")


def write_haul_to_transect_key(configuration_dict: dict, verbose: bool):
"""
Function for writing the haul-transect mapping key .xlsx file.
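The deleted `validate_data_columns` step opened each workbook a second time through openpyxl just to inspect headers; this change folds that check into `read_validated_data` against the frame that was already read. Below is a minimal sketch of the post-change flow; the `read_and_validate` helper, file path, and sheet arguments are hypothetical condensations for illustration, not functions defined in the diff.

```python
from pathlib import Path

import pandas as pd

from echopop.core import NAME_CONFIG


def read_and_validate(file_name: Path, sheet_name: str, validation_settings: dict) -> pd.DataFrame:
    """Hypothetical condensed version of the post-change read path."""
    # Single read of the sheet -- no separate header-only pass over the workbook
    df = pd.read_excel(file_name, sheet_name=sheet_name)

    # Standardize legacy column names, then keep only the configured columns
    df = df.rename(columns=NAME_CONFIG).filter(validation_settings)

    # Validation now happens against the already-loaded frame
    if not set(validation_settings).issubset(set(df.columns)):
        missing_columns = set(validation_settings.keys()) - set(df.columns)
        raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

    # Enforce the configured dtypes
    return df.apply(
        lambda col: col.astype(validation_settings.get(col.name, type(col.iloc[0])))
    )
```

Call sites keep passing the same `CONFIG_MAP`-derived dtype mapping they already build; only the separate pre-validation pass goes away.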
2 changes: 0 additions & 2 deletions environment.yaml
@@ -14,9 +14,7 @@ dependencies:
# Computational stack
- geopandas
- geopy
- openpyxl
# 9/27/23: Pin to <2 to avoid new warning occurring in several cases
- pandas<2
- PyYAML
- shapely<2
- scipy
1 change: 0 additions & 1 deletion requirements.txt
@@ -7,7 +7,6 @@ traitlets
geopandas
geopy
lmfit
openpyxl>=3.1.3
# 9/27/23: Pin to <2 to avoid new warning occurring in several cases
pandas
python-dateutil